# Random Forests 1

---

*Goals*

- Start building random forest models: build simple forests (less trees, etc.)
- Grid search 12 stacked matrices 
- Use 5-fold CV
- Save results in CSV file for further analysis

*Features*



## Setup

In [90]:
import os
import re
import time
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from datetime import datetime
dt_object = datetime.fromtimestamp(time.time())
day, T = str(dt_object).split('.')[0].split(' ')
print('Revised on: ' + day)

Revised on: 2021-01-01


## Load Data

In [104]:
# load target
raw_path = os.path.join("data","1_raw")
filename = "y_train.csv"
y = pd.read_csv(os.path.join(raw_path, filename))
y = np.array(y.iloc[:,0].ravel())
y[y=='ham'] = 0
y[y=='spam'] = 1
y = y.astype('int')

# load 12 matrices
proc_dir = os.path.join("data","2_processed")
Xnames = [x for x in os.listdir(proc_dir) if re.search('.npz', x)]
Xs = []
for i, X in enumerate(Xnames):
    path_ = os.path.join(proc_dir, Xnames[i])
    Xs.append(sp.load_npz(path_))

## For X_bot 

In [132]:
X_bot = Xs[0].toarray()

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X_bot, y, stratify=y)

In [138]:
# show the distribution
y_train.sum() / len(y_train), y_test.sum() / len(y_test)

(0.13264957264957264, 0.13230769230769232)

## First strategy: Optimize for sensitivity using GridSearchCV with the scoring argument.

In [144]:
clf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'min_samples_split': [5, 10],
    'n_estimators' : [50, 100],
    'max_depth': [3, 5], 
    'max_features': [25, 50] # mtry
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

In [147]:
def grid_search_wrapper(refit_score='precision_score'):
    """Fits a GridSearchCV classifier using refit_score for optimization
       Prints classifier's performance metrics
    """
    T1 = time.time()
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    
    grid_search.fit(X_train, y_train) 

    # make the predictions
    y_pred = grid_search.predict(X_test)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    
    mins, secs = divmod(time.time() - T1, 60)
    print(f'Elapsed: {mins:0.0f} m {secs:0.0f} s')
    return grid_search

In [148]:
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

Best params for precision_score
{'max_depth': 3, 'max_features': 50, 'min_samples_split': 5, 'n_estimators': 50}

Confusion matrix of Random Forest optimized for precision_score on the test data:
     pred_neg  pred_pos
neg       846         0
pos       106        23
Elapsed: 0 m 22 s


## DIY GridSearchCV

I created this DIY grid search before trying to implement the same using sklearn's GridSearchCV which is probably faster but more complex.

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, recall_score

def scikitlearn_cv(clf, X, y, seed_, cv=5, test_size=.25, n_jobs=-1):
    # setup scorer for each metric
    scorer_ = {
        'acc': make_scorer(accuracy_score),
        'tpr': make_scorer(recall_score, pos_label=1),
        'tnr': make_scorer(recall_score, pos_label=0)
    }
    # return mean cv score for each metric
    acc = cross_val_score(clf, X, y, cv=cv, scoring=scorer_['acc'], n_jobs=n_jobs)
    tpr = cross_val_score(clf, X, y, cv=cv, scoring=scorer_['tpr'], n_jobs=n_jobs)
    tnr = cross_val_score(clf, X, y, cv=cv, scoring=scorer_['tnr'], n_jobs=n_jobs)
    return acc.mean(), tpr.mean(), tnr.mean()

def collect_cvs(clf, Xs, Xnames, y, seed_, cv=5, test_size=.25):
    # instantiate lists
    accs, tprs, tnrs, secs = [], [], [], []
    # cross-validate and collect metrics
    for X in Xs:
        start_cv = time.time()
        acc, tpr, tnr = scikitlearn_cv(clf, X, y, 
                                       seed_=seed_, cv=cv, test_size=test_size)
        accs.append(round(acc, 4))
        tprs.append(round(tpr, 4))
        tnrs.append(round(tnr, 4))
        secs.append(round(time.time() - start_cv, 1))
    # create dictionary
    data = {'representation': Xnames,
            'mean_accuracy': accs,
            'mean_sensitivity': tprs, 
            'mean_specificity': tnrs,
            'elapsed_seconds':secs}
    return data

def grid_search(Xs, Xnames, y, cv_seed, rf_seed, 
                n_estimators, max_features, max_samples, max_depth, max_leaf_nodes, 
                cv=5, n_jobs=-1):
    start_ = time.time()
    # instantiate list of data frames
    list_of_dfs = []
    # collect cv metrics for each mtry value
    for mtry in max_features:
        rf_clf = RandomForestClassifier(random_state=rf_seed,
                                        n_estimators=n_estimators,
                                        max_features=mtry,
                                        max_samples=max_samples,
                                        max_depth=max_depth,
                                        max_leaf_nodes=max_leaf_nodes,
                                        n_jobs=n_jobs,
                                        verbose=0)        
        data = collect_cvs(rf_clf, Xs, Xnames, y, seed_=cv_seed, cv=cv)
        df = pd.DataFrame(data)
        df['mtry'] = mtry        
        list_of_dfs.append(df)
    # flatten list of data frames and reset index
    flattened_df = pd.concat(list_of_dfs)
    ix_num = len(max_features) * len(Xs)
    flattened_df.index = range(ix_num)
    mins, secs = divmod(time.time() - start_, 60)
    print(f'Elapsed: {mins:0.0f} m {secs:0.0f} s')
    return flattened_df

Static decisions:
- 100 trees
- 500 max samples (out of 3900)
- 10 max depth of tree
- 50 max leaf nodes
- 5-fold CV

In [56]:
cv_data1 = grid_search(Xs, Xnames, y,
                       cv_seed=7379, 
                       rf_seed=3551,
                       n_estimators=100,
                       max_features=[10,25,50,100], # what the DIY gridsearch varies on
                       max_samples=500,
                       max_depth=10,
                       max_leaf_nodes=50,
                       cv=5)

Elapsed: 5 m 44 s


In [57]:
cv_data1

Unnamed: 0,representation,mean_accuracy,mean_sensitivity,mean_specificity,elapsed_seconds,mtry
0,X_bot.npz,0.8944,0.2031,1.0,4.8,10
1,X_bot_feat.npz,0.9059,0.2902,1.0,2.2,10
2,X_bot_svd.npz,0.9369,0.5493,0.9962,6.0,10
3,X_bot_svd_cos.npz,0.9354,0.5279,0.9976,5.9,10
4,X_bot_svd_feat.npz,0.9418,0.5822,0.9967,6.0,10
5,X_bot_svd_feat_cos.npz,0.94,0.5629,0.9976,5.9,10
6,X_bot_tfidf.npz,0.8962,0.2167,1.0,2.3,10
7,X_bot_tfidf_feat.npz,0.9028,0.2669,1.0,2.5,10
8,X_bot_tfidf_svd.npz,0.9254,0.437,1.0,6.2,10
9,X_bot_tfidf_svd_cos.npz,0.9223,0.414,1.0,6.0,10


## Scikit-Learn GridSearchCV

[(See Docs)](https://github.com/scikit-learn/scikit-learn/blob/0fb307bf3/sklearn/ensemble/_forest.py#L883)

In [66]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

param_grid = {'n_estimators': [50],
          'max_features':[5, 10],
          'max_samples':[100],
          'max_depth':[2],
          'max_leaf_nodes':[10]}

# pass scorer for tpr and tnr?
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

clf = RandomForestClassifier(n_jobs=-1)

#grid_search_cv = GridSearchCV(rf_clf, params, n_jobs=-1, verbose=0, cv=3)

In [71]:
X_bot = Xs[0]

In [67]:
def grid_search_wrapper(refit_score='precision_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                               cv=skf, return_train_score=True, n_jobs=-1)
    
    grid_search.fit(X_bot.values, y.values)

    # make the predictions
    y_pred = grid_search.predict(X_test.values)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                       columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

In [68]:
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

NameError: name 'X_train' is not defined

In [51]:
start_ = time.time()

grid_search_cvs = []
for X in Xs:
    grid_search_cvs.append(grid_search_cv.fit(X, y))
    
mins, secs = divmod(time.time() - start_, 60)
print(f'Elapsed: {mins:0.0f} m {secs:0.0f} s')

Elapsed: 0 m 9 s


In [54]:
grid_search_cvs[0].cv_results_

{'mean_fit_time': array([0.31153758, 0.33834481]),
 'std_fit_time': array([0.00703965, 0.01539574]),
 'mean_score_time': array([0.10768429, 0.10326926]),
 'std_score_time': array([0.00331278, 0.00288083]),
 'param_max_depth': masked_array(data=[2, 2],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=[5, 10],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_max_leaf_nodes': masked_array(data=[10, 10],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_max_samples': masked_array(data=[100, 100],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[50, 50],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2,
   'max_features': 5,
   'max_leaf_nodes': 10,
   'max_samples': 100,
   'n_es

In [11]:
dir_path = os.path.join("data","3_modeling")
try:
    os.stat(dir_path)
except:
    os.mkdir(dir_path)

In [12]:
file_path = os.path.join(dir_path, "cv_data1.csv")
cv_data1.to_csv(file_path, index=False)

---