# Random Forests 3

---

__This Notebook__:

- the command line script failed often, so I only got results for X_bot (the first representation) that way
- this notebook performs 5 grid searches for the rest of the non-tfidf models:

    1. X_bot_feat
    2. X_bot_svd
    3. X_bot_svd_cos
    4. X_bot_svd_feat
    5. X_bot_svd_feat_cos
    
    
- I also altered the `gridsearch_wrapper` function to take the matrices and their names as arguments, which makes partial runs possible, and added functionality to save each grid search as it completes, so that if the wrapper fails or I kill it manually I don't lose the completed searches.

## Setup

In [1]:
import os
import re
import time
import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp

from datetime import datetime
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# stamp the notebook with the date of this run (time of day is kept in T)
dt_object = datetime.now()
day, T = dt_object.isoformat(sep=' ', timespec='seconds').split(' ')
print('Revised on: ' + day)

Revised on: 2021-01-05


## Load Data

In [2]:
# load target
raw_path = os.path.join("data","1_raw")
filename = "y_train.csv"
y = pd.read_csv(os.path.join(raw_path, filename))
# the first column holds the labels; np.array copies the values so the
# in-place recoding below does not touch the DataFrame
y = np.array(y.iloc[:,0])
y[y=='ham'] = 0
y[y=='spam'] = 1
y = y.astype('int')

# load 12 matrices
proc_dir = os.path.join("data","2_processed")
# os.listdir returns entries in arbitrary order, and later cells select
# representations by position (e.g. Xs[1:6]), so sort for a stable order;
# endswith replaces a regex whose unescaped '.' matched any character
Xnames = sorted(x for x in os.listdir(proc_dir) if x.endswith('.npz'))
Xs = [sp.load_npz(os.path.join(proc_dir, name)) for name in Xnames]

In [3]:
# 12 representations: print each matrix name (text before the first '.')
# next to its 1-based position in Xnames / Xs
# NOTE: `ix` and `Xname` stay defined at notebook (global) scope after this loop
for ix, Xname in enumerate(Xnames):
    Xname = Xname.split('.')[0]
    print(ix+1, Xname)

1 X_bot
2 X_bot_feat
3 X_bot_svd
4 X_bot_svd_cos
5 X_bot_svd_feat
6 X_bot_svd_feat_cos
7 X_bot_tfidf
8 X_bot_tfidf_feat
9 X_bot_tfidf_svd
10 X_bot_tfidf_svd_cos
11 X_bot_tfidf_svd_feat
12 X_bot_tfidf_svd_feat_cos


## Grid search random forest models

Using sklearn's GridSearchCV with 10-fold cross validation on a shallow param grid, varying representations.

In [4]:
def print_eval_metrics(y_val, y_pred):
    """
    Prints accuracy, sensitivity and specificity of a set of predictions.

    The confusion matrix is unpacked into exactly four counts
    (tn, fp, fn, tp), so a 2x2 (binary) confusion matrix is expected.

    Args:
        y_val: the true labels
        y_pred: the predicted labels
    """
    counts = confusion_matrix(y_val, y_pred)
    true_neg, false_pos, false_neg, true_pos = counts.ravel()
    n_total = true_pos + true_neg + false_pos + false_neg

    # (label, value) pairs, printed in this fixed order
    metrics = (
        ('accuracy', (true_pos + true_neg) / n_total),
        ('sensitivity', true_pos / (true_pos + false_neg)),
        ('specificity', true_neg / (true_neg + false_pos)),
    )
    for label, value in metrics:
        print(f'{label}: {value:0.4f}')

In [5]:
def gridsearch_wrapper(Xs, Xnames, test=False, k=10, random_state=None):
    """
    Performs one grid search per representation and collects them in a list.

    For every matrix: makes a stratified train/validation split, tunes a
    RandomForestClassifier with GridSearchCV (k stratified folds, refit on
    sensitivity), prints the validation confusion matrix and metrics, and
    saves the fitted search under data/3_modeling as soon as it completes,
    so a later failure or manual interrupt does not lose finished searches.

    Note: the target vector `y` is read from the notebook's global scope.

    Args:
        Xs: the numeric matrices (each must provide .toarray())
        Xnames: their names (only the text before the first '.' is used)
        test: faster, shallower searches for testing
        k: the number of CV folds
        random_state: optional seed for the train/validation split; the
            default None keeps the split unseeded, as before
    Returns:
        A list of dicts, one per representation, with the keys
        'representation' (name) and 'gridsearch_res' (fitted GridSearchCV).
    """

    start_time = time.time()
    model_dir = os.path.join("data", "3_modeling")
    # make sure the output directory exists before spending time on training
    os.makedirs(model_dir, exist_ok=True)

    # the grid and the scorers do not depend on the representation,
    # so they are built once instead of once per loop iteration
    if test:
        # testing param grid
        grid = {
            'min_samples_split': [5, 10, 20],
            'n_estimators' : [50, 100],
            'max_depth': [10, 20],
            'max_features': [50, 100, 200]
        }
    else:
        # param grid for final not-too-deep search
        grid = {
            'min_samples_split': [5, 10, 15],
            'n_estimators' : [100, 200],
            'max_depth': [5, 10, 20],
            'max_features': [50, 100, 250, 500]
        }

    scorers = {
        'acc': make_scorer(accuracy_score),
        'tpr': make_scorer(recall_score, pos_label=1), # sensitivity, recall
        'tnr': make_scorer(recall_score, pos_label=0) # specificity, selectivity
    }

    # list of dicts to gather results
    gridsearches = []
    for ix, X_name in enumerate(Xnames):

        X_ = Xs[ix].toarray()
        X_name = X_name.split('.')[0]

        # split into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            X_, y, stratify=y, random_state=random_state)

        # instantiate estimator and k-fold gridsearch
        clf = RandomForestClassifier(n_jobs=-1, random_state=42)
        cv_folds = StratifiedKFold(n_splits=k)
        grid_search_clf = GridSearchCV(clf, grid,
                                       scoring=scorers,
                                       refit='tpr', cv=cv_folds,
                                       return_train_score=True, n_jobs=-1)

        # train models
        print(f'\nTraining {ix+1}: {X_name}...')
        start_gs = time.time()
        grid_search_clf.fit(X_train, y_train)
        elapsed_secs = time.time() - start_gs
        print(f'Elapsed: {elapsed_secs:0.0f} s')

        # predict
        y_pred = grid_search_clf.predict(X_val)
        print(f'Best params: {grid_search_clf.best_params_}')

        # confusion matrix on validation set
        print('Confusion matrix on validation set:')
        print(pd.DataFrame(confusion_matrix(y_val, y_pred),
                           columns=['pred_neg', 'pred_pos'],
                           index=['neg', 'pos']))
        # eval metrics
        print('Evaluation metrics:')
        print_eval_metrics(y_val, y_pred)

        data = {'representation':X_name,
                'gridsearch_res':grid_search_clf}

        # save gridsearch right away, named after THIS representation
        # (the local X_name, not a global left over from another cell)
        filename = f"{ix+1}_{X_name}_rf_gridsearch.joblib"
        file_path = os.path.join(model_dir, filename)
        joblib.dump(data, file_path)

        # gather results into a list of dicts
        gridsearches.append(data)

    mins, secs = divmod(time.time() - start_time, 60)
    print(f'\nTot elapsed: {mins:0.0f} m {secs:0.0f} s')
    return gridsearches

In [7]:
# full (non-test) 10-fold searches on positions 1..5 of the loaded
# matrices, i.e. the five non-tfidf representations other than X_bot
results = gridsearch_wrapper(
    Xs=Xs[1:6],
    Xnames=Xnames[1:6],
    test=False,
    k=10,
)


Training 1: X_bot_feat...
Elapsed: 1469 s
Best params: {'max_depth': 20, 'max_features': 500, 'min_samples_split': 10, 'n_estimators': 100}
Confusion matrix on validation set:
     pred_neg  pred_pos
neg       841         5
pos        13       116
Evaluation metrics:
accuracy: 0.9815
sensitivity: 0.8992
specificity: 0.9941

Training 2: X_bot_svd...
Elapsed: 3872 s
Best params: {'max_depth': 20, 'max_features': 100, 'min_samples_split': 10, 'n_estimators': 200}
Confusion matrix on validation set:
     pred_neg  pred_pos
neg       837         9
pos        16       113
Evaluation metrics:
accuracy: 0.9744
sensitivity: 0.8760
specificity: 0.9894

Training 3: X_bot_svd_cos...
Elapsed: 3605 s
Best params: {'max_depth': 20, 'max_features': 50, 'min_samples_split': 10, 'n_estimators': 200}
Confusion matrix on validation set:
     pred_neg  pred_pos
neg       840         6
pos        19       110
Evaluation metrics:
accuracy: 0.9744
sensitivity: 0.8527
specificity: 0.9929

Training 4: X_bot_sv

## Persist results

In [9]:
# display the collected grid searches (one dict per representation)
results

[{'representation': 'X_bot_feat',
  'gridsearch_res': GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
               estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
               n_jobs=-1,
               param_grid={'max_depth': [5, 10, 20],
                           'max_features': [50, 100, 250, 500],
                           'min_samples_split': [5, 10, 15],
                           'n_estimators': [100, 200]},
               refit='tpr', return_train_score=True,
               scoring={'acc': make_scorer(accuracy_score),
                        'tnr': make_scorer(recall_score, pos_label=0),
                        'tpr': make_scorer(recall_score, pos_label=1)})},
 {'representation': 'X_bot_svd',
  'gridsearch_res': GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
               estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
               n_jobs=-1,
               param_grid={'max_dep

In [11]:
# save all gridsearches together in one dated file under the modeling dir
model_dir = os.path.join("data", "3_modeling")
dated_name = "01052021" + "_rf_gridsearches.joblib"
file_path = os.path.join(model_dir, dated_name)
joblib.dump(results, file_path)

['data\\3_modeling\\01052021_rf_gridsearches.joblib']

---