# Random Forests 2

---

__This Notebook__:

Shallow grid search over random forest hyperparameters, run on each of the 12 feature representations.

## Setup

In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt 
plt.style.use("ggplot")

from datetime import datetime
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer
from sklearn.metrics import recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Stamp the notebook with the local date on which it was last run.
dt_object = datetime.fromtimestamp(time.time())
# `day` is YYYY-MM-DD and `T` is HH:MM:SS (sub-second part dropped).
day = dt_object.date().isoformat()
T = dt_object.time().replace(microsecond=0).isoformat()
print(f'Revised on: {day}')

Revised on: 2021-01-03


## Load Data

In [2]:
# load target: the first column of y_train.csv is read as the label column
raw_path = os.path.join("data","1_raw")
filename = "y_train.csv"
y = pd.read_csv(os.path.join(raw_path, filename))
# Take an object-dtype copy so the string labels can be overwritten with ints
# in place (Series.ravel() is deprecated in recent pandas, hence to_numpy()).
y = y.iloc[:, 0].to_numpy(dtype=object, copy=True)
y[y=='ham'] = 0
y[y=='spam'] = 1
y = y.astype('int')

# load the sparse matrices saved as .npz files in the processed directory
proc_dir = os.path.join("data","2_processed")
# os.listdir() returns entries in arbitrary order, so sort to keep the
# representation numbering stable; match the extension as a true suffix
# rather than with an unanchored regex whose '.' matches any character.
Xnames = sorted(x for x in os.listdir(proc_dir) if x.endswith('.npz'))
Xs = [sp.load_npz(os.path.join(proc_dir, name_)) for name_ in Xnames]

In [13]:
# Enumerate the loaded representations (1-based), showing each file name
# up to its first '.'.
for position, file_name in enumerate(Xnames, start=1):
    stem = file_name.split('.')[0]
    print(position, stem)

1 X_bot
2 X_bot_feat
3 X_bot_svd
4 X_bot_svd_cos
5 X_bot_svd_feat
6 X_bot_svd_feat_cos
7 X_bot_tfidf
8 X_bot_tfidf_feat
9 X_bot_tfidf_svd
10 X_bot_tfidf_svd_cos
11 X_bot_tfidf_svd_feat
12 X_bot_tfidf_svd_feat_cos


## Grid search random forest models

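Each representation is split into training and validation sets, then tuned with sklearn's `GridSearchCV` using 10-fold stratified cross-validation over a shallow parameter grid. The best model is refit on sensitivity (recall on the spam class) and evaluated on the validation set.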

In [4]:
def print_eval_metrics(y_val, y_pred):
    """Print accuracy, sensitivity and specificity for binary 0/1 labels.

    Parameters
    ----------
    y_val : array-like
        True labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels, using the same encoding.

    Returns
    -------
    None
        The three metrics are printed with four decimal places. A metric
        whose denominator is zero is printed as ``nan``.
    """
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking still
    # works when one of the classes is absent from both y_val and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()

    total = tp + tn + fp + fn
    positives = tp + fn
    negatives = tn + fp

    # Guard the denominators: an absent class yields nan, not a runtime warning.
    acc = (tp + tn) / total if total else float('nan')
    tpr = tp / positives if positives else float('nan')
    tnr = tn / negatives if negatives else float('nan')

    print(f'accuracy: {acc:0.4f}')
    print(f'sensitivity: {tpr:0.4f}')
    print(f'specificity: {tnr:0.4f}')

In [None]:
# Run one shallow grid search per representation and gather the fitted
# GridSearchCV objects into a list of dicts.

# The param grid, scorers and CV splitter do not depend on the representation,
# so they are built once rather than on every iteration.

# setup shallow param grid
param_grid = {
    'min_samples_split': [5, 10, 15], 
    'n_estimators' : [100, 200],
    'max_depth': [5, 10, 20],
    'max_features': [50, 100, 250]
}

# setup scorers
scorers = {
    'acc': make_scorer(accuracy_score),
    'tpr': make_scorer(recall_score, pos_label=1), # sensitivity, recall
    'tnr': make_scorer(recall_score, pos_label=0) # specificity, selectivity
}

# 10-fold stratified CV (no shuffling, so the folds are deterministic)
cv_folds = StratifiedKFold(n_splits=10)

# instantiate list of dicts to gather results
gridsearches = []
for ix, X_name in enumerate(Xnames):

    X_ = Xs[ix].toarray()
    X_name = X_name.split('.')[0]

    # split into training and validation sets; the fixed seed makes the
    # reported validation metrics reproducible across runs
    X_train, X_val, y_train, y_val = train_test_split(X_, y, stratify=y,
                                                      random_state=42)

    # instantiate estimator
    clf = RandomForestClassifier(n_jobs=-1, random_state=42)

    # instantiate gridsearch; the best model is refit on sensitivity ('tpr')
    grid_search_clf = GridSearchCV(clf, param_grid, scoring=scorers, refit='tpr', 
                                   cv=cv_folds, return_train_score=True, n_jobs=-1)

    # train models
    print(f'\nTraining {ix+1}: {X_name}...')
    start_gs = time.time()
    grid_search_clf.fit(X_train, y_train)
    elapsed_secs = time.time() - start_gs
    print(f'Elapsed: {elapsed_secs:0.0f} s')

    # predict on the held-out validation set with the refit best estimator
    y_pred = grid_search_clf.predict(X_val)
    print(f'Best params: {grid_search_clf.best_params_}')

    # confusion matrix on validation set
    print('Confusion matrix on validation set:')
    print(pd.DataFrame(confusion_matrix(y_val, y_pred),
                       columns=['pred_neg', 'pred_pos'],
                       index=['neg', 'pos']))
    # eval metrics
    print('Evaluation metrics:')
    print_eval_metrics(y_val, y_pred)

    # gather results into a list of dicts
    data = {'representation':X_name,
            'gridsearch_res':grid_search_clf}

    gridsearches.append(data)


Training 1: X_bot...


## Persist results

In [7]:
import joblib

# Persist the list of {'representation', 'gridsearch_res'} dicts to disk.
model_dir = os.path.join("data", "3_modeling")
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(model_dir, exist_ok=True)
# NOTE(review): the date prefix reads 01-03-2020 while the notebook output
# shows a 2021-01-03 revision; confirm. Left unchanged here in case other
# code loads the file by this name.
file_path = os.path.join(model_dir, "01032020_rf_gridsearches_shallowparams.joblib")

joblib.dump(gridsearches, file_path)

['data\\3_modeling\\01032020_rf_gridsearches_shallowparams.joblib']

---