# Random Forests

---

*Features*

- Add new features

*Results*

## Setup & Load Data

In [1]:
import json
import os
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import urlextract
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import cross_validate

# Wall-clock start of the run, plus the date/time of this revision.
start_time = time.time()
dt_object = datetime.fromtimestamp(time.time())
# str(datetime) reads 'YYYY-MM-DD HH:MM:SS[.ffffff]': drop the fractional
# seconds, then separate the date part from the time part.
day, T = str(dt_object).partition('.')[0].split(' ')
print(f'Revised on: {day}')

def load_data(data):
    """Load the first column of ``../data/1_raw/<data>.csv`` as a NumPy array.

    Parameters
    ----------
    data : str
        File name without the ``.csv`` extension (e.g. ``"X_train"``).

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the values of the file's first column.
        The first line of the file is consumed as the header by
        ``pd.read_csv`` and is therefore not part of the result.
    """
    csv_path = os.path.join("..", "data", "1_raw", f"{data}.csv")
    out_dfm = pd.read_csv(csv_path)
    # np.array() copies the column into a fresh, writable array; this replaces
    # Series.ravel(), which is deprecated in recent pandas releases.
    return np.array(out_dfm.iloc[:, 0])

# Training inputs and their string labels, read from ../data/1_raw
X_train, y_train = load_data("X_train"), load_data("y_train")

# Encode the labels on a copy so y_train keeps the original strings:
# 'ham' -> 0, then 'spam' -> 1, then cast the whole array to int
y = y_train.copy()
y[y == 'ham'], y[y == 'spam'] = 0, 1
y = y.astype('int')

# Contractions lookup used by the custom cleanup code
with open("contractions_map.json") as map_file:
    contractions_map = json.load(map_file)

Revised on: 2020-12-29


## BoT & Tfidf

In [2]:
import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Raw text -> n-gram counts -> fixed-size bag-of-terms matrix -> TF-IDF weights.
pipe = Pipeline([
    ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
    ('bot', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
])

# Fit stage by stage so every intermediate representation is kept and no stage
# runs twice. Calling pipe.fit_transform(X_train) here would re-fit the counter
# and bag-of-terms steps on the same input just to reach the TF-IDF step.
# NOTE(review): assumes the custom transformers are deterministic, so feeding
# X_bot to the TF-IDF step matches what the full pipeline would produce.
X_counter = pipe['counter'].fit_transform(X_train)
X_bot = pipe['bot'].fit_transform(X_counter)
X_tfidf = pipe['tfidf'].fit_transform(X_bot)

### SVD

In [5]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip

def perform_SVD(X, n_components=300):
    """Embed the rows of ``X`` with a truncated SVD of ``X.T``.

    Runs ``scipy.sparse.linalg.svds`` on ``X.T`` and returns the transposed
    right factor, i.e. one ``n_components``-dimensional vector per row of
    ``X``. The singular values are discarded, so the returned coordinates are
    not scaled by them.

    Parameters
    ----------
    X : scipy sparse matrix or numpy.ndarray, shape (n_rows, n_cols)
        Matrix to decompose. Non-floating input is cast to float64.
    n_components : int, default 300
        Number of singular triplets to compute; svds requires it to be
        smaller than ``min(X.shape)``.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_components)
        Transposed right singular vectors of ``X.T`` after reversing the
        order returned by svds and applying ``svd_flip``.
    """
    # svds needs a floating-point operand. An explicit cast works for sparse
    # matrices and dense arrays alike, unlike the sparse-matrix-only asfptype().
    if X.dtype.kind not in 'fc':
        X = X.astype(np.float64)
    U, _, VT = svds(X.T, k=n_components)
    # Reverse the component order (svds is expected to return the largest
    # singular value last -- confirm for the installed SciPy), then let
    # svd_flip pick a deterministic sign for each component.
    U, VT = svd_flip(U[:, ::-1], VT[::-1])
    return VT.T

# SVD embeddings (default 300 components) of the count matrix and the TF-IDF matrix
X_svd_bot, X_svd_tfidf = perform_SVD(X_bot), perform_SVD(X_tfidf)

## Cosine Similarity

In [6]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarities between all rows of each SVD embedding
X_cossim_svd_bot = cosine_similarity(X_svd_bot)
X_cossim_svd_tfidf = cosine_similarity(X_svd_tfidf)

train_df = pd.DataFrame({'sms':X_train, 'target':y_train})

# get spam indexes
spam_ix = train_df.loc[train_df['target']=='spam'].index

# Average similarity of every row to the spam rows, computed in one vectorised
# step (select the spam columns, then take the row means) instead of a Python
# loop. The results are 1-D NumPy arrays with one value per row.
# NOTE(review): spam_ix is derived from the training labels and a spam row's
# mean includes its similarity to itself, so this feature may leak the target
# into cross-validation -- confirm this is intended.
mean_spam_sims_bot = X_cossim_svd_bot[:, spam_ix].mean(axis=1)
mean_spam_sims_tfidf = X_cossim_svd_tfidf[:, spam_ix].mean(axis=1)

# stack representations: the mean-similarity column goes first
X_bot_cossim_bot = sp.hstack((csr_matrix(mean_spam_sims_bot).T, X_bot))
X_tfidf_cossim_tfidf = sp.hstack((csr_matrix(mean_spam_sims_tfidf).T, X_tfidf))

## Feature Engineering

In [7]:
%%capture output

# custom feature engineering module
import custom.feature_engineering as Fe

# Numeric warnings (e.g. divisions by zero while computing means/stds) are
# expected during feature extraction. Silence them explicitly: the previous
# `except Warning: pass` would have left clean_train_docs / X_train_feat
# undefined whenever it fired, making the next statement fail with a NameError.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    clean_train_docs, X_train_feat = Fe.DocumentToFeaturesCounterTransformer().fit_transform(X_train)

# impute with zeros and remove RSR (only the first six feature columns are kept)
X_train_feat[np.isnan(X_train_feat)] = 0
X_feat = X_train_feat[:,:6]

# stack with BoT and SVD of Tfidf
X_bot_feat = sp.hstack((X_bot, X_feat))
X_svd_feat = sp.hstack((csr_matrix(X_svd_tfidf), X_feat))

## Random Forests

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, recall_score

def scikitlearn_cv(clf, X, y, seed_, cv=5, test_size=.25):
    """Cross-validate ``clf`` and return mean accuracy, sensitivity, specificity.

    All three metrics are computed in a single ``cross_validate`` run, so each
    fold is fitted once instead of once per metric.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``cross_validate``.
    X, y : array-like
        Features and binary targets; sensitivity is recall with
        ``pos_label=1`` and specificity is recall with ``pos_label=0``.
    seed_ : any
        Unused; kept so existing callers keep working.
    cv : int or CV splitter, default 5
        Forwarded to ``cross_validate``.
    test_size : float, default .25
        Unused; kept so existing callers keep working.

    Returns
    -------
    tuple of float
        ``(mean_accuracy, mean_sensitivity, mean_specificity)`` over the folds.
    """
    scorer_ = {
        'acc': make_scorer(accuracy_score),
        'tpr': make_scorer(recall_score, pos_label=1),
        'tnr': make_scorer(recall_score, pos_label=0)
    }

    # Multi-metric scoring: results come back under 'test_<scorer name>'.
    scores = cross_validate(clf, X, y, cv=cv, scoring=scorer_, n_jobs=-1, verbose=0)

    return (scores['test_acc'].mean(),
            scores['test_tpr'].mean(),
            scores['test_tnr'].mean())

def collect_cvs(clf, Xs, Xnames, y, seed_, cv=10, test_size=.25):
    """Cross-validate ``clf`` on each representation and tabulate the results.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``scikitlearn_cv`` for every representation.
    Xs : sequence
        Feature matrices to evaluate, one per representation.
    Xnames : sequence of str
        Display names, one per entry of ``Xs`` and in the same order.
    y : array-like
        Target variable shared by all representations.
    seed_, cv, test_size
        Forwarded unchanged to ``scikitlearn_cv``.

    Returns
    -------
    dict
        Column-oriented results with keys ``'Representation'``,
        ``'mean_accuracy'``, ``'mean_sensitivity'``, ``'mean_specificity'``
        (each rounded to 4 decimals) and ``'elapsed_seconds'`` (rounded to 1).

    Raises
    ------
    ValueError
        If ``Xs`` and ``Xnames`` differ in length. Checked before any model is
        trained, so a mislabelled list fails immediately rather than after
        every cross-validation has finished.
    """
    Xs, Xnames = list(Xs), list(Xnames)
    if len(Xs) != len(Xnames):
        raise ValueError(
            "Xs and Xnames must have the same length, got "
            f"{len(Xs)} representations and {len(Xnames)} names: {Xnames!r}"
        )

    accs, tprs, tnrs, secs = [], [], [], []
    for X in Xs:
        start_cv = time.time()
        acc, tpr, tnr = scikitlearn_cv(clf, X, y, seed_=seed_, cv=cv, test_size=test_size)
        accs.append(round(acc, 4))
        tprs.append(round(tpr, 4))
        tnrs.append(round(tnr, 4))
        secs.append(round(time.time() - start_cv, 1))

    data = {'Representation': Xnames,
            'mean_accuracy': accs,
            'mean_sensitivity': tprs,
            'mean_specificity': tnrs,
            'elapsed_seconds': secs
           }

    return data

def build_random_forests(Xs, Xnames, y, cv_seed, rf_seed, mtry_, trees, 
                         max_leaf_nodes, cv=5, max_samples=None, n_jobs=-1):
    """Cross-validate random forests over a grid of ``max_features`` values.

    Given:
        Xs: a list of X representations (training data)
        Xnames: a list of their names (descriptions), one per entry of Xs
        y: the target variable
        cv_seed: seed passed on to collect_cvs as ``seed_``
        rf_seed: random_state of the random forest classifier
        mtry_: a list of values for the max_features param
        trees: number of trees (n_estimators)
        max_leaf_nodes: max number of leaf nodes
        cv: number of folds, defaults to k=5
        max_samples: forwarded to the classifier's max_samples,
            defaults to None
        n_jobs: forwarded to the classifier, defaults to -1
            (all available processors in scikit-learn)
    Return:
        A dataframe with one row per (mtry, representation) pair holding
        mean accuracy, sensitivity, specificity, elapsed seconds and the
        mtry value, indexed 0..n-1.
    Raise:
        ValueError if Xs and Xnames differ in length (checked before any
        model is trained).
    """
    if len(Xs) != len(Xnames):
        raise ValueError(
            "Xs and Xnames must have the same length, got "
            f"{len(Xs)} representations and {len(Xnames)} names"
        )

    list_of_dfs = []
    for mtry in mtry_:
        rf_clf = RandomForestClassifier(n_estimators=trees,
                                        # honour the caller's value; this was
                                        # previously hard-coded to None
                                        max_samples=max_samples,
                                        max_features=mtry,
                                        max_leaf_nodes=max_leaf_nodes,
                                        random_state=rf_seed,
                                        n_jobs=n_jobs,
                                        verbose=0)

        data = collect_cvs(rf_clf, Xs, Xnames, y, seed_=cv_seed, cv=cv)
        df = pd.DataFrame(data)
        df['mtry'] = mtry

        list_of_dfs.append(df)

    # ignore_index renumbers the stacked rows 0..n-1 in one step
    return pd.concat(list_of_dfs, ignore_index=True)

## Custom Gridsearches

10-fold CV on a random forest with 500 trees and 99 max leaf nodes on all 8 representations, varying the mtry values.

In [10]:
# Feature matrices to compare; the order has to match the labels in Xnames.
Xs = [
    X_bot,                 # bag-of-terms counts
    X_tfidf,               # TF-IDF weights
    X_svd_bot,             # SVD embedding of the counts
    X_svd_tfidf,           # SVD embedding of the TF-IDF weights
    X_bot_feat,            # counts stacked with the engineered features
    X_svd_feat,            # TF-IDF SVD embedding stacked with the engineered features
    X_bot_cossim_bot,      # mean spam similarity column stacked with the counts
    X_tfidf_cossim_tfidf,  # mean spam similarity column stacked with the TF-IDF weights
]

# One label per entry of Xs, in the same order. Each name needs its own
# trailing comma: adjacent string literals are silently concatenated, which
# previously merged 'SVD on BoT' and 'SVD on Tfidf' into a single label.
Xnames = [
          'BoT', 
          'Tfidf', 
          'SVD on BoT',
          'SVD on Tfidf', 
          'BoT + features',
          'SVD + features',
          'Cossim on BoT',
          'Cossim on Tfidf',
         ]

In [12]:
#cv_data1 = build_random_forests(Xs, 
#                                Xnames, 
#                                y, 
#                                cv_seed=423, 
#                                rf_seed=514, 
#                                mtry_=[5, 10, 20, 25, 40 , 50, 
#                                       75, 100, 150, 200, 250, 
#                                       300, 350, 400, 450, None],
#                                trees=500, 
#                                max_leaf_nodes=99, 
#                                cv=10)

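# NOTE: this run failed with "arrays must be of same length" -- most likely because
# Xnames had fewer entries than Xs (check for a missing comma between two names),
# rather than because of the CV size. Earlier ideas: try smaller CVs, try printing more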

In [None]:
# Grid over mtry (max_features): 1000-tree forests, 99 max leaf nodes, 10-fold CV.
# NOTE(review): mtry=500 is larger than the 300 columns perform_SVD returns by
# default -- verify how the installed scikit-learn treats max_features > n_features.
cv_data2 = build_random_forests(
    Xs, Xnames, y,
    cv_seed=423, rf_seed=514,
    mtry_=[10, 25, 50, 100, 250, 500, None],
    trees=1000, max_leaf_nodes=99, cv=10,
)

In [None]:
# Show the grid-search results computed above. cv_data1 is only assigned in a
# commented-out cell, so referencing it fails with a NameError on a fresh kernel.
cv_data2

*Results*

- SVD on Tfidf takes the longest, and its sensitivity is neither improving nor better than that of the other representations
- Cosine Similarities on Tfidf have the best sensitivities

*Decision*

- ditch SVD on Tfidf
- try SVD on BoT
- try rest of cosine similarities

---