In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline

# import vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import time 

In [2]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/project_3/practice_code'

In [3]:
stoic_zen = pd.read_csv('../datasets/clean_stoic_zen_tokenized.csv', )

In [4]:
stoic_zen.head()

Unnamed: 0,label,merged,tokens
0,0,you need to fight your mind every time it trie...,"['you', 'need', 'to', 'fight', 'your', 'mind',..."
1,0,a phone call creates stronger bonds than text ...,"['a', 'phone', 'call', 'creates', 'stronger', ..."
2,0,"everything in your life every experience, ever...","['everything', 'in', 'your', 'life', 'every', ..."
3,0,the parable of the mexican fisherman got me re...,"['the', 'parable', 'of', 'the', 'mexican', 'fi..."
4,0,the key to success and productivity isn t to t...,"['the', 'key', 'to', 'success', 'and', 'produc..."


In [6]:
stoic_zen.tail()

Unnamed: 0,label,merged,tokens
5982,1,stoicism and self improvement i have just rece...,"['stoicism', 'and', 'self', 'improvement', 'i'..."
5983,1,should i go cold turkey on entertainment to pr...,"['should', 'i', 'go', 'cold', 'turkey', 'on', ..."
5984,1,"free law of attraction, the secret pdf and boo...","['free', 'law', 'of', 'attraction', 'the', 'se..."
5985,1,anxiety i have trouble rationalising with my a...,"['anxiety', 'i', 'have', 'trouble', 'rationali..."
5986,1,"""some poor, phoneless fool is probably sitting...","['some', 'poor', 'phoneless', 'fool', 'is', 'p..."


_NB: Making test size 0.25 in instead of 0.33 as it was in the KNN / Decision Tree Notebook_

In [7]:
X = stoic_zen['merged']
y = stoic_zen['label']

# splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                   y, 
                                                   test_size=0.25,
                                                   shuffle=True,
                                                   stratify=y,
                                                   random_state=42)

## Modeling

## **`Random Forest` / `Extra Trees`**

### `Random Forest + CountVectorizer` 

#### Transforming and Vectorizing the data

In [14]:
cvec = CountVectorizer(stop_words='english')

cvec.fit(X_train)

X_train_cvec = cvec.transform(X_train)
X_test_cvec = cvec.transform(X_test)

_Will start with a Random Forest Classifier using CountVectorized data and default hyperparameters._ 

In [17]:
rf = RandomForestClassifier(random_state=48)

rf.fit(X_train_cvec, y_train)

RandomForestClassifier(random_state=48)

In [18]:
cross_val_score(rf, X_train_cvec, y_train).mean()

0.8601336302895323

In [20]:
cross_val_score(rf, X_test_cvec, y_test).mean()

0.8436722408026756

In [22]:
print(f'Random Forest (default hyperparameters) score on training set: {rf.score(X_train_cvec, y_train)}')
print(f'Random Forest (default hyperparameters) score on testing set: {rf.score(X_test_cvec, y_test)}')

Random Forest (default hyperparameters) score on training set: 0.999554565701559
Random Forest (default hyperparameters) score on testing set: 0.8577154308617234


### `Extra Trees + CountVectorizer` 

In [24]:
et = ExtraTreesClassifier(random_state=48)

et.fit(X_train_cvec, y_train)

ExtraTreesClassifier(random_state=48)

In [25]:
cross_val_score(et, X_train_cvec, y_train).mean()

0.8623608017817371

In [26]:
cross_val_score(et, X_test_cvec, y_test).mean()

0.8450167224080267

In [27]:
print(f'Extra Trees (default hyperparameters) score on training set: {et.score(X_train_cvec, y_train)}')
print(f'Extra Tress (default hyperparameters) score on testing set: {et.score(X_test_cvec, y_test)}')

Extra Trees (default hyperparameters) score on training set: 0.999554565701559
Extra Tress (default hyperparameters) score on testing set: 0.8643954575818303


### `Random + CountVectorizer Pipe` 

_Both Random Forest and Extra Trees Classifiers are **severely** overfit; next will move to a Pipeline and will tune a few hyperparameters._

In [30]:
pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('rf', RandomForestClassifier(max_features=0.6, max_depth=5))
])

In [31]:
cross_val_score(pipe, X_train, y_train, cv=3).mean()

0.8171525760969353

In [32]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('cvec',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('rf', RandomForestClassifier(max_depth=5, max_features=0.6))])

In [33]:
pipe.score(X_train, y_train)

0.8356347438752784

In [34]:
pipe.score(X_test, y_test)

0.8156312625250501

_Less overfit!_

### `Extra Trees + CountVectorizer Pipeline` 

In [35]:
pipe_et = Pipeline([
    ('cvec', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('et', ExtraTreesClassifier(max_features=0.6, max_depth=5))
])

In [36]:
pipe_et.fit(X_train, y_train)

Pipeline(steps=[('cvec',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('et', ExtraTreesClassifier(max_depth=5, max_features=0.6))])

In [37]:
pipe_et.score(X_train, y_train)

0.856792873051225

In [38]:
pipe_et.score(X_test, y_test)

0.8356713426853707

### `Extra Trees + TfidfVectorizer Pipeline` 

In [41]:
pipe_et_tf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('et', ExtraTreesClassifier(max_features=0.6, max_depth=5))
])

pipe_et_tf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('et', ExtraTreesClassifier(max_depth=5, max_features=0.6))])

In [42]:
pipe_et_tf.score(X_train, y_train)

0.834966592427617

In [43]:
pipe_et_tf.score(X_test, y_test)

0.8176352705410822

_Extra Trees has better accuracy than the Random Forest Pipeline but is still overfit._

### `Extra Trees + TfidfVectorizer Pipeline --> Gridsearch` 

_To save time, will start with a gridsearch using Extra Trees since the accuracy was higher on the above pipeline._

_I will countvectorizer as my transformer because this had higher accuracy than tfidf._

_Hyperparameters are also informed by the best parameters found in the Decision Tree grid search in another notebook._ 

In [46]:
# building cvec pipeline in two stages 
pipe_rf_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('et', ExtraTreesClassifier())
])

# hyperparamters for transformer and estimator 
pipe_params_rf_cvec = {
    'cvec__max_features':[2000, 3000, 4000],
    'cvec__ngram_range': [(1,2),(1,3)],
    'cvec__min_df': [0.75, 1],
    'et__ccp_alpha': [0.001, 0.1, 0, 1],
    'et__max_depth': [4,6,8],
    'et__min_samples_leaf': [4,6,8],
    'et__min_samples_split': [5,10]
}

# gridsearch instantiation 
grid_rf_cvec = GridSearchCV(pipe_rf_cvec,
                            pipe_params_rf_cvec,
                            cv = 5,
                            verbose = 2, 
                            n_jobs = 8
)

In [45]:
# {'dt__ccp_alpha': 0.001,
#  'dt__max_depth': 8,
#  'dt__min_samples_leaf': 6,
#  'dt__min_samples_split': 5,
#  'tfidf__max_features': 3000,
#  'tfidf__min_df': 1,
#  'tfidf__ngram_range': (1, 3)}

In [47]:
import time 

t0 = time.time()

# gridsearching on training data
grid_rf_cvec.fit(X_train, y_train)

print(f'This cell took {time.time() - t0} seconds to run')

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan       

This cell took 2968.1651360988617 seconds to run


In [48]:
grid_rf_cvec.best_score_

0.7841870824053453

In [49]:
grid_rf_cvec.best_params_

{'cvec__max_features': 2000,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 3),
 'et__ccp_alpha': 0.001,
 'et__max_depth': 8,
 'et__min_samples_leaf': 8,
 'et__min_samples_split': 5}

In [50]:
print(f'Extra Tress Gridsearch Training Score: {grid_rf_cvec.score(X_train, y_train)}')
print(f'Extra Tress Gridsearch Testing Score: {grid_rf_cvec.score(X_test, y_test)}')

Extra Tress Gridsearch Training Score: 0.7866369710467706
Extra Tress Gridsearch Testing Score: 0.7682030728122913


_poor accuracy but not too overfit_