# Twitter Sentiment Analysis - POC
---

## 7. Quickly train various models

**Current State**: I've trained three baseline models with little tweaking, and fine-tuned a single decision tree, on a Bag-of-Word subset of $m\approx250k, n=50k$ of the training data, using cross validation, and got the following mean accuracy scores:

- Logistic Regression: $78.8\%$
- Naive Bayes: $77.6\%$ 
- SGD (log loss): $77.1\%$ 
- Decision Tree: $69.0\%$

**This Notebook**: Build random forests (DIY and using Scikit-learn's RandomForestClassifier class), perhaps even a tiny GridSearchCV with choice parameters to see whether I can crack $80\%$ accuracy on the test set.

In [1]:
import re
import os
import time
import json

import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Load $m\approx250k$, $n=50k$ training subset

In [2]:
# processed dir
proc_dir = os.path.join("..","data","3_processed","sentiment140")
X_train_transformed = sp.load_npz(os.path.join(proc_dir, "X_train_transformed_BoW_250k_50k.npz"))
with open(os.path.join(proc_dir, "y_array_250k.npy"), 'rb') as f:
    y_array = np.load(f)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_train_transformed, 
                                                    y_array, 
                                                    test_size=0.2, 
                                                    random_state=42)

### Scikit-learn's RandomForestClassifier

[(source)](https://github.com/scikit-learn/scikit-learn/blob/0fb307bf3/sklearn/ensemble/_forest.py#L883)

```
class sklearn.ensemble.RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None
```

Note that `max_features`'s default is "auto", which is the same as "sqrt", meaning: `max_features=sqrt(n_features)`

In our case: $\sqrt{50000}\approx224$ max features. This is arguably the reason this will run a lot faster than my DIY random forest, which considered all the features.

[(source)](https://github.com/scikit-learn/scikit-learn/blob/0fb307bf39bbdacd6ed713c00724f8f871d60370/sklearn/tree/_classes.py#L597)

```
class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):
    [...]
    max_features : int, float or {"auto", "sqrt", "log2"}, default=None
        The number of features to consider when looking for the best split:
            [...]
            - If None, then `max_features=n_features`.
```         

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
#scores = cross_val_score(rf_clf, X_train_transformed, y_array, cv=3, verbose=0, scoring='accuracy')
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#Accuracy: 0.75 (+/- 0.00)

In [5]:
#rf_clf.fit(X_train, y_train) 6.6min

In [6]:
#y_pred = rf_clf.predict(X_test)

In [7]:
#round(accuracy_score(y_test, y_pred), 4) # 0.7528

### GridSearch 1

* $\text{n_estimators}=100$, vary `max_depth`

In [20]:
start_gridsearch = time.time()

depth, runtime, scores = [], [], []
for i in range(1, 10):
    depth.append(i)
    start_loop = time.time()
    clf = RandomForestClassifier(n_estimators=100, 
                                 max_features="sqrt",
                                 random_state=42,
                                 max_depth=i, 
                                 n_jobs=-1)
    scores.append(
        cross_val_score(clf, 
                        X_train_transformed, 
                        y_array, 
                        cv=3, 
                        verbose=0, 
                        scoring='accuracy')
                 )
    runtime.append(round(time.time() - start_loop, 1))
    print(f'Finished loop {i:0.0f} | runtime: {runtime[0]:0.2f}s')
    
mins, secs = divmod(time.time() - start_gridsearch, 60)
print(f'Gridsearch total time: {mins:0.0f} mins and {secs:0.0f} secs')

Finished loop 1 | runtime: 7.70s
Finished loop 2 | runtime: 7.70s
Finished loop 3 | runtime: 7.70s
Finished loop 4 | runtime: 7.70s
Finished loop 5 | runtime: 7.70s
Finished loop 6 | runtime: 7.70s
Finished loop 7 | runtime: 7.70s
Finished loop 8 | runtime: 7.70s
Finished loop 9 | runtime: 7.70s
Gridsearch total time: 2 mins and 24 secs


In [26]:
df1 = pd.DataFrame(
    {'depth': depth,
     'runtime': runtime,
     'min acc': [np.min(score) for score in scores],
     'mean acc': [np.mean(score) for score in scores],
     'max acc': [np.max(score) for score in scores]
    })
df1

Unnamed: 0,depth,runtime,min acc,mean acc,max acc
0,1,7.7,0.608496,0.634116,0.659867
1,2,9.7,0.657743,0.676444,0.687087
2,3,11.8,0.680446,0.692526,0.699188
3,4,14.0,0.689071,0.702463,0.710661
4,5,16.1,0.700762,0.711594,0.719238
5,6,17.9,0.705844,0.715522,0.720801
6,7,19.8,0.716892,0.722498,0.72566
7,8,22.5,0.717536,0.724386,0.731315
8,9,24.6,0.720661,0.726216,0.732019


In [17]:
start_gridsearch = time.time()

depth, runtime, accuracy = [], [], []
for i in range(10, 100, 10):
    depth.append(i)
    start_clf = time.time()
    clf = RandomForestClassifier(n_estimators=100, 
                                 max_depth=i, 
                                 n_jobs=6, 
                                 random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy.append(round(accuracy_score(y_test, y_pred), 4))
    runtime.append(round(time.time() - start_clf, 1))
    
mins, secs = divmod(time.time() - start_gridsearch, 60)
print(f'Gridsearch total time: {mins:0.0f} mins and {secs:0.0f} secs')

Gridsearch total time: 15 mins and 25 secs


In [18]:
df2 = pd.DataFrame(
    {'depth': depth,
     'runtime': runtime,
     'accuracy': accuracy
    })
df2

Unnamed: 0,depth,runtime,accuracy
0,10,13.4,0.7309
1,20,27.9,0.7355
2,30,43.4,0.7442
3,40,63.0,0.7494
4,50,83.6,0.7531
5,60,111.5,0.7586
6,70,158.9,0.7607
7,80,193.1,0.7643
8,90,230.1,0.7671


In [19]:
start_gridsearch = time.time()

depth, runtime, accuracy = [], [], []
for i in range(100, 1100, 100):
    depth.append(i)
    start_clf = time.time()
    clf = RandomForestClassifier(n_estimators=100, 
                                 max_depth=i, 
                                 n_jobs=6, 
                                 random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy.append(round(accuracy_score(y_test, y_pred), 4))
    runtime.append(round(time.time() - start_clf, 1))
    
mins, secs = divmod(time.time() - start_gridsearch, 60)
print(f'Gridsearch total time: {mins:0.0f} mins and {secs:0.0f} secs')

Gridsearch total time: 98 mins and 58 secs


In [20]:
df3 = pd.DataFrame(
    {'depth': depth,
     'runtime': runtime,
     'accuracy': accuracy
    })
df3

Unnamed: 0,depth,runtime,accuracy
0,100,216.3,0.7695
1,200,531.9,0.7777
2,300,867.3,0.7795
3,400,588.4,0.7793
4,500,598.6,0.7806
5,600,584.9,0.7808
6,700,647.2,0.7781
7,800,637.5,0.7799
8,900,604.2,0.7794
9,1000,661.8,0.7789


In [25]:
rf_gridsearch = pd.concat([df1,df2,df3])

In [26]:
# save gridsearch results
model_tuning_dir = os.path.join("..","data","4_models","sentiment140","tuning")

try:
    os.stat(model_tuning_dir)
except:
    os.mkdir(model_tuning_dir)
    
filepath = os.path.join(model_tuning_dir, "POC7_rf_gridsearch1.csv")
rf_gridsearch.to_csv(filepath, index=False)

### GridSearch 2

* $\text{max_depth}=500$, vary `n_estimators`

In [None]:
start_gridsearch = time.time()

ntrees, runtime, accuracy = [], [], []
for i in range(100, 500, 100):
    ntrees.append(i)
    start_clf = time.time()
    clf = RandomForestClassifier(n_estimators=i,
                                 max_depth=500, 
                                 n_jobs=6, 
                                 random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy.append(round(accuracy_score(y_test, y_pred), 4))
    runtime.append(round(time.time() - start_clf, 1))
    
mins, secs = divmod(time.time() - start_gridsearch, 60)

In [34]:
print(f'Gridsearch total time: {mins:0.0f} mins and {secs:0.0f} secs')

Gridsearch total time: 165 mins and 11 secs


In [39]:
df4 = pd.DataFrame(
    {'ntrees': ntrees,
     'runtime': runtime,
     'accuracy': accuracy
    })
df4

Unnamed: 0,ntrees,runtime,accuracy
0,100,543.0,0.7806
1,200,1162.2,0.7826
2,300,1770.1,0.7843
3,400,2259.7,0.7843


In [40]:
# time notebook
mins, secs = divmod(time.time() - start_notebook, 60)
print(f'Total running time: {mins:0.0f} minute(s) and {secs:0.0f} second(s).')

Total running time: 378 minute(s) and 0 second(s).


---