In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

# Load the pre-split feature matrices and label vectors from local pickles.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './data/X_train_pickle.pkl',
        './data/X_test_pickle.pkl',
        './data/y_train_pickle.pkl',
        './data/y_test_pickle.pkl',
    )
)

# Random Forest Classifier

In [2]:
# Search settings -- tune these before launching the grid search below.
cv = 5                                 # number of cross-validation folds per candidate
n_est_range = [*range(120, 161, 5)]    # candidate n_estimators values: 120, 125, ..., 160
depth_range = [*range(16, 22)]         # candidate max_depth values: 16 through 21

In [3]:
# Run this cell once to save results to your disk.
# Will take about <5 minutes to run. 
rfc = RandomForestClassifier()
param_grid = [{'n_estimators': n_est_range, 
               'max_depth': depth_range}] 

rf_grid_search = GridSearchCV(rfc, param_grid, cv=cv,   # Just passing in the variables declared above
                              scoring='f1',             # score best models based on F1 score
                              n_jobs=-2,                # Uses all but one of machine's processors.
                              return_train_score=True,  # ???
                              verbose=50)               # verbose > 0 gives us a progress bar to check on.
rf_grid_search.fit(X_train, y_train) 
%store rf_grid_search

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-2)]: Done   2 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-2)]: Done   5 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-2)]: Done   6 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-2)]: Done   7 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-2)]: Done   8 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-2)]: Done   9 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done  12 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done  13 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-2)]: Done  14 tasks      | elapsed:   

In [4]:
# Display the best estimator found by the grid search; its repr lists the
# selected hyper-parameters (e.g. max_depth and n_estimators).
rf_grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=21, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=135,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# End of GridSearch -> Calculate Results

In [5]:
%store -r rf_grid_search

rfc_pred = rf_grid_search.predict(X_test)                     # predicting y hat
RF_accuracy = round(accuracy_score(y_test, rfc_pred)*100, 2)  # calculating accuracy
RF_f1 = round(rf_grid_search.best_score_*100, 2)              # calculating F1 Score

print("After hyper tuning the RF model, the best accuracy we could compute was {} with a f1 score of {}".format(RF_accuracy, RF_f1))

rf = {'accuracy': RF_accuracy,
      'f1': RF_f1 }
%store rf

After hyper tuning the RF model, the best accuracy we could compute was 85.36 with a f1 score of 67.86
Stored 'rf' (dict)
