In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score #classification_report, confusion_matrix

In [2]:
# Read the four pickled datasets (X / y for the train and test splits) from ./data.
# NOTE: unpickling can execute arbitrary code, so these files must come from a trusted source.
X_train, X_test = (
    pd.read_pickle('./data/X_train_pickle.pkl'),
    pd.read_pickle('./data/X_test_pickle.pkl'),
)
y_train, y_test = (
    pd.read_pickle('./data/y_train_pickle.pkl'),
    pd.read_pickle('./data/y_test_pickle.pkl'),
)

# Random Forest Classifier

In [26]:
# Search-space settings for tuning the random forest with GridSearchCV.
# Edit these three values to widen or narrow the search.
cv = 5                             # value passed as GridSearchCV's `cv` (fold count)
est_range = [*range(120, 161, 5)]  # candidate n_estimators: 120, 125, ..., 160
depth_range = [*range(16, 22)]     # candidate max_depth: 16 through 21 inclusive

In [None]:
# This only needs to be run on the first use of this notebook, or if changes have been made to the variables above.
rfc = RandomForestClassifier()

param_grid = [                     # GridSearchCV params requires a 'list', so we created a dictionary within the list to pass multiple params.
    {'n_estimators': est_range, 
     'max_depth': depth_range}
] 

grid_search = GridSearchCV(rfc, param_grid, cv=cv,          # Just passing in the variables declared above
                          scoring='neg_mean_squared_error', # base scoring on the NMSE - higher return values are better than lower return values
                          return_train_score=True,          # 
                          verbose=50)                       # verbose > 0 gives us a progress bar to check on.

# running a grid search through range of estimators and range of depth resulting in 48 fit tests.
grid_search.fit(X_train, y_train) 
%store grid_search

In [15]:
%store -r grid_search
print("Best depth and estimator params are:", grid_search.best_params_)
print("Best Overall RFC parameters is: \n", grid_search.best_estimator_)

Best depth and estimator params are: {'max_depth': 19, 'n_estimators': 140}
Best Overall RFC parameters is: 
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=140,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [8]:
# predicting y hat
rfc_pred = grid_search.predict(X_test)

# checking accuracy
RF_accuracy = round(accuracy_score(y_test, rfc_pred)*100, 2)

# checking F1 Score
RF_f1 = round(f1_score(y_test, rfc_pred)*100, 2)

# Storing RF scores for comparisons.
rf = {
    'accuracy': RF_accuracy,
    'f1': RF_f1
}

print("After hyper tuning the RF model, the best accuracy we could compute was {} with a f1 score of {}".format(RF_accuracy, RF_f1))
%store rf

After hyper tuning the RF model, the best accuracy we could compute was 85.21 with a f1 score of 66.44
Stored 'rf' (dict)
