In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
X_train = pd.read_pickle('./data/X_train_pickle.pkl')
X_test = pd.read_pickle('./data/X_test_pickle.pkl')
y_train = pd.read_pickle('./data/y_train_pickle.pkl')
y_test = pd.read_pickle('./data/y_test_pickle.pkl')

# Random Forest Classifier

In [26]:
# Hyper tuning RFC within a range of degrees
cv = 5                             # Set how many cross validations you would like.
est_range = list(range(120,161,5)) # Set the range of estimators.
depth_range = list(range(16,22))   # Set the range of depth.

rfc = RandomForestClassifier()
param_grid = [
    {'n_estimators': est_range, 
     'max_depth': depth_range}
] 

grid_search = GridSearchCV(rfc, param_grid, cv=cv,          # Just passing in the variables declared above
                          scoring='neg_mean_squared_error', # base scoring on the NMSE - higher return values are better than lower return values
                          return_train_score=True,          # 
                          verbose=50)                       # verbose > 0 gives us a progress bar to check on.

In [None]:
# run of the mill fit method with training data.
grid_search.fit(X_train, y_train) 
%store grid_search

In [3]:
%store -r grid_search
grid_search.best_params_

{'max_depth': 19, 'n_estimators': 140}

In [4]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=140,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
# predicting y hat
rfc_pred = grid_search.predict(X_test)

# checking accuracy
acc_score = accuracy_score(y_test, rfc_pred)
RF_accuracy = round(acc_score*100, 2)

# checking F1 Score
f1_sc = f1_score(y_test, rfc_pred)
RF_f1 = round(f1_sc*100, 2)

print("After hyper tuning the RF model, the best accuracy we could compute was {} with a f1 score of {}".format(RF_accuracy, RF_f1))

rf = {
    'accuracy': RF_accuracy,
    'f1': RF_f1
}
%store rf

After hyper tuning the RF model, the best accuracy we could compute was 85.21 with a f1 score of 66.44
Stored 'rf' (dict)
