In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
X_train = pd.read_pickle('./data/X_train_pickle.pkl')
X_test = pd.read_pickle('./data/X_test_pickle.pkl')
y_train = pd.read_pickle('./data/y_train_pickle.pkl')
y_test = pd.read_pickle('./data/y_test_pickle.pkl')

# Random Forest Classifier

In [5]:
scores = {}
# Hyper tuning RFC within a range of degrees
for num in range(50,151):
    rfc = RandomForestClassifier(random_state = 23,     # like np.random.seed
                                 n_estimators=num,      # Why do we use 100 estimators?
                                 max_depth=8            # Why do we use a max of 8?
                                ).fit(X_train, y_train) # run of the mill fit method with training data.
    # predicting y hat
    rfc_pred = rfc.predict(X_test)

    # checking accuracy
    acc_score = accuracy_score(y_test, rfc_pred)
    RF_accuracy = round(acc_score*100, 2)

    # checking F1 Score
    f1_sc = f1_score(y_test, rfc_pred)
    RF_f1 = round(f1_sc*100, 2)
    
    # adding scores to dictionary and printing progress.     
    scores[num] = [acc_score, f1_sc] 
    print(num)

In [13]:
high_acc = 0
high_f1 = 0

# Running through each result of the RandomForestClassifier scores to determine optimal degree for best f1 and/or accuracy.
for degree, (acc_score, f1_sc) in scores.items():
#     comparing accuracy to highest accuracy score
    if acc_score > high_acc:
        high_acc = acc_score
        this_f1 = f1_sc
        acc_deg = degree

#     comparing f1 to highest f1 score
    if f1_sc > high_f1:
        high_f1 = f1_sc
        this_acc = acc_score
        f1_deg = degree
        
print("A degree of {} results in the highest accuracy of {} but with an f1 score of {}".format(acc_deg, round(high_acc, 4), round(this_f1, 4)))
print("A degree of {} results in the highest f1 score of {} but with an accuracy of {}".format(f1_deg, round(high_f1, 4), round(this_acc, 4)))

A degree of 66 results in the highest accuracy of 0.8451 but with an f1 score of 0.6268
A degree of 51 results in the highest f1 score of 0.6281 but with an accuracy of 0.8446


In [1]:
# Hyper tuning RFC within a range of degrees
scores = {}
for depth in range(2,20):
    for estimator in range(50,151):
        rfc = RandomForestClassifier(random_state = 23,     # like np.random.seed
                                     n_estimators=estimator,      # Why do we use 100 estimators?
                                     max_depth=depth            # Why do we use a max of 8?
                                    ).fit(X_train, y_train) # run of the mill fit method with training data.
        # predicting y hat
        rfc_pred = rfc.predict(X_test)

        # checking accuracy
        acc_score = accuracy_score(y_test, rfc_pred)
        RF_accuracy = round(acc_score*100, 2)

        # checking F1 Score
        f1_sc = f1_score(y_test, rfc_pred)
        RF_f1 = round(f1_sc*100, 2)

        # adding scores to dictionary and printing progress.  
        str_depth = str(depth)
        str_est = str(estimator)
        key = str_depth + "-" +str_est
        scores[key] = [acc_score, f1_sc] 
        print(depth, estimator)

NameError: name 'RandomForestClassifier' is not defined

In [5]:
high_acc = 0
high_f1 = 0

# Running through each result of the RandomForestClassifier scores to determine optimal degree for best f1 and/or accuracy.
for degree, (acc_score, f1_sc) in scores.items():
#     comparing accuracy to highest accuracy score
    if acc_score > high_acc:
        high_acc = acc_score
        this_f1 = f1_sc
        acc_deg = degree

#     comparing f1 to highest f1 score
    if f1_sc > high_f1:
        high_f1 = f1_sc
        this_acc = acc_score
        f1_deg = degree
        
print("A depth-degree of {} results in the highest accuracy of {} but with an f1 score of {}".format(acc_deg, round(high_acc, 4), round(this_f1, 4)))
print("A depth-degree of {} results in the highest f1 score of {} but with an accuracy of {}".format(f1_deg, round(high_f1, 4), round(this_acc, 4)))

A depth-degree of 18-97 results in the highest accuracy of 0.8548 but with an f1 score of 0.6692
A depth-degree of 18-94 results in the highest f1 score of 0.6697 but with an accuracy of 0.8546


In [12]:
# Hyper tuning RFC within a range of degrees
cv = 5                             # Set how many cross validations you would like.
est_range = list(range(120,161,5)) # Set the range of estimators.
depth_range = list(range(16,22))   # Set the range of depth.

rfc = RandomForestClassifier()
param_grid = [
    {'n_estimators': est_range, 
     'max_depth': depth_range}
] 

grid_search = GridSearchCV(rfc, param_grid, cv=cv,          # Just passing in the variables declared above
                          scoring='neg_mean_squared_error', # base scoring on the NMSE - higher return values are better than lower return values
                          return_train_score=True,          # 
                          verbose=50)                       # verbose > 0 gives us a progress bar to check on.

# run of the mill fit method with training data.
grid_search.fit(X_train, y_train) 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] max_depth=16, n_estimators=120 ..................................
[CV]  max_depth=16, n_estimators=120, score=(train=-0.102, test=-0.139), total=   1.5s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
[CV] max_depth=16, n_estimators=120 ..................................
[CV]  max_depth=16, n_estimators=120, score=(train=-0.105, test=-0.141), total=   1.4s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s
[CV] max_depth=16, n_estimators=120 ..................................
[CV]  max_depth=16, n_estimators=120, score=(train=-0.103, test=-0.136), total=   1.4s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.0s remaining:    0.0s
[CV] max_depth=16, n_estimators=120 ..................................
[CV]  max_depth=16, n_estimators=120, score=(train=-0.099, test=-0.146),

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [13]:
grid_search.best_params_

{'max_depth': 19, 'n_estimators': 140}

In [14]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=140,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)