In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score 

In [3]:
# Load the pre-split train/test features and labels in one ordered pass.
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only point it at
# files produced by a trusted step of this project.
X_train, X_test, y_train, y_test = map(
    pd.read_pickle,
    (
        './data/X_train_pickle.pkl',
        './data/X_test_pickle.pkl',
        './data/y_train_pickle.pkl',
        './data/y_test_pickle.pkl',
    ),
)

# Random Forest Classifier

In [5]:
# Sweep the forest size (n_estimators) over 50..150 inclusive and record the
# accuracy and F1 score obtained on X_test / y_test for every setting.
# NOTE(review): every score here is measured on the test split, so choosing the
# "best" setting from `scores` uses test data for model selection - consider a
# separate validation split or cross-validation.
scores = {}
for num in range(50, 151):
    rfc = RandomForestClassifier(
        n_estimators=num,   # number of trees - the value being swept
        max_depth=8,        # depth cap held constant across the sweep
        random_state=23,    # fixed seed so every run is repeatable
    )
    rfc.fit(X_train, y_train)

    # Predicted labels for the held-out rows
    rfc_pred = rfc.predict(X_test)

    # Raw metrics as returned by scikit-learn
    acc_score = accuracy_score(y_test, rfc_pred)
    f1_sc = f1_score(y_test, rfc_pred)

    # Percentage versions rounded to two decimals.
    # NOTE(review): not read by any visible cell and overwritten on each pass;
    # kept in case a later cell relies on them - confirm and drop if unused.
    RF_accuracy = round(acc_score*100, 2)
    RF_f1 = round(f1_sc*100, 2)

    # Store [accuracy, f1] under the tree count, then print it as a progress marker.
    scores[num] = [acc_score, f1_sc]
    print(num)

In [13]:
# Pick the setting (key of `scores`) that maximises accuracy and the one that
# maximises F1. `scores` maps each tried setting to [accuracy, f1].
if not scores:
    raise ValueError("`scores` is empty - run the hyperparameter sweep cell first.")

# max() returns the first maximal entry in the dict's iteration order, which is
# the same winner a strict ">" scan picks on ties. Unlike a scan that starts
# from 0, it also binds every name below when all scores for a metric are 0
# (a starting value of 0 is never beaten then, and the prints hit a NameError).
acc_deg, (high_acc, this_f1) = max(scores.items(), key=lambda item: item[1][0])
f1_deg, (this_acc, high_f1) = max(scores.items(), key=lambda item: item[1][1])

print("A degree of {} results in the highest accuracy of {} but with an f1 score of {}".format(acc_deg, round(high_acc, 4), round(this_f1, 4)))
print("A degree of {} results in the highest f1 score of {} but with an accuracy of {}".format(f1_deg, round(high_f1, 4), round(this_acc, 4)))

A degree of 66 results in the highest accuracy of 0.8451 but with an f1 score of 0.6268
A degree of 51 results in the highest f1 score of 0.6281 but with an accuracy of 0.8446


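A setting of 66 trees (the "degree" reported above, i.e. `n_estimators = 66`) is the best choice here because we value accuracy the most: it gives the highest accuracy (0.8451), and its F1 score (0.6268) is only marginally below the best F1 found (0.6281 at 51 trees).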