# Exploring Random Forests

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier

In [2]:
main_df = pd.read_csv("https://breathecode.herokuapp.com/asset/internal-link?id=421&path=diabetes.csv")

In [3]:
main_df.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [86]:
train_df, test_df = train_test_split(main_df, test_size=0.17, random_state=4)

In [8]:
main_df.shape

(768, 9)

In [9]:
train_df.shape

(614, 9)

In [87]:
X_train = train_df.drop(columns="Outcome")
y_train = train_df["Outcome"]

X_test = test_df.drop(columns="Outcome")
y_test = test_df["Outcome"]

In [76]:
#this function makes predictions and prints out various evaluation metrics
def model_evaluator(X_matrix, y_target, model):
    preds = model.predict(X_matrix)
    
    print(f"Accuracy score: {round(accuracy_score(y_true = y_target, y_pred = preds),4)}")
    print(f"Precision score: {round(precision_score(y_true = y_target, y_pred = preds),4)}")
    print(f"Recall score: {round(recall_score(y_true = y_target, y_pred = preds),4)}")
    print(f"F1 score:{round(f1_score(y_true = y_target, y_pred = preds),4)}")
    print(f"Confusion matrix: \n{confusion_matrix(y_true = y_target, y_pred = preds)}")
    print(f"Classification report: \n{classification_report(y_true = y_target, y_pred = preds)}")

In [88]:
rf_model = RandomForestClassifier(random_state=101)

rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [89]:
model_evaluator(X_train, y_train, rf_model)

Accuracy score: 1.0
Precision score: 1.0
Recall score: 1.0
F1 score:1.0
Confusion matrix: 
[[413   0]
 [  0 224]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       413
           1       1.00      1.00      1.00       224

    accuracy                           1.00       637
   macro avg       1.00      1.00      1.00       637
weighted avg       1.00      1.00      1.00       637



In [90]:
model_evaluator(X_test, y_test, rf_model)

Accuracy score: 0.7863
Precision score: 0.7222
Recall score: 0.5909
F1 score:0.65
Confusion matrix: 
[[77 10]
 [18 26]]
Classification report: 
              precision    recall  f1-score   support

           0       0.81      0.89      0.85        87
           1       0.72      0.59      0.65        44

    accuracy                           0.79       131
   macro avg       0.77      0.74      0.75       131
weighted avg       0.78      0.79      0.78       131



In [91]:
np.round(pd.DataFrame(rf_model.feature_importances_, rf_model.feature_names_in_) * 100, 2)

Unnamed: 0,0
Pregnancies,8.85
Glucose,23.69
BloodPressure,9.31
SkinThickness,7.03
Insulin,7.61
BMI,16.74
DiabetesPedigreeFunction,12.55
Age,14.23


### Our Random Forest model is overfitting on the training data, Let's tune the hyperparameters

In [92]:
hyperparameter_dict = {
    "n_estimators": [20, 30, 100, 275],
    "max_depth": [None, 3, 5],
    "min_samples_split": [2, 6, 18],
    "min_samples_leaf": [2, 6, 18],
    "max_features": [None, 1, 6],
}

In [93]:
gridsearch_model = GridSearchCV(estimator=RandomForestClassifier(), 
                                    param_grid=hyperparameter_dict, 
                                    scoring="accuracy", 
                                    verbose=3,
                                    cv=2)

In [94]:
gridsearch_model.fit(X_train, y_train)

Fitting 2 folds for each of 324 candidates, totalling 648 fits
[CV 1/2] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=20;, score=0.765 total time=   0.0s
[CV 2/2] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=20;, score=0.726 total time=   0.0s
[CV 1/2] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=30;, score=0.771 total time=   0.1s
[CV 2/2] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=30;, score=0.733 total time=   0.1s
[CV 1/2] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.762 total time=   0.2s
[CV 2/2] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.758 total time=   0.2s
[CV 1/2] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=275;,

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'max_depth': [None, 3, ...], 'max_features': [None, 1, ...], 'min_samples_leaf': [2, 6, ...], 'min_samples_split': [2, 6, ...], ...}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,2
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,30
,criterion,'gini'
,max_depth,3
,min_samples_split,18
,min_samples_leaf,18
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# first set of training data @ 20%
model_evaluator(X_train, y_train, gridsearch_model)

Accuracy score: 0.772
Precision score: 0.7468
Recall score: 0.5324
F1 score:0.6216
Confusion matrix: 
[[359  39]
 [101 115]]
Classification report: 
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       398
           1       0.75      0.53      0.62       216

    accuracy                           0.77       614
   macro avg       0.76      0.72      0.73       614
weighted avg       0.77      0.77      0.76       614



In [None]:
# first set of training data @ 20%
model_evaluator(X_test, y_test, gridsearch_model)

Accuracy score: 0.8247
Precision score: 0.7551
Recall score: 0.7115
F1 score:0.7327
Confusion matrix: 
[[90 12]
 [15 37]]
Classification report: 
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       102
           1       0.76      0.71      0.73        52

    accuracy                           0.82       154
   macro avg       0.81      0.80      0.80       154
weighted avg       0.82      0.82      0.82       154



In [61]:
np.round(pd.DataFrame(gridsearch_model.best_estimator_.feature_importances_,gridsearch_model.feature_names_in_)*100,2).sort_values(by=0, ascending=False)

Unnamed: 0,0
Glucose,58.1
BMI,18.15
Age,15.0
DiabetesPedigreeFunction,4.62
Pregnancies,1.65
SkinThickness,1.35
Insulin,0.75
BloodPressure,0.37


In [72]:
# second set of training data @ 15%
model_evaluator(X_train, y_train, gridsearch_model)

Accuracy score: 0.7807
Precision score: 0.7515
Recall score: 0.557
F1 score:0.6398
Confusion matrix: 
[[382  42]
 [101 127]]
Classification report: 
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       424
           1       0.75      0.56      0.64       228

    accuracy                           0.78       652
   macro avg       0.77      0.73      0.74       652
weighted avg       0.78      0.78      0.77       652



In [73]:
# second set of training data @ 15%
model_evaluator(X_test, y_test, gridsearch_model)

Accuracy score: 0.8017
Precision score: 0.7576
Recall score: 0.625
F1 score:0.6849
Confusion matrix: 
[[68  8]
 [15 25]]
Classification report: 
              precision    recall  f1-score   support

           0       0.82      0.89      0.86        76
           1       0.76      0.62      0.68        40

    accuracy                           0.80       116
   macro avg       0.79      0.76      0.77       116
weighted avg       0.80      0.80      0.80       116



In [84]:
# third set of training data @ 25%
model_evaluator(X_train, y_train, gridsearch_model)

Accuracy score: 0.9028
Precision score: 0.8842
Recall score: 0.8317
F1 score:0.8571
Confusion matrix: 
[[352  22]
 [ 34 168]]
Classification report: 
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       374
           1       0.88      0.83      0.86       202

    accuracy                           0.90       576
   macro avg       0.90      0.89      0.89       576
weighted avg       0.90      0.90      0.90       576



In [85]:
# third set of training data @ 25%
model_evaluator(X_test, y_test, gridsearch_model)

Accuracy score: 0.7917
Precision score: 0.6857
Recall score: 0.7273
F1 score:0.7059
Confusion matrix: 
[[104  22]
 [ 18  48]]
Classification report: 
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       126
           1       0.69      0.73      0.71        66

    accuracy                           0.79       192
   macro avg       0.77      0.78      0.77       192
weighted avg       0.80      0.79      0.79       192



In [95]:
# fourth set of training data @ 17%
model_evaluator(X_train, y_train, gridsearch_model)

Accuracy score: 0.7786
Precision score: 0.7485
Recall score: 0.558
F1 score:0.6394
Confusion matrix: 
[[371  42]
 [ 99 125]]
Classification report: 
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       413
           1       0.75      0.56      0.64       224

    accuracy                           0.78       637
   macro avg       0.77      0.73      0.74       637
weighted avg       0.77      0.78      0.77       637



In [96]:
# fourth set of training data @ 17%
model_evaluator(X_test, y_test, gridsearch_model)

Accuracy score: 0.8092
Precision score: 0.7317
Recall score: 0.6818
F1 score:0.7059
Confusion matrix: 
[[76 11]
 [14 30]]
Classification report: 
              precision    recall  f1-score   support

           0       0.84      0.87      0.86        87
           1       0.73      0.68      0.71        44

    accuracy                           0.81       131
   macro avg       0.79      0.78      0.78       131
weighted avg       0.81      0.81      0.81       131

