# Explore here

In [55]:
# Your code here
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import plot_tree

In [23]:
main_df = pd.read_csv('https://breathecode.herokuapp.com/asset/internal-link?id=421&path=diabetes.csv')

main_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [24]:
main_df.shape

(768, 9)

In [None]:
train_df, test_df = train_test_split(main_df, test_size = 0.2, random_state=4)

In [26]:
X_train = train_df.drop(columns=['Outcome'])
y_train = train_df['Outcome']

X_test = test_df.drop(columns=['Outcome'])
y_test = test_df['Outcome']


In [27]:
#this function makes predictions and prints out various evaluation metrics
def model_evaluator(X_matrix, y_target, model):
    preds = model.predict(X_matrix)
    
    print(f"Accuracy score: {round(accuracy_score(y_true = y_target, y_pred = preds),4)}")
    print(f"Precision score: {round(precision_score(y_true = y_target, y_pred = preds),4)}")
    print(f"Recall score: {round(recall_score(y_true = y_target, y_pred = preds),4)}")
    print(f"F1 score:{round(f1_score(y_true = y_target, y_pred = preds),4)}")
    print(f"Confusion matrix: \n{confusion_matrix(y_true = y_target, y_pred = preds)}")
    print(f"Classification report: \n{classification_report(y_true = y_target, y_pred = preds)}")

In [28]:
rf_model = RandomForestClassifier(random_state=101)

rf_model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
model_evaluator(X_train, y_train, rf_model)

Accuracy score: 1.0
Precision score: 1.0
Recall score: 1.0
F1 score:1.0
Confusion matrix: 
[[402   0]
 [  0 212]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       402
           1       1.00      1.00      1.00       212

    accuracy                           1.00       614
   macro avg       1.00      1.00      1.00       614
weighted avg       1.00      1.00      1.00       614



In [None]:
model_evaluator(X_test, y_test, rf_model)

Accuracy score: 0.7403
Precision score: 0.6538
Recall score: 0.6071
F1 score:0.6296
Confusion matrix: 
[[80 18]
 [22 34]]
Classification report: 
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        98
           1       0.65      0.61      0.63        56

    accuracy                           0.74       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154



In [None]:
np.round(pd.DataFrame(rf_model.feature_importances_, rf_model.feature_names_in_)*100,2)

Unnamed: 0,0
Pregnancies,8.9
Glucose,26.53
BloodPressure,8.61
SkinThickness,7.02
Insulin,6.69
BMI,16.84
DiabetesPedigreeFunction,10.82
Age,14.6


### That wasn't great--our Random Forest model is overfitting on the training data. Let's tune the hyperparameters

In [32]:
hyperparameter_dict = {'n_estimators': [50, 100, 250],
                       'max_depth': [None, 5, 10],
                       'min_samples_split': [2, 4, 8],
                       'min_samples_leaf': [2, 4, 8]
                       }


In [38]:
gridsearch_model = GridSearchCV(estimator=RandomForestClassifier(),
                                param_grid=hyperparameter_dict,
                                scoring = 'accuracy',
                                verbose = 3,
                                cv=4)

In [39]:
gridsearch_model.fit(X_train, y_train)

Fitting 4 folds for each of 81 candidates, totalling 324 fits
[CV 1/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.773 total time=   0.1s
[CV 2/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.773 total time=   0.1s
[CV 3/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.797 total time=   0.1s
[CV 4/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.745 total time=   0.1s
[CV 1/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.766 total time=   0.1s
[CV 2/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.792 total time=   0.1s
[CV 3/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.791 total time=   0.1s
[CV 4/4] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.732 t

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'max_depth': [None, 5, ...], 'min_samples_leaf': [2, 4, ...], 'min_samples_split': [2, 4, ...], 'n_estimators': [50, 100, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,4
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,250
,criterion,'gini'
,max_depth,
,min_samples_split,8
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [40]:
model_evaluator(X_train, y_train, gridsearch_model)

Accuracy score: 0.9186
Precision score: 0.9323
Recall score: 0.8287
F1 score:0.8775
Confusion matrix: 
[[385  13]
 [ 37 179]]
Classification report: 
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       398
           1       0.93      0.83      0.88       216

    accuracy                           0.92       614
   macro avg       0.92      0.90      0.91       614
weighted avg       0.92      0.92      0.92       614



In [41]:
model_evaluator(X_test, y_test, gridsearch_model)

Accuracy score: 0.7857
Precision score: 0.6939
Recall score: 0.6538
F1 score:0.6733
Confusion matrix: 
[[87 15]
 [18 34]]
Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       102
           1       0.69      0.65      0.67        52

    accuracy                           0.79       154
   macro avg       0.76      0.75      0.76       154
weighted avg       0.78      0.79      0.78       154



In [53]:
np.round(pd.DataFrame(gridsearch_model.best_estimator_.feature_importances_,gridsearch_model.feature_names_in_)*100,3).sort_values(0, ascending=False)

Unnamed: 0,0
Glucose,27.437
BMI,16.792
Age,15.49
DiabetesPedigreeFunction,11.745
Pregnancies,7.849
BloodPressure,7.672
Insulin,6.525
SkinThickness,6.49


In [54]:
main_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [58]:
gridsearch_model.best_estimator_.decision_path()

TypeError: BaseForest.decision_path() missing 1 required positional argument: 'X'

# A MORE IDEAL WORKFLOW FOR PURER DATA SCIENCE

In [62]:
def splitter(num):
    train_df, test_df = train_test_split(main_df, test_size = num, random_state=4)

    X_train = train_df.drop(columns=['Outcome'])
    y_train = train_df['Outcome']
    X_test = test_df.drop(columns=['Outcome'])
    y_test = test_df['Outcome']

    return X_train, X_test, y_train, y_test


In [76]:
hyperparameter_dict = {'n_estimators': [50, 100],
                       'max_depth': [None, 5],
                       'min_samples_leaf': [2, 4]
                       }

def model_maker(X_train, X_test, y_train, y_test):
    gridsearch_model = GridSearchCV(estimator=RandomForestClassifier(),
                                param_grid=hyperparameter_dict,
                                scoring = 'accuracy',
                                cv=4)
    
    gridsearch_model.fit(X_train, y_train)
    print(gridsearch_model.best_params_)
    #print(gridsearch_model.best_params_)
    print("TRAINING DATA")
    model_evaluator(X_train, y_train, gridsearch_model)
    print("TESTING DATA")
    model_evaluator(X_test, y_test, gridsearch_model)

In [78]:
for i in [0.1, 0.15, 0.2, 0.25, 0.3]:
    print('*****************************')
    print(f'Data split of {i*100}%')
    sub_X_train, sub_X_test, sub_y_train, sub_y_test = splitter(i)
    model_maker(sub_X_train, sub_X_test, sub_y_train, sub_y_test)

*****************************
Data split of 10.0%
{'max_depth': 5, 'min_samples_leaf': 4, 'n_estimators': 100}
TRAINING DATA
Accuracy score: 0.8336
Precision score: 0.836
Recall score: 0.6529
F1 score:0.7332
Confusion matrix: 
[[418  31]
 [ 84 158]]
Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.93      0.88       449
           1       0.84      0.65      0.73       242

    accuracy                           0.83       691
   macro avg       0.83      0.79      0.81       691
weighted avg       0.83      0.83      0.83       691

TESTING DATA
Accuracy score: 0.8182
Precision score: 0.7727
Recall score: 0.6538
F1 score:0.7083
Confusion matrix: 
[[46  5]
 [ 9 17]]
Classification report: 
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        51
           1       0.77      0.65      0.71        26

    accuracy                           0.82        77
   macro avg       0.8

In [79]:
gridsearch_model.best_params_

{'max_depth': None,
 'min_samples_leaf': 4,
 'min_samples_split': 8,
 'n_estimators': 250}

In [1]:
plt.figure(figsize=(50,50))
plot_tree(gridsearch_model.best_estimator_.estimators_[2], feature_names=X_train.columns)
plt.show()
plt.savefig('fig1.png');

NameError: name 'plt' is not defined