In [13]:
import pandas as pd
import numpy as np
# list for column headers
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# open file with pd.read_csv
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", names=names)
print(df.shape)
# print head of data set
print(df.head())

(768, 9)
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1


In [14]:
#We are trying to predict whether a patient has diabetes. This coincides with the ‘class’ column, which will be our independent variable.
X = df.drop('class', axis=1)
y = df['class']

In [15]:
#We’ll use train-test-split to split the data into training data and testing data.
from sklearn.model_selection import train_test_split
# implementing train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)

In [16]:
#Create the random forest model
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
# random forest model creation
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
# predictions
rfc_predict = rfc.predict(X_test)

In [17]:
#Evaluating Performance
#import cross_val_score, classification_report, and confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
#Let us run cross-validation to get a better overview of the results
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')

In [19]:
#Print out the results
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())
#our model’s score based on the roc_auc score, which is .774. 

=== Confusion Matrix ===
[[155  21]
 [ 36  42]]


=== Classification Report ===
             precision    recall  f1-score   support

          0       0.81      0.88      0.84       176
          1       0.67      0.54      0.60        78

avg / total       0.77      0.78      0.77       254



=== All AUC Scores ===
[0.76481481 0.80074074 0.72814815 0.69740741 0.74185185 0.81740741
 0.7962963  0.87185185 0.76384615 0.75769231]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.774005698005698


In [20]:
#Tuning Hyperparameters
from sklearn.model_selection import RandomizedSearchCV
# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)
# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }
# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the model
rfc_random.fit(X_train, y_train)
# print results
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=1000, max_features=auto, max_depth=140 .............
[CV] n_estimators=1000, max_features=auto, max_depth=140 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=140, total=   8.2s
[CV] n_estimators=1000, max_features=auto, max_depth=140 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=140, total=   8.5s
[CV] n_estimators=1400, max_features=auto, max_depth=100 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=140, total=   8.3s
[CV] n_estimators=1400, max_features=auto, max_depth=100 .............
[CV]  n_estimators=1400, max_features=auto, max_depth=100, total=  12.0s
[CV] n_estimators=1400, max_features=auto, max_depth=100 .............
[CV]  n_estimators=1400, max_features=auto, max_depth=100, total=  11.9s
[CV] n_estimators=800, max_features=sqrt, max_depth=260 ..............
[CV]  n_estimators=1400, max_features=auto, max_depth=100, total=  11.3s
[C

[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.1min


[CV]  n_estimators=1000, max_features=sqrt, max_depth=300, total=   7.0s
[CV] n_estimators=1800, max_features=auto, max_depth=420 .............
[CV]  n_estimators=1000, max_features=sqrt, max_depth=300, total=   7.5s
[CV] n_estimators=1800, max_features=auto, max_depth=420 .............
[CV]  n_estimators=1800, max_features=auto, max_depth=420, total=  13.8s
[CV] n_estimators=1800, max_features=auto, max_depth=420 .............
[CV]  n_estimators=1800, max_features=auto, max_depth=420, total=  13.5s
[CV] n_estimators=800, max_features=sqrt, max_depth=300 ..............
[CV]  n_estimators=800, max_features=sqrt, max_depth=300, total=   5.9s
[CV] n_estimators=800, max_features=sqrt, max_depth=300 ..............
[CV]  n_estimators=1800, max_features=auto, max_depth=420, total=  13.6s
[CV] n_estimators=800, max_features=sqrt, max_depth=300 ..............
[CV]  n_estimators=800, max_features=sqrt, max_depth=300, total=   6.3s
[CV] n_estimators=200, max_features=sqrt, max_depth=None ........

[CV]  n_estimators=1800, max_features=auto, max_depth=None, total=  16.2s
[CV] n_estimators=1200, max_features=auto, max_depth=140 .............
[CV]  n_estimators=1800, max_features=auto, max_depth=None, total=  15.9s
[CV] n_estimators=1200, max_features=auto, max_depth=140 .............
[CV]  n_estimators=1200, max_features=auto, max_depth=140, total=  10.5s
[CV] n_estimators=1200, max_features=auto, max_depth=140 .............
[CV]  n_estimators=1200, max_features=auto, max_depth=140, total=  10.7s
[CV] n_estimators=600, max_features=auto, max_depth=460 ..............
[CV]  n_estimators=1200, max_features=auto, max_depth=140, total=  10.8s
[CV] n_estimators=600, max_features=auto, max_depth=460 ..............
[CV]  n_estimators=600, max_features=auto, max_depth=460, total=   5.1s
[CV] n_estimators=600, max_features=auto, max_depth=460 ..............
[CV]  n_estimators=600, max_features=auto, max_depth=460, total=   5.0s
[CV] n_estimators=1800, max_features=sqrt, max_depth=100 ......

[CV]  n_estimators=1000, max_features=auto, max_depth=500, total=   7.3s
[CV] n_estimators=1400, max_features=auto, max_depth=460 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=500, total=   7.6s
[CV] n_estimators=1400, max_features=auto, max_depth=460 .............
[CV]  n_estimators=1400, max_features=auto, max_depth=460, total=  10.8s
[CV] n_estimators=1400, max_features=auto, max_depth=460 .............
[CV]  n_estimators=1400, max_features=auto, max_depth=460, total=  10.9s
[CV] n_estimators=1600, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1400, max_features=auto, max_depth=460, total=  10.1s
[CV] n_estimators=1600, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1600, max_features=auto, max_depth=500, total=  11.9s
[CV] n_estimators=1600, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1600, max_features=auto, max_depth=500, total=  12.2s
[CV] n_estimators=800, max_features=sqrt, max_depth=460 .......

[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 13.8min


[CV]  n_estimators=800, max_features=sqrt, max_depth=460, total=   5.7s
[CV] n_estimators=800, max_features=sqrt, max_depth=460 ..............
[CV]  n_estimators=1600, max_features=auto, max_depth=500, total=  11.8s
[CV] n_estimators=800, max_features=sqrt, max_depth=460 ..............
[CV]  n_estimators=800, max_features=sqrt, max_depth=460, total=   6.3s
[CV] n_estimators=1000, max_features=auto, max_depth=None ............
[CV]  n_estimators=800, max_features=sqrt, max_depth=460, total=   6.3s
[CV] n_estimators=1000, max_features=auto, max_depth=None ............
[CV]  n_estimators=1000, max_features=auto, max_depth=None, total=   7.8s
[CV] n_estimators=1000, max_features=auto, max_depth=None ............
[CV]  n_estimators=1000, max_features=auto, max_depth=None, total=   7.5s
[CV] n_estimators=2000, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=None, total=   7.8s
[CV] n_estimators=2000, max_features=auto, max_depth=500 ......

[CV]  n_estimators=1600, max_features=sqrt, max_depth=420, total=  12.1s
[CV] n_estimators=2000, max_features=auto, max_depth=140 .............
[CV]  n_estimators=2000, max_features=auto, max_depth=140, total=  15.0s
[CV] n_estimators=2000, max_features=auto, max_depth=140 .............
[CV]  n_estimators=2000, max_features=auto, max_depth=140, total=  15.1s
[CV] n_estimators=1000, max_features=auto, max_depth=340 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=340, total=   7.5s
[CV] n_estimators=1000, max_features=auto, max_depth=340 .............
[CV]  n_estimators=2000, max_features=auto, max_depth=140, total=  15.0s
[CV] n_estimators=1000, max_features=auto, max_depth=340 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=340, total=   7.4s
[CV] n_estimators=1200, max_features=sqrt, max_depth=220 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=340, total=   7.5s
[CV] n_estimators=1200, max_features=sqrt, max_depth=220 ......

[CV] n_estimators=1800, max_features=auto, max_depth=380 .............
[CV]  n_estimators=1800, max_features=auto, max_depth=380, total=  13.2s
[CV] n_estimators=1800, max_features=auto, max_depth=380 .............
[CV]  n_estimators=1800, max_features=auto, max_depth=380, total=  13.5s
[CV] n_estimators=400, max_features=auto, max_depth=420 ..............
[CV]  n_estimators=400, max_features=auto, max_depth=420, total=   2.8s
[CV] n_estimators=400, max_features=auto, max_depth=420 ..............
[CV]  n_estimators=400, max_features=auto, max_depth=420, total=   2.9s
[CV] n_estimators=400, max_features=auto, max_depth=420 ..............
[CV]  n_estimators=400, max_features=auto, max_depth=420, total=   2.9s
[CV] n_estimators=400, max_features=sqrt, max_depth=180 ..............
[CV]  n_estimators=1800, max_features=auto, max_depth=380, total=  13.3s
[CV] n_estimators=400, max_features=sqrt, max_depth=180 ..............
[CV]  n_estimators=400, max_features=sqrt, max_depth=180, total=   2

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 24.8min finished


{'n_estimators': 800, 'max_features': 'sqrt', 'max_depth': 300}


In [None]:
#Our results were: ‘n_estimators’ = 600; ‘max_features’ = ‘sqrt’; ‘max_depth’: 300
#Let us plug these back into the model to see if it improved our performance.
rfc = RandomForestClassifier(n_estimators=600, max_depth=300, max_features='sqrt')
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())