In [1]:
import pandas as pd

In [5]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Let’s create our model. We are trying to predict whether a patient has diabetes. This coincides with the ‘class’ column, which will be our independent variable. We’ll use all the other columns as features for our model.

In [6]:
X = df.drop(['Outcome'], axis =1)
y = df['Outcome']

In [8]:
from sklearn.model_selection import train_test_split


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.3, random_state =50)

Now, create a random forest model-

In [12]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

#### Evaluting performance

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

In [14]:
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring ='roc_auc')

Print it out all the result---

In [15]:
print("confusion matrix ")
print(confusion_matrix(y_test, y_pred))
print("\n")

print("Classification Report")
print(classification_report(y_test, y_pred))
print("\n")

print("All AUC score")
print(rfc_cv_score)
print("\n")

print("Mean AUC score ", rfc_cv_score.mean())

confusion matrix 
[[129  19]
 [ 41  42]]


Classification Report
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       148
           1       0.69      0.51      0.58        83

    accuracy                           0.74       231
   macro avg       0.72      0.69      0.70       231
weighted avg       0.73      0.74      0.73       231



All AUC score
[0.79074074 0.8162963  0.83148148 0.72259259 0.80333333 0.83925926
 0.87111111 0.90407407 0.79153846 0.85923077]


Mean AUC score  0.8229658119658121


### Hyperparameter Tuning

#### We’ll use RandomizedSearchCV from sklearn to optimize our hyperparamaters.

In [17]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
#number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start =200, stop =2000, num =10)]

# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int(x) for x in np.linspace(100,500, num=11)]
max_depth.append(None)

# Create random grid
random_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth}


# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions=random_grid, n_iter=100, cv=3, verbose =2, 
                         random_state=42, n_jobs=-1)

# fit the model
rfc_random.fit(X_train, y_train)

# print the result
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.8min finished


{'n_estimators': 1400, 'max_features': 'sqrt', 'max_depth': 260}


In [19]:
rfc_best = RandomForestClassifier(n_estimators= 1400, max_depth = 260, max_features ='sqrt')
rfc_best.fit(X_train, y_train)

y_pred_best = rfc_best.predict(X_test)

rfc_best_cv = cross_val_score(rfc_best, X, y, cv=10, scoring="roc_auc")


print("confusion matrix ")
print(confusion_matrix(y_test, y_pred_best))
print("\n")

print("Classification Report")
print(classification_report(y_test, y_pred_best))
print("\n")

print("All AUC score")
print(rfc_best_cv)
print("\n")

print("Mean AUC score ", rfc_best_cv.mean())

confusion matrix 
[[129  19]
 [ 37  46]]


Classification Report
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       148
           1       0.71      0.55      0.62        83

    accuracy                           0.76       231
   macro avg       0.74      0.71      0.72       231
weighted avg       0.75      0.76      0.75       231



All AUC score
[0.77703704 0.83185185 0.82925926 0.73185185 0.81185185 0.85703704
 0.86592593 0.91037037 0.80769231 0.85730769]


Mean AUC score  0.8280185185185184
