# In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import f1_score
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score



# Load the diabetes data; the path is relative to the current working directory.
pima_dataset = pd.read_csv('data/diabetes.csv')
pima_dataset.head()  # preview; only rendered when it is the last expression of a notebook cell

# Separate the target column 'Outcome' from the predictor columns.
y = pima_dataset['Outcome']
X = pima_dataset.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions (printed below) comparable between the two partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(X_train.shape, X_test.shape)

# Relative class frequencies: training partition first, then test partition.
for target_split in (y_train, y_test):
    print(target_split.value_counts(normalize=True))



# Base estimator; the fixed seed makes the fitted forests reproducible.
model = RandomForestClassifier(random_state=42)

# Number of cross-validation folds
k = 5

# Hyperparameter grid: every combination is evaluated (2 x 2 = 4 candidates).
parameters = {'n_estimators': [100, 500], 'max_depth': [3, 5]}

# Exhaustive search scored by ROC AUC. refit=True is the default and is spelled
# out here because the code below relies on it: once the search finishes, the
# best configuration is retrained on all of X_train / y_train.
grid_search = GridSearchCV(model, parameters, cv=k, scoring="roc_auc", refit=True)
grid_search.fit(X_train, y_train)

# Per-candidate cross-validation results, best-ranked first. Kept in a name so
# the table can be inspected instead of being built and discarded.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")

# best_estimator_ was already fitted on the full training set by the refit
# step, so calling fit() again would only repeat the same training.
best_model = grid_search.best_estimator_


# NOTE: the display objects get their own names. Binding them to `plt` would
# overwrite the matplotlib.pyplot alias imported at the top of the file and
# break any later plt.* call.

## Training data confusion matrix
y_pred_train = best_model.predict(X_train)
cm = confusion_matrix(y_train, y_pred_train, labels=best_model.classes_)
# plot() draws the matrix and returns the display object itself.
disp_train = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=best_model.classes_
).plot(cmap="Blues")

## Test data confusion matrix
y_pred_test = best_model.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred_test, labels=best_model.classes_)
disp_test = ConfusionMatrixDisplay(
    confusion_matrix=cm_test, display_labels=best_model.classes_
).plot(cmap="Blues")

# Accuracy computed by hand from `cm`, the confusion matrix of the training
# predictions (so this is the training accuracy).
# scikit-learn lays the matrix out with true labels on the rows and predicted
# labels on the columns; the class at index 1 is treated as the positive one.
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]

correct = TN + TP            # predictions on the diagonal
total = correct + FP + FN    # all four cells
accuracy = correct / total
print(accuracy)

# Accuracy on each partition, as reported by the classifier's own score().
accuracy_train = best_model.score(X_train, y_train)
accuracy_test = best_model.score(X_test, y_test)

# Two print arguments on purpose: the default separator produces the space
# before the comma in the output line.
print(f'Training Dataset Accuracy: {accuracy_train}', f', Test Dataset Accuracy: {accuracy_test}')

# ROC AUC is computed from scores rather than hard labels. Column 1 of
# predict_proba holds the predicted probability of best_model.classes_[1].
proba_train = best_model.predict_proba(X_train)
auc_train = roc_auc_score(y_train, proba_train[:, 1])

# y_pred ends up holding the test-set probabilities of the positive class.
y_pred = best_model.predict_proba(X_test)[:, 1]
auc_test = roc_auc_score(y_test, y_pred)

print(f'Training Dataset ROC AUC Score: {auc_train}', f', Test Dataset ROC AUC Score: {auc_test}')

