In [1]:
import datetime

import os
import pickle
import sys

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

## Config

In [2]:
root_dir = "./../"
data_path = os.path.join(root_dir, 'data')
diabetes_dataset_path = os.path.join(data_path, 'diabetes.csv')
models_path = os.path.join(root_dir, 'models')

## Load the dataset

In [3]:
df_diabetes = pd.read_csv(diabetes_dataset_path)
print(f"Features: {list(df_diabetes.columns[:-1])} Predict: {df_diabetes.columns[-1]}")

Features: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'] Predict: Outcome


In [4]:
# count the number of positive and negative labels
counts = df_diabetes["Outcome"].value_counts()

# print the counts
print("Positive labels (Outcome=1):", counts[1])
print("Negative labels (Outcome=0):", counts[0])

Positive labels (Outcome=1): 268
Negative labels (Outcome=0): 500


## Logistic Regression

In [5]:
# splitting the dataset into training and testing sets
X = df_diabetes.drop("Outcome", axis=1)
y = df_diabetes["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# normalizing the dataset
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# creating the Logistic Regression model
lr_model = LogisticRegression()

# fit to the train data
lr_model.fit(X_train, y_train)

# making predictions on the test data
y_pred = lr_model.predict(X_test)

# evaluating the model performance
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion matrix:\n{cm}")

Accuracy: 0.7532467532467533
Confusion matrix:
[[84 15]
 [23 32]]


## Decision Trees

In [6]:
# splitting the dataset into training and testing sets
X = df_diabetes.drop("Outcome", axis=1)
y = df_diabetes["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# creating the Decision Tree model
dt_model = DecisionTreeClassifier()

# fitting the model on the training data and making predictions on the test data
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

# evaluating the model performance
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print(f"Confusion matrix:\n{cm}")

Accuracy: 0.7532467532467533
Confusion matrix:
[[77 22]
 [16 39]]


## Random Forest

In [7]:
# splitting the dataset into training and testing sets
X = df_diabetes.drop("Outcome", axis=1)
y = df_diabetes["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# creating the Random Forest model
rf_model = RandomForestClassifier()

# fitting the model on the training data and making predictions on the test data
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# evaluating the model performance
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print(f"Confusion matrix:\n{cm}")

Accuracy: 0.7662337662337663
Confusion matrix:
[[80 19]
 [17 38]]


## SVM

In [8]:
# splitting the dataset into training and testing sets
X = df_diabetes.drop("Outcome", axis=1)
y = df_diabetes["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# creating the SVM model
svm_model = SVC()

# fitting the model on the training data and making predictions on the test data
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# evaluating the model performance
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print(f"Confusion matrix:\n{cm}")

Accuracy: 0.7662337662337663
Confusion matrix:
[[87 12]
 [24 31]]


## Gradient Boosting

In [9]:
# splitting the dataset into training and testing sets
X = df_diabetes.drop("Outcome", axis=1)
y = df_diabetes["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# creating the Gradient Boosting model
gb_model = GradientBoostingClassifier()

# fitting the model on the training data and making predictions on the test data
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

# evaluating the model performance
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print(f"Confusion matrix:\n{cm}")

Accuracy: 0.7467532467532467
Confusion matrix:
[[78 21]
 [18 37]]


## Neural Networks

In [10]:
# splitting the dataset into training and testing sets
X = df_diabetes.drop("Outcome", axis=1)
y = df_diabetes["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# creating the neural network model
nn_model = MLPClassifier(hidden_layer_sizes=(16,8,4), activation='relu', solver='adam', max_iter=200)

# fitting the model on the training data and making predictions on the test data
nn_model.fit(X_train, y_train)
y_pred = nn_model.predict(X_test)

# evaluating the model performance
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print(f"Confusion matrix:\n{cm}")

Accuracy: 0.6883116883116883
Confusion matrix:
[[89 10]
 [38 17]]


## Dump the models

In [11]:
# create folder with name_datestamp
name = "anirudh"
folder_name = datetime.datetime.now().date().strftime("%m%d%Y")
folder_path = os.path.join(models_path, name + "_" + folder_name)
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
                      

with open(os.path.join(folder_path, 'lr_model.pkl'), 'wb') as file:
    pickle.dump(lr_model, file)

# Dump decision tree model to file
with open(os.path.join(folder_path, 'dt_model.pkl'), 'wb') as file:
    pickle.dump(dt_model, file)

# Dump random forest model to file
with open(os.path.join(folder_path, 'rf_model.pkl'), 'wb') as file:
    pickle.dump(rf_model, file)

# Dump SVM model to file
with open(os.path.join(folder_path, 'svm_model.pkl'), 'wb') as file:
    pickle.dump(svm_model, file)

# Dump gradient boosting model to file
with open(os.path.join(folder_path, 'gb_model.pkl'), 'wb') as file:
    pickle.dump(gb_model, file)

# Dump neural network model to file
with open(os.path.join(folder_path, 'nn_model.pkl'), 'wb') as file:
    pickle.dump(nn_model, file)