In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, fbeta_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import time

In [2]:
# Dataset source: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset

In [3]:
# Read the diabetes-prediction CSV (expected in the notebook's working
# directory) into the frame that the following cells transform step by step.
diabetes_df = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")
diabetes_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [4]:
# One-hot encode the categorical `gender` column into indicator columns
# (Female / Male / Other for this dataset).
# dtype=int is requested explicitly: pandas >= 2.0 returns boolean dummies by
# default, and boolean columns mixed with float features turn a single sliced
# row into an object-dtype array when it is later handed to the model.
gender_dummies = pd.get_dummies(diabetes_df['gender'], dtype=int)
gender_dummies

Unnamed: 0,Female,Male,Other
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,0,1,0
...,...,...,...
99995,1,0,0
99996,1,0,0
99997,0,1,0
99998,1,0,0


In [5]:
# Append the three gender indicator columns (in Female, Male, Other order) to
# the main frame, then remove the original categorical `gender` column.
for category in ('Female', 'Male', 'Other'):
    diabetes_df[category] = gender_dummies[category]
diabetes_df = diabetes_df.drop(columns=['gender'])
diabetes_df

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,Female,Male,Other
0,80.0,0,1,never,25.19,6.6,140,0,1,0,0
1,54.0,0,0,No Info,27.32,6.6,80,0,1,0,0
2,28.0,0,0,never,27.32,5.7,158,0,0,1,0
3,36.0,0,0,current,23.45,5.0,155,0,1,0,0
4,76.0,1,1,current,20.14,4.8,155,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,No Info,27.32,6.2,90,0,1,0,0
99996,2.0,0,0,No Info,17.37,6.5,100,0,1,0,0
99997,66.0,0,0,former,27.83,5.7,155,0,0,1,0
99998,24.0,0,0,never,35.42,4.0,100,0,1,0,0


In [6]:
# Collapse the 'ever' and 'not current' smoking categories into 'former'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on diabetes_df['smoking_history']: that chained
# in-place call operates on an intermediate Series, which pandas warns about
# and which leaves the frame unchanged once Copy-on-Write is active.
diabetes_df['smoking_history'] = diabetes_df['smoking_history'].replace(
    ['ever', 'not current'], 'former'
)
diabetes_df

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,Female,Male,Other
0,80.0,0,1,never,25.19,6.6,140,0,1,0,0
1,54.0,0,0,No Info,27.32,6.6,80,0,1,0,0
2,28.0,0,0,never,27.32,5.7,158,0,0,1,0
3,36.0,0,0,current,23.45,5.0,155,0,1,0,0
4,76.0,1,1,current,20.14,4.8,155,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,No Info,27.32,6.2,90,0,1,0,0
99996,2.0,0,0,No Info,17.37,6.5,100,0,1,0,0
99997,66.0,0,0,former,27.83,5.7,155,0,0,1,0
99998,24.0,0,0,never,35.42,4.0,100,0,1,0,0


In [7]:
# Integer-encode the smoking categories.
# LabelEncoder works on a 1-D array of labels, so the column is passed as-is;
# reshaping it to (-1, 1) first is what produced the column_or_1d
# DataConversionWarning.
# NOTE(review): in the displayed rows the codes come out as 'No Info' -> 0,
# 'current' -> 1, 'former' -> 2, 'never' -> 3. That ordering is arbitrary for
# the model; consider one-hot encoding if ordinal meaning is not intended.
encoder = LabelEncoder()
diabetes_df['smoking_history'] = encoder.fit_transform(diabetes_df['smoking_history'])
diabetes_df

  y = column_or_1d(y, warn=True)


Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,Female,Male,Other
0,80.0,0,1,3,25.19,6.6,140,0,1,0,0
1,54.0,0,0,0,27.32,6.6,80,0,1,0,0
2,28.0,0,0,3,27.32,5.7,158,0,0,1,0
3,36.0,0,0,1,23.45,5.0,155,0,1,0,0
4,76.0,1,1,1,20.14,4.8,155,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,0,27.32,6.2,90,0,1,0,0
99996,2.0,0,0,0,17.37,6.5,100,0,1,0,0
99997,66.0,0,0,2,27.83,5.7,155,0,0,1,0
99998,24.0,0,0,3,35.42,4.0,100,0,1,0,0


In [8]:
# Convert `age` from float to 64-bit integers using a column -> dtype mapping.
# NOTE(review): an integer cast drops any fractional part of an age; confirm
# the raw data has no fractional ages that should be preserved.
diabetes_df = diabetes_df.astype({'age': 'int64'})
diabetes_df

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,Female,Male,Other
0,80,0,1,3,25.19,6.6,140,0,1,0,0
1,54,0,0,0,27.32,6.6,80,0,1,0,0
2,28,0,0,3,27.32,5.7,158,0,0,1,0
3,36,0,0,1,23.45,5.0,155,0,1,0,0
4,76,1,1,1,20.14,4.8,155,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,80,0,0,0,27.32,6.2,90,0,1,0,0
99996,2,0,0,0,17.37,6.5,100,0,1,0,0
99997,66,0,0,2,27.83,5.7,155,0,0,1,0
99998,24,0,0,3,35.42,4.0,100,0,1,0,0


In [9]:
# Rescale the continuous features to the [0, 1] range with min-max scaling.
# The scaler is fitted ONCE on all four columns instead of being re-fitted per
# column. MinMaxScaler scales every feature independently, so the resulting
# values are the same, but the fitted object now keeps the min/max of every
# feature (previously only the last column's) and can be reused to transform
# new raw inputs in exactly the same way.
SCALED_COLUMNS = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(diabetes_df[SCALED_COLUMNS].to_numpy())
for position, column in enumerate(SCALED_COLUMNS):
    # Replace each column with its scaled float values, one column at a time.
    diabetes_df[column] = scaled_values[:, position]
diabetes_df

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,Female,Male,Other
0,1.0000,0,1,3,0.177171,0.563636,0.272727,0,1,0,0
1,0.6750,0,0,0,0.202031,0.563636,0.000000,0,1,0,0
2,0.3500,0,0,3,0.202031,0.400000,0.354545,0,0,1,0
3,0.4500,0,0,1,0.156863,0.272727,0.340909,0,1,0,0
4,0.9500,1,1,1,0.118231,0.236364,0.340909,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,1.0000,0,0,0,0.202031,0.490909,0.045455,0,1,0,0
99996,0.0250,0,0,0,0.085901,0.545455,0.090909,0,1,0,0
99997,0.8250,0,0,2,0.207983,0.400000,0.340909,0,0,1,0
99998,0.3000,0,0,3,0.296569,0.090909,0.090909,0,1,0,0


In [10]:
# Feature matrix: every column of the preprocessed frame except the target.
diabetes_X = diabetes_df.drop(columns=['diabetes'])
diabetes_X

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,Female,Male,Other
0,1.0000,0,1,3,0.177171,0.563636,0.272727,1,0,0
1,0.6750,0,0,0,0.202031,0.563636,0.000000,1,0,0
2,0.3500,0,0,3,0.202031,0.400000,0.354545,0,1,0
3,0.4500,0,0,1,0.156863,0.272727,0.340909,1,0,0
4,0.9500,1,1,1,0.118231,0.236364,0.340909,0,1,0
...,...,...,...,...,...,...,...,...,...,...
99995,1.0000,0,0,0,0.202031,0.490909,0.045455,1,0,0
99996,0.0250,0,0,0,0.085901,0.545455,0.090909,1,0,0
99997,0.8250,0,0,2,0.207983,0.400000,0.340909,0,1,0
99998,0.3000,0,0,3,0.296569,0.090909,0.090909,1,0,0


In [11]:
# Target vector: the `diabetes` label column of the preprocessed frame.
diabetes_y = diabetes_df.loc[:, 'diabetes']
diabetes_y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64

In [12]:
# Display the feature matrix again as a sanity check before the demos.
diabetes_X

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,Female,Male,Other
0,1.0000,0,1,3,0.177171,0.563636,0.272727,1,0,0
1,0.6750,0,0,0,0.202031,0.563636,0.000000,1,0,0
2,0.3500,0,0,3,0.202031,0.400000,0.354545,0,1,0
3,0.4500,0,0,1,0.156863,0.272727,0.340909,1,0,0
4,0.9500,1,1,1,0.118231,0.236364,0.340909,0,1,0
...,...,...,...,...,...,...,...,...,...,...
99995,1.0000,0,0,0,0.202031,0.490909,0.045455,1,0,0
99996,0.0250,0,0,0,0.085901,0.545455,0.090909,1,0,0
99997,0.8250,0,0,2,0.207983,0.400000,0.340909,0,1,0
99998,0.3000,0,0,3,0.296569,0.090909,0.090909,1,0,0


In [13]:
# Display the target labels again as a sanity check before the demos.
diabetes_y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64

## Model Effectiveness Demo

This section showcases the precision and effectiveness of the model.

In [None]:
pip install catboost

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
import pickle
import random

# Load the previously trained classifier and score one randomly chosen row of
# the preprocessed feature matrix, reporting the prediction, the true label,
# the positive-class probability and the time spent predicting.

# Reset first so that a failed load cannot leave a stale model from an earlier
# run bound to this name.
best_model = None
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# a best_model.pkl that was produced by a trusted training run.
with open('best_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

row = random.randint(0, len(diabetes_X) - 1)

# Build the (1, n_features) model input once and reuse it for both calls, so
# the timing below covers the predictions only.
sample_features = diabetes_X.iloc[row, :]
model_input = np.array(sample_features).reshape(1, -1)

# perf_counter() is a monotonic high-resolution clock intended for measuring
# short durations; time.time() can jump and has coarser resolution.
start = time.perf_counter()
predict_val = best_model.predict(model_input)
predict_proba_val = best_model.predict_proba(model_input)
end = time.perf_counter()

prediction_time = end - start

print("Values: \n" + str(sample_features))
print("Predicted Value: " + str(predict_val[0]))
print("Actual Value: " + str(diabetes_y.iloc[row]))
# Column 1 of predict_proba is reported as the risk probability.
print("Risk Probability: " + str(predict_proba_val[0][1]))
print("Prediction time: " + str(prediction_time))

Values: 
age                    0.137500
hypertension           0.000000
heart_disease          0.000000
smoking_history        2.000000
bmi                    0.176237
HbA1c_level            0.472727
blood_glucose_level    0.209091
Female                 0.000000
Male                   1.000000
Other                  0.000000
Name: 87210, dtype: float64
Predicted Value: 0
Actual Value: 0
Risk Probability: 0.0003089736933126809
Prediction time: 0.0031821727752685547


## Model Production/Application Demo

This section showcases the application of the model to various scenarios.

In [23]:
# Collect one patient's features interactively and score them with best_model.
# NOTE(review): the answers are passed to the model exactly as typed. The
# training frame was min-max scaled (age, bmi, HbA1c_level,
# blood_glucose_level) and smoking_history was label-encoded, so the values
# presumably have to be entered in that already-transformed form; confirm, or
# apply the same preprocessing here before predicting.
age = input("age = ")
hypertension = input("hypertension = ")
heart_disease = input("heart_disease = ")
smoking_history = input("smoking_history = ")
bmi = input("bmi = ")
HbA1c_level = input("HbA1c_level = ")
blood_glucose_level = input("blood_glucose_level = ")
gender = input("Input gender (Female, Male, Other) = ")

# One-hot encode the gender answer. Letter case and surrounding whitespace are
# ignored so that e.g. " Female " is not silently counted as "Other"; any
# answer other than female/male still falls into the "Other" column.
gender_key = gender.strip().lower()
female = int(gender_key == "female")
male = int(gender_key == "male")
other = int(gender_key not in ("female", "male"))

# The order matches the columns of diabetes_X. Converting to float64 raises
# ValueError if any answer is not numeric.
values = np.array([age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level, female, male, other], dtype=np.float64)
model_input = values.reshape(1, -1)

# perf_counter() is a monotonic high-resolution clock intended for measuring
# short durations; time.time() can jump and has coarser resolution.
start = time.perf_counter()
predict_val = best_model.predict(model_input)
predict_proba_val = best_model.predict_proba(model_input)
end = time.perf_counter()

prediction_time = end - start

print("Predicted Value: " + str(predict_val[0]))
# Column 1 of predict_proba is reported as the risk probability.
print("Risk Probability: " + str(predict_proba_val[0][1]))
print("Prediction time: " + str(prediction_time))

age = 25
hypertension = 0
heart_disease = 0
smoking_history = 0
bmi = 24.7
HbA1c_level = 0.1
blood_glucose_level = 0.5
Input gender (Female, Male, Other) = Female
Predicted Value: 0
Risk Probability: 5.54480853536548e-07
Prediction time: 0.0032715797424316406
