In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import VotingClassifier, StackingClassifier

import warnings
warnings.filterwarnings("ignore")

In [10]:
# Load the merged diabetes dataset from the working directory and display it.
df = pd.read_csv('diabetes_merged_dataset.csv')
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes,HbA1c_level
0,Female,0.08,0,0,No Info,11.88,5.7,80,0,5.7
1,Female,0.08,0,0,No Info,12.22,3.5,130,0,3.5
2,Female,0.08,0,0,No Info,12.50,4.5,155,0,4.5
3,Female,0.08,0,0,No Info,12.74,3.5,140,0,3.5
4,Female,0.08,0,0,No Info,12.77,4.5,160,0,4.5
...,...,...,...,...,...,...,...,...,...,...
154453,Other,47.00,0,0,never,36.76,6.6,90,0,6.6
154454,Other,47.00,0,0,never,36.76,3.5,200,0,3.5
154455,Other,52.00,0,0,not current,31.63,6.6,159,0,6.6
154456,Other,53.00,0,0,No Info,27.32,6.6,160,0,6.6


In [11]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include='all')

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes,HbA1c_level
count,154458,154458.0,154458.0,154458.0,154458,154458.0,154458.0,154458.0,154458.0,154458.0
unique,3,,,,6,,,,,
top,Female,,,,No Info,,,,,
freq,91657,,,,73979,,,,,
mean,,41.452624,0.051283,0.027535,,27.303686,5.48595,136.578863,0.05731,5.48595
std,,22.256107,0.220574,0.163637,,5.362304,1.040365,38.875502,0.232435,1.040365
min,,0.08,0.0,0.0,,10.01,3.5,80.0,0.0,3.5
25%,,24.0,0.0,0.0,,26.7,4.8,100.0,0.0,4.8
50%,,41.0,0.0,0.0,,27.32,5.8,140.0,0.0,5.8
75%,,58.0,0.0,0.0,,27.32,6.2,159.0,0.0,6.2


In [12]:
# 'hbA1c_level' duplicates 'HbA1c_level' (both show identical summary statistics),
# so only one of them is kept.
# Reassign instead of mutating in place and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['hbA1c_level'], errors='ignore')
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,blood_glucose_level,diabetes,HbA1c_level
0,Female,0.08,0,0,No Info,11.88,80,0,5.7
1,Female,0.08,0,0,No Info,12.22,130,0,3.5
2,Female,0.08,0,0,No Info,12.50,155,0,4.5
3,Female,0.08,0,0,No Info,12.74,140,0,3.5
4,Female,0.08,0,0,No Info,12.77,160,0,4.5
...,...,...,...,...,...,...,...,...,...
154453,Other,47.00,0,0,never,36.76,90,0,6.6
154454,Other,47.00,0,0,never,36.76,200,0,3.5
154455,Other,52.00,0,0,not current,31.63,159,0,6.6
154456,Other,53.00,0,0,No Info,27.32,160,0,6.6


In [13]:
# Count missing values per column.
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
blood_glucose_level    0
diabetes               0
HbA1c_level            0
dtype: int64

In [14]:
# Separate the label column from the predictor columns.
target_column = 'diabetes'
y = df[target_column]
X = df.drop(columns=[target_column])

In [15]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [20]:
# Route columns by dtype: numeric columns are mean-imputed and standardised,
# object (text) columns are mode-imputed and one-hot encoded.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_train.select_dtypes(include=['object']).columns)

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories not seen during fit are encoded as all zeros instead of raising.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

### Hyperparameter tuning and cross-validation for each model


In [21]:
# 1. RandomForestClassifier
# Candidate settings; the 'classifier__' prefix addresses the pipeline's final step.
rf_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
}
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
# Exhaustive search over the grid with shuffled 5-fold cross-validation on all cores.
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)
# Keep the pipeline that the search refit with the winning combination.
best_rf = rf_grid.best_estimator_

In [24]:
# Score the tuned RandomForestClassifier on the held-out test split.
rf_y_pred = best_rf.predict(X_test)

rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_y_pred)
rf_confusion = confusion_matrix(y_true=y_test, y_pred=rf_y_pred)
rf_classification_report = classification_report(y_true=y_test, y_pred=rf_y_pred)

print("RandomForestClassifier Accuracy:", rf_accuracy)
print("RandomForestClassifier Confusion Matrix:\n", rf_confusion)
print("RandomForestClassifier Classification Report:\n", rf_classification_report)

RandomForestClassifier Accuracy: 0.9806098666321378
RandomForestClassifier Confusion Matrix:
 [[29121     1]
 [  598  1172]]
RandomForestClassifier Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     29122
           1       1.00      0.66      0.80      1770

    accuracy                           0.98     30892
   macro avg       0.99      0.83      0.89     30892
weighted avg       0.98      0.98      0.98     30892



In [29]:
# 2. XGBClassifier
# Candidate settings; the 'classifier__' prefix addresses the pipeline's final step.
xgb_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 6],
    'classifier__learning_rate': [0.01, 0.1],
}
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)),
])
# Exhaustive search over the grid with shuffled 5-fold cross-validation on all cores.
xgb_grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)
# Keep the pipeline that the search refit with the winning combination.
best_xgb = xgb_grid.best_estimator_

# Score the tuned XGBClassifier on the held-out test split.
xgb_y_pred = best_xgb.predict(X_test)

xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_y_pred)
xgb_confusion = confusion_matrix(y_true=y_test, y_pred=xgb_y_pred)
xgb_classification_report = classification_report(y_true=y_test, y_pred=xgb_y_pred)

print("XGBClassifier Accuracy:", xgb_accuracy)
print("XGBClassifier Confusion Matrix:\n", xgb_confusion)
print("XGBClassifier Classification Report:\n", xgb_classification_report)

XGBClassifier Accuracy: 0.9808688333549139
XGBClassifier Confusion Matrix:
 [[29116     6]
 [  585  1185]]
XGBClassifier Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     29122
           1       0.99      0.67      0.80      1770

    accuracy                           0.98     30892
   macro avg       0.99      0.83      0.90     30892
weighted avg       0.98      0.98      0.98     30892



In [30]:
# 3. LGBMClassifier
# Candidate settings; the 'classifier__' prefix addresses the pipeline's final step.
lgbm_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [-1, 10],
    'classifier__learning_rate': [0.01, 0.1],
}
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(random_state=42)),
])
# Exhaustive search over the grid with shuffled 5-fold cross-validation on all cores.
lgbm_grid = GridSearchCV(
    estimator=lgbm_pipeline,
    param_grid=lgbm_param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)
lgbm_grid.fit(X_train, y_train)
# Keep the pipeline that the search refit with the winning combination.
best_lgbm = lgbm_grid.best_estimator_

# Score the tuned LGBMClassifier on the held-out test split.
lgbm_y_pred = best_lgbm.predict(X_test)

lgbm_accuracy = accuracy_score(y_true=y_test, y_pred=lgbm_y_pred)
lgbm_confusion = confusion_matrix(y_true=y_test, y_pred=lgbm_y_pred)
lgbm_classification_report = classification_report(y_true=y_test, y_pred=lgbm_y_pred)

print("LGBMClassifier Accuracy:", lgbm_accuracy)
print("LGBMClassifier Confusion Matrix:\n", lgbm_confusion)
print("LGBMClassifier Classification Report:\n", lgbm_classification_report)

[LightGBM] [Info] Number of positive: 7082, number of negative: 116484
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 123566, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057314 -> initscore=-2.800198
[LightGBM] [Info] Start training from score -2.800198
LGBMClassifier Accuracy: 0.9807717208338729
LGBMClassifier Confusion Matrix:
 [[29118     4]
 [  590  1180]]
LGBMClassifier Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     29122
           1       1.00      0.67      0.80      1770

    accuracy                           0.98     30892
   macro avg       0.99      0.83      0.89     30892
weighted avg       0.98      0.98      0.98     30892



In [31]:
# 4. CatBoostClassifier
# Candidate settings; the 'classifier__' prefix addresses the pipeline's final step.
cat_param_grid = {
    'classifier__iterations': [100, 200],
    'classifier__depth': [3, 6],
    'classifier__learning_rate': [0.01, 0.1],
}
cat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(random_state=42, verbose=0)),
])
# Exhaustive search over the grid with shuffled 5-fold cross-validation on all cores.
cat_grid = GridSearchCV(
    estimator=cat_pipeline,
    param_grid=cat_param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)
cat_grid.fit(X_train, y_train)
# Keep the pipeline that the search refit with the winning combination.
best_cat = cat_grid.best_estimator_

# Score the tuned CatBoostClassifier on the held-out test split.
cat_y_pred = best_cat.predict(X_test)

cat_accuracy = accuracy_score(y_true=y_test, y_pred=cat_y_pred)
cat_confusion = confusion_matrix(y_true=y_test, y_pred=cat_y_pred)
cat_classification_report = classification_report(y_true=y_test, y_pred=cat_y_pred)

print("CatBoostClassifier Accuracy:", cat_accuracy)
print("CatBoostClassifier Confusion Matrix:\n", cat_confusion)
print("CatBoostClassifier Classification Report:\n", cat_classification_report)

CatBoostClassifier Accuracy: 0.981063058396996
CatBoostClassifier Confusion Matrix:
 [[29110    12]
 [  573  1197]]
CatBoostClassifier Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     29122
           1       0.99      0.68      0.80      1770

    accuracy                           0.98     30892
   macro avg       0.99      0.84      0.90     30892
weighted avg       0.98      0.98      0.98     30892



In [32]:
# Pair each tuned pipeline with a short label, giving the list of
# (name, estimator) tuples that is passed to the ensemble estimators.
best_models = list(zip(
    ('rf', 'xgb', 'lgbm', 'cat'),
    (best_rf, best_xgb, best_lgbm, best_cat),
))

In [33]:
# Soft-voting ensemble built from the tuned pipelines.
voting_clf = VotingClassifier(voting='soft', estimators=best_models)
voting_clf.fit(X_train, y_train)

# Score the ensemble on the held-out test split.
y_pred_voting = voting_clf.predict(X_test)
voting_accuracy = accuracy_score(y_test, y_pred_voting)
voting_report = classification_report(y_test, y_pred_voting)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Voting Classifier Classification Report:\n", voting_report)

[LightGBM] [Info] Number of positive: 7082, number of negative: 116484
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 123566, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057314 -> initscore=-2.800198
[LightGBM] [Info] Start training from score -2.800198
Voting Classifier Accuracy: 0.9807393499935259
Voting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     29122
           1       1.00      0.67      0.80      1770

    accuracy                           0.98     30892
   macro avg       0.99      0.83      0.89     30892
weighted avg       0.98      0.98      0.98     30892



In [37]:
# Stacking ensemble: the tuned pipelines feed an XGBoost meta-learner.
#
# xgb_grid.param_grid holds *lists of candidates* keyed with the pipeline prefix
# ('classifier__n_estimators': [100, 200], ...). Unpacking it into XGBClassifier
# sets no real hyperparameter. Take the winning combination from the search
# and strip the 'classifier__' prefix so the keys are genuine XGBClassifier arguments.
meta_xgb_params = {
    name.split('__', 1)[-1]: value
    for name, value in xgb_grid.best_params_.items()
}
stacking_clf = StackingClassifier(
    estimators=best_models,
    final_estimator=XGBClassifier(random_state=42, **meta_xgb_params),
)
stacking_clf.fit(X_train, y_train)

# Evaluate the stacking classifier on the held-out test split.
y_pred_stacking = stacking_clf.predict(X_test)
print("Stacking Classifier Accuracy:", accuracy_score(y_test, y_pred_stacking))
print("Stacking Classifier Classification Report:\n", classification_report(y_test, y_pred_stacking))

[LightGBM] [Info] Number of positive: 7082, number of negative: 116484
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 123566, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057314 -> initscore=-2.800198
[LightGBM] [Info] Start training from score -2.800198
[LightGBM] [Info] Number of positive: 5665, number of negative: 93187
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 98852, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057308 -> initscore=-2.800301
[Lig

In [38]:
# The stacking ensemble is the model used by predict_diabetes and written to disk at the end.
final_model = stacking_clf

In [40]:
# Show the frame again as a reference for the feature columns.
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,blood_glucose_level,diabetes,HbA1c_level
0,Female,0.08,0,0,No Info,11.88,80,0,5.7
1,Female,0.08,0,0,No Info,12.22,130,0,3.5
2,Female,0.08,0,0,No Info,12.50,155,0,4.5
3,Female,0.08,0,0,No Info,12.74,140,0,3.5
4,Female,0.08,0,0,No Info,12.77,160,0,4.5
...,...,...,...,...,...,...,...,...,...
154453,Other,47.00,0,0,never,36.76,90,0,6.6
154454,Other,47.00,0,0,never,36.76,200,0,3.5
154455,Other,52.00,0,0,not current,31.63,159,0,6.6
154456,Other,53.00,0,0,No Info,27.32,160,0,6.6


In [42]:
def calculate_BMI(bb, tb):
    """
    Return the Body Mass Index for a given weight and height.

    BMI is the weight divided by the square of the height.

    Parameters:
    bb (float): Body weight in kilograms.
    tb (float): Height in meters.

    Returns:
    float: The resulting BMI (kg / m^2).
    """
    height_squared = tb ** 2
    bmi = bb / height_squared
    return bmi

In [50]:
# Prediction helper for manually entered values, built on final_model
def predict_diabetes(input_data):
    """
    Classify one record with ``final_model``.

    Parameters:
    input_data (dict): Maps feature name to value for a single person.
        It is wrapped in a one-row DataFrame and handed directly to
        ``final_model.predict``; no separate preprocessing call is made here.

    Returns:
    str: "Diabetic" when the predicted label is 1, otherwise "Not Diabetic".
    """
    single_row = pd.DataFrame([input_data])
    predicted_label = final_model.predict(single_row)[0]
    if predicted_label == 1:
        return "Diabetic"
    return "Not Diabetic"

In [52]:
if __name__ == "__main__":
    # The categorical columns were fitted on the string labels found in df.
    # The encoder is configured with handle_unknown='ignore', so any other value
    # (e.g. the 0/1 integer previously requested for smoking history) is silently
    # encoded as "no category". Offer the known labels and reject anything else.
    gender_options = sorted(df['gender'].unique())
    smoking_options = sorted(df['smoking_history'].unique())

    gender = str(input(f"Enter gender {gender_options}: ")).strip()
    if gender not in gender_options:
        raise ValueError(f"gender must be one of {gender_options}, got {gender!r}")
    age = float(input("Enter age: "))
    body_weight = float(input("Enter body weight (kg): "))
    body_height = float(input("Enter body height (m): "))
    # The model expects BMI, so derive it from the entered weight and height.
    BMI = calculate_BMI(body_weight, body_height)
    hypertension = int(input("Enter hypertension (0 or 1): "))
    heart_disease = int(input("Enter heart disease (0 or 1): "))
    smoking_history = str(input(f"Enter smoking history {smoking_options}: ")).strip()
    if smoking_history not in smoking_options:
        raise ValueError(
            f"smoking history must be one of {smoking_options}, got {smoking_history!r}"
        )
    glucose_level = float(input("Enter glucose level: "))
    HbA1c_level = float(input("Enter HbA1c level: "))

    # Keys must match the feature column names used for training.
    input_data = {
        'gender': gender,
        'age': age,
        'hypertension': hypertension,
        'heart_disease': heart_disease,
        'smoking_history': smoking_history,
        'bmi': BMI,
        'blood_glucose_level': glucose_level,
        'HbA1c_level': HbA1c_level
    }
    result = predict_diabetes(input_data)

In [53]:
# Display the label returned by predict_diabetes for the values entered above.
result

'Not Diabetic'

In [54]:
# Save the final model
import joblib

# Persist the fitted ensemble so it can be loaded later without retraining.
model_path = 'diabtes_predict_model.pkl'
joblib.dump(final_model, model_path)
print(f"Model saved as {model_path}")

Model saved as diabtes_predict_model.pkl
