In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, confusion_matrix, recall_score,accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler


In [2]:
df= pd.read_csv('Data/diabetes_clean.csv')
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80,No,Yes,never,25.19,6.6,140.0,0
1,Female,54,No,No,no info,27.32,6.6,80.0,0
2,Male,28,No,No,never,27.32,5.7,158.0,0
3,Female,36,No,No,current,23.45,5.0,155.0,0
4,Male,76,Yes,Yes,current,20.14,4.8,155.0,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80,No,No,no info,27.32,6.2,90.0,0
99996,Female,2,No,No,no info,17.37,6.5,100.0,0
99997,Male,66,No,No,former,27.83,5.7,155.0,0
99998,Female,24,No,No,never,35.42,4.0,100.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  int64  
 2   hypertension         100000 non-null  object 
 3   heart_disease        100000 non-null  object 
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  float64
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 6.9+ MB


In [4]:
df['hypertension'] = df['hypertension'].astype('category')
df['heart_disease'] = df['heart_disease'].astype('category')

In [5]:
X=df.drop(columns="diabetes")
y=df['diabetes']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
trf1=ColumnTransformer(transformers=[
    ('encode',OneHotEncoder(drop='first',handle_unknown='ignore'),['gender','hypertension','heart_disease','smoking_history']),
    ('scale',StandardScaler(),['age','bmi','HbA1c_level','blood_glucose_level'])
],remainder='passthrough')

In [25]:
trf2=ColumnTransformer(transformers=[
    ('encode',OneHotEncoder(drop='first',handle_unknown='ignore'),['gender','hypertension','heart_disease','smoking_history']),
    ('scale',MinMaxScaler(),['age','bmi','HbA1c_level','blood_glucose_level'])
],remainder='passthrough')

In [26]:
pipe1= Pipeline(steps=[
    ('trf1',trf1),
    ('smote',SMOTE(random_state=42)),
    ('lr',LogisticRegression())
])
pipe1.fit(X_train,y_train)

In [27]:
y_pred1 =pipe1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Precision:", precision_score(y_test, y_pred1))
print("Recall:", recall_score(y_test, y_pred1))
print("F1 Score:", f1_score(y_test, y_pred1))
print("ROC-AUC:", roc_auc_score(y_test, pipe1.predict_proba(X_test)[:, 1]))
report = classification_report(y_test, y_pred1)
print(report)

Accuracy: 0.87175
Precision: 0.38690947479546056
Recall: 0.8583138173302107
F1 Score: 0.5333818446425322
ROC-AUC: 0.9514678067887525
              precision    recall  f1-score   support

           0       0.99      0.87      0.93     18292
           1       0.39      0.86      0.53      1708

    accuracy                           0.87     20000
   macro avg       0.69      0.87      0.73     20000
weighted avg       0.93      0.87      0.89     20000



In [28]:
param_grid1= {
    'lr__C': [0.001, 0.01, 0.1, 1, 10],  
    'lr__penalty': ['l2'],                
    'lr__solver': ['liblinear', 'saga'],  
    'lr__max_iter': [100, 200, 300]       
}

In [29]:
grid_search1 = GridSearchCV(pipe1, param_grid=param_grid1, cv=5, scoring='recall', n_jobs=-1)
grid_search1.fit(X_train, y_train)
print("Best Parameters:", grid_search1.best_params_)
print("Best Score:", grid_search1.best_score_)

Best Parameters: {'lr__C': 0.001, 'lr__max_iter': 100, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
Best Score: 0.8776495755672379


In [30]:
model1 = grid_search1.best_estimator_
y_pred1 = model1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Precision:", precision_score(y_test, y_pred1))
print("Recall:", recall_score(y_test, y_pred1))
print("F1 Score:", f1_score(y_test, y_pred1))
print("ROC-AUC:", roc_auc_score(y_test, pipe1.predict_proba(X_test)[:, 1]))
report = classification_report(y_test, y_pred1)
print(report)

Accuracy: 0.86165
Precision: 0.3689031938598663
Recall: 0.8723653395784543
F1 Score: 0.5185314076909692
ROC-AUC: 0.9514678067887525
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     18292
           1       0.37      0.87      0.52      1708

    accuracy                           0.86     20000
   macro avg       0.68      0.87      0.72     20000
weighted avg       0.93      0.86      0.89     20000



In [39]:
pipe2= Pipeline(steps=[
    ('trf1',trf2),
    ('smote',SMOTE(random_state=42)),
    ('rf',RandomForestClassifier(class_weight='balanced', random_state=42))
])
pipe2.fit(X_train,y_train)

In [40]:
param_grid2 = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__class_weight': ['balanced', {0: 1, 1: 3}]
}

In [41]:
grid_search2 = GridSearchCV(
    pipe2,
    param_grid=param_grid2,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
    )
grid_search2.fit(X_train, y_train)
print("Best Parameters:", grid_search2.best_params_)
print("Best Score:", grid_search2.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'rf__class_weight': 'balanced', 'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}
Best Score: 0.7328438783211357


In [42]:
model2 = grid_search2.best_estimator_
y_pred2 = model2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2))
print("Recall:", recall_score(y_test, y_pred2))
print("F1 Score:", f1_score(y_test, y_pred2))
print("ROC-AUC:", roc_auc_score(y_test, pipe2.predict_proba(X_test)[:, 1]))
report = classification_report(y_test, y_pred2)
print(report)

Accuracy: 0.9566
Precision: 0.7551640340218712
Recall: 0.727751756440281
F1 Score: 0.7412045319022064
ROC-AUC: 0.9595834372508221
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     18292
           1       0.76      0.73      0.74      1708

    accuracy                           0.96     20000
   macro avg       0.86      0.85      0.86     20000
weighted avg       0.96      0.96      0.96     20000



In [13]:
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

In [31]:
pipe3= Pipeline(steps=[
    ('trf2',trf2),
    ('smote',SMOTE(random_state=42)),
    ('xgb',XGBClassifier(eval_metric='logloss'))
])
pipe3.fit(X_train,y_train)

In [32]:
param_grid3 = {
    'xgb__n_estimators': [100, 300],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.1, 0.2],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0]
}

In [33]:
grid_search3 = GridSearchCV(
    pipe3,
    param_grid=param_grid3,
    scoring='recall', 
    cv=3,
    n_jobs=-1,
    verbose=1
)
grid_search3.fit(X_train, y_train)
print("Best Parameters:", grid_search3.best_params_)
print("Best Score:", grid_search3.best_score_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best Parameters: {'xgb__colsample_bytree': 0.8, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 100, 'xgb__subsample': 0.8}
Best Score: 0.8162544169611308


In [34]:
model3 = grid_search3.best_estimator_
y_pred3 = model3.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred3))
print("Precision:", precision_score(y_test, y_pred3))
print("Recall:", recall_score(y_test, y_pred3))
print("F1 Score:", f1_score(y_test, y_pred3))
print("ROC-AUC:", roc_auc_score(y_test, pipe3.predict_proba(X_test)[:, 1]))
report = classification_report(y_test, y_pred3)
print(report)

Accuracy: 0.93355
Precision: 0.5768762677484787
Recall: 0.832552693208431
F1 Score: 0.6815240833932422
ROC-AUC: 0.9740184726459296
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     18292
           1       0.58      0.83      0.68      1708

    accuracy                           0.93     20000
   macro avg       0.78      0.89      0.82     20000
weighted avg       0.95      0.93      0.94     20000



In [49]:
from sklearn.ensemble import StackingClassifier
stacked = StackingClassifier(
    estimators=[
        ('lr_pipe1', pipe1),
        ('xgb_pipe3',pipe3)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1,
)

stacked.fit(X_train, y_train)



In [50]:
y_pred = stacked.predict(X_test)
y_proba = stacked.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

Accuracy: 0.9659
Precision: 0.8874622356495468
Recall: 0.6879391100702577
F1 Score: 0.775065963060686
ROC-AUC: 0.9714462747436716
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.89      0.69      0.78      1708

    accuracy                           0.97     20000
   macro avg       0.93      0.84      0.88     20000
weighted avg       0.96      0.97      0.96     20000



In [51]:
import joblib
with open('diabetes_model.pkl', 'wb') as file:
    joblib.dump(stacked,file, compress=9)

In [52]:
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
with open("performance_report.txt", "w") as f:
    f.write("Model: Stacked Ensemble\n")
    f.write("Accuracy: {:.4f}\n".format(accuracy_score(y_test, y_pred)))
    f.write("Precision: {:.4f}\n".format(precision_score(y_test, y_pred)))
    f.write("Recall: {:.4f}\n".format(recall_score(y_test, y_pred)))
    f.write("F1 Score: {:.4f}\n".format(f1_score(y_test, y_pred)))
    f.write("ROC-AUC: {:.4f}\n".format(roc_auc_score(y_test, y_proba)))
    f.write("\nClassification Report:\n")
    f.write(report)
    f.write("\nConfusion Matrix:\n")
    f.write(np.array2string(cm))
