In [None]:
!pip install catboost



### Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

import joblib

import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries - consider narrowing the filter.
warnings.filterwarnings('ignore')

##### Importing Data

In [None]:
# Read the feature-engineered diabetes dataset and preview the first rows.
# NOTE(review): absolute '/content/...' path - presumably a Colab upload
# location; confirm before running elsewhere.
csv_path = '/content/diabetes_data_fe.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,insulin_level,hba1c,diagnosed_diabetes,homa_ir,tg_hdl_ratio,glysemic_gap,mean_arterial_pressure,obesity_risk_index,lifestyle_risk_score,cardiometabolic_burden
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,6.36,8.18,1,2.135704,3.536585,100,96.666667,27.145,8.053883,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,2.0,5.63,0,0.459259,0.545455,57,93.666667,18.48,8.836815,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,5.07,7.51,1,1.477185,0.545455,77,87.0,17.982,8.252377,0
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,5.28,9.03,1,1.812148,2.8,114,102.0,23.584,5.447273,0
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,12.74,7.2,1,4.30958,3.076923,47,75.333333,16.536,5.131042,0


In [None]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_1 = df.copy(deep=True)
df_1.head(5)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,insulin_level,hba1c,diagnosed_diabetes,homa_ir,tg_hdl_ratio,glysemic_gap,mean_arterial_pressure,obesity_risk_index,lifestyle_risk_score,cardiometabolic_burden
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,6.36,8.18,1,2.135704,3.536585,100,96.666667,27.145,8.053883,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,2.0,5.63,0,0.459259,0.545455,57,93.666667,18.48,8.836815,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,5.07,7.51,1,1.477185,0.545455,77,87.0,17.982,8.252377,0
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,5.28,9.03,1,1.812148,2.8,114,102.0,23.584,5.447273,0
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,12.74,7.2,1,4.30958,3.076923,47,75.333333,16.536,5.131042,0


##### Data Preprocessing

In [None]:
# Numeric columns routed to the RobustScaler branch of the preprocessor.
num_feat_skewed = [
    'physical_activity_minutes_per_week',
    'systolic_bp',
    'ldl_cholesterol',
    'triglycerides',
    'insulin_level',
]

# Object-typed columns routed to the one-hot encoder.
cat_feat = ['gender', 'ethnicity', 'employment_status', 'smoking_status']

# Non-object columns that must NOT be standard-scaled: four flag/burden columns,
# the target, and the columns already listed in `num_feat_skewed`.
_not_standard_scaled = [
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
    'cardiometabolic_burden',
    'diagnosed_diabetes',
] + num_feat_skewed

# Every remaining non-object column goes to the StandardScaler branch.
# Computed at this point, i.e. before any later-added columns exist on df_1.
num_feat = df_1.select_dtypes(exclude='object').columns.drop(_not_standard_scaled)


Label Encoding Education & Income

In [None]:
# Ordinal codes for education: position in the ordered list is the code (0..3).
edu_mapping = {
    level: rank
    for rank, level in enumerate(
        ['No formal', 'Highschool', 'Graduate', 'Postgraduate']
    )
}

# Ordinal codes for income: position in the ordered list is the code (0..4).
income_mapping = {
    level: rank
    for rank, level in enumerate(
        ['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']
    )
}

In [None]:
# Append the ordinal education code; labels missing from edu_mapping become NaN.
df_1 = df_1.assign(edu_level_encoded=df_1['education_level'].map(edu_mapping))

In [None]:
# Append the ordinal income code; labels missing from income_mapping become NaN.
df_1 = df_1.assign(income_level_encoded=df_1['income_level'].map(income_mapping))

In [None]:
# The raw string columns are replaced by their encoded versions, so remove them.
df_1 = df_1.drop(columns=['education_level', 'income_level'])

In [None]:
# Preview the frame after encoding to confirm the new *_encoded columns.
df_1.head(5)

Unnamed: 0,age,gender,ethnicity,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,...,diagnosed_diabetes,homa_ir,tg_hdl_ratio,glysemic_gap,mean_arterial_pressure,obesity_risk_index,lifestyle_risk_score,cardiometabolic_burden,edu_level_encoded,income_level_encoded
0,58,Male,Asian,Employed,Never,0,215,5.7,7.9,7.9,...,1,2.135704,3.536585,100,96.666667,27.145,8.053883,1,1,1
1,48,Female,White,Employed,Former,1,143,6.7,6.5,8.7,...,0,0.459259,0.545455,57,93.666667,18.48,8.836815,0,1,2
2,60,Male,Hispanic,Unemployed,Never,1,57,6.4,10.0,8.1,...,1,1.477185,0.545455,77,87.0,17.982,8.252377,0,1,2
3,74,Female,Black,Retired,Never,0,49,3.4,6.6,5.2,...,1,1.812148,2.8,114,102.0,23.584,5.447273,0,1,0
4,46,Male,White,Retired,Never,1,109,7.2,7.4,5.0,...,1,4.30958,3.076923,47,75.333333,16.536,5.131042,0,2,2


In [None]:
# One transformer per column group used by the ColumnTransformer below.
num_transformer = StandardScaler()          # zero-mean / unit-variance scaling
num_skewed_transformer = RobustScaler()     # median / IQR scaling
onehot_transformer = OneHotEncoder(
    handle_unknown='ignore',  # unseen categories do not raise at transform time
    drop='first',             # drop the first level of every categorical column
)

In [None]:
# Route each column group to its transformer. Columns not named in any group
# are forwarded unchanged because of remainder='passthrough'.
preprocessor = ColumnTransformer(
    [
        ("StandardScaler", num_transformer, num_feat),
        ("RobustScaler", num_skewed_transformer, num_feat_skewed),
        ("OneHotEncoder", onehot_transformer, cat_feat),
    ],
    remainder='passthrough',
)

##### Splitting data into X & y variables

In [None]:
# Feature matrix: every column except the target.
X = df_1.drop(columns=['diagnosed_diabetes'])

In [None]:
# Preview the feature matrix.
X.head(n=5)

Unnamed: 0,age,gender,ethnicity,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,...,hba1c,homa_ir,tg_hdl_ratio,glysemic_gap,mean_arterial_pressure,obesity_risk_index,lifestyle_risk_score,cardiometabolic_burden,edu_level_encoded,income_level_encoded
0,58,Male,Asian,Employed,Never,0,215,5.7,7.9,7.9,...,8.18,2.135704,3.536585,100,96.666667,27.145,8.053883,1,1,1
1,48,Female,White,Employed,Former,1,143,6.7,6.5,8.7,...,5.63,0.459259,0.545455,57,93.666667,18.48,8.836815,0,1,2
2,60,Male,Hispanic,Unemployed,Never,1,57,6.4,10.0,8.1,...,7.51,1.477185,0.545455,77,87.0,17.982,8.252377,0,1,2
3,74,Female,Black,Retired,Never,0,49,3.4,6.6,5.2,...,9.03,1.812148,2.8,114,102.0,23.584,5.447273,0,1,0
4,46,Male,White,Retired,Never,1,109,7.2,7.4,5.0,...,7.2,4.30958,3.076923,47,75.333333,16.536,5.131042,0,2,2


In [None]:
# Target vector: the diagnosed_diabetes column as a Series.
y = df_1.loc[:, 'diagnosed_diabetes']

In [None]:
# Display the target Series as the cell output.
y

Unnamed: 0,diagnosed_diabetes
0,1
1,0
2,1
3,1
4,1
...,...
99995,0
99996,0
99997,0
99998,0


##### Splitting data into train & test sets

In [None]:
# Hold out 20% of the rows for testing. Stratify on the target so the train and
# test sets keep the same class proportions as the full data; random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Defining a function to evaluate models

In [None]:
def eval_model(pipeline,X_train,X_test,y_train,y_test):
  """Fit `pipeline` on the training split and print test-split metrics.

  The pipeline is fitted in place. Printed in order: the classification
  report, the ROC-AUC computed from the second column of `predict_proba`,
  and the confusion matrix. Nothing is returned.
  """
  pipeline.fit(X_train, y_train)

  predicted_labels = pipeline.predict(X_test)
  # Column 1 of predict_proba is used as the score for ROC-AUC.
  positive_scores = pipeline.predict_proba(X_test)[:, 1]

  report = classification_report(y_test, predicted_labels)
  print("Classifer Report:\n", report)

  auc = roc_auc_score(y_test, positive_scores)
  print("ROC-AUC", auc)

  matrix = confusion_matrix(y_test, predicted_labels)
  print("\nConfusion Matrix:\n", matrix)

##### Logistic Regression

In [None]:
# Linear baseline: shared preprocessing followed by logistic regression.
log_reg = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])

In [None]:
# Fit the logistic-regression pipeline and print its test-set metrics.
eval_model(
    pipeline=log_reg,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

Classifer Report:
               precision    recall  f1-score   support

           0       0.84      0.81      0.82      8077
           1       0.87      0.89      0.88     11923

    accuracy                           0.86     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.86      0.86      0.86     20000

ROC-AUC 0.9327420383306191

Confusion Matrix:
 [[ 6514  1563]
 [ 1286 10637]]


##### Decision Tree

In [None]:
# Single decision tree, constrained by depth and minimum node sizes.
decision_tree = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        DecisionTreeClassifier(
            random_state=42,
            criterion='gini',
            max_depth=5,
            min_samples_split=100,
            min_samples_leaf=50,
        ),
    ),
])

In [None]:
# Fit the decision-tree pipeline and print its test-set metrics.
eval_model(
    pipeline=decision_tree,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

Classifer Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      8077
           1       1.00      0.87      0.93     11923

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000

ROC-AUC 0.9439143006592247

Confusion Matrix:
 [[ 8077     0]
 [ 1596 10327]]


In [None]:
def cv_score(pipeline, X=None, y=None, cv=5, scoring='roc_auc'):
  """Cross-validate `pipeline` and return the per-fold scores.

  Args:
    pipeline: Estimator/pipeline passed to `cross_val_score`.
    X: Feature matrix. Defaults to the notebook-level `X_train`.
    y: Target vector. Defaults to the notebook-level `y_train`.
    cv: Cross-validation setting forwarded to `cross_val_score` (default 5).
    scoring: Metric forwarded to `cross_val_score` (default 'roc_auc').

  Returns:
    Whatever `cross_val_score` returns (an array with one score per fold).
  """
  # Fall back to the global training split so existing `cv_score(pipe)` calls
  # behave exactly as before.
  features = X_train if X is None else X
  target = y_train if y is None else y
  return cross_val_score(
    pipeline,
    features, target,
    cv=cv,
    scoring=scoring
  )

In [None]:
# 5-fold ROC-AUC of the decision-tree pipeline on the training split.
cv_score(pipeline=decision_tree)

array([0.94796264, 0.94575365, 0.94054407, 0.93983831, 0.94492306])

In [None]:
def get_feat_imp(pipeline, top_n=15):
  """Return the highest-ranked feature importances of a fitted pipeline.

  Args:
    pipeline: Fitted pipeline whose `named_steps` contains a 'classifier'
      step exposing `feature_importances_` and a 'preprocessor' step
      exposing `get_feature_names_out()`.
    top_n: How many top features to return. Defaults to 15 (the previously
      hard-coded value); pass None to return every feature.

  Returns:
    pandas Series of importances indexed by transformed feature name,
    sorted from most to least important.
  """
  steps = pipeline.named_steps

  ranked = pd.Series(
    steps['classifier'].feature_importances_,
    index=steps['preprocessor'].get_feature_names_out()
  ).sort_values(ascending=False)

  return ranked if top_n is None else ranked.head(top_n)

In [None]:
# Top feature importances of the fitted decision tree.
get_feat_imp(pipeline=decision_tree)

Unnamed: 0,0
StandardScaler__hba1c,0.956192
StandardScaler__glucose_fasting,0.038673
remainder__family_history_diabetes,0.002332
StandardScaler__age,0.00152
RobustScaler__physical_activity_minutes_per_week,0.001006
StandardScaler__diet_score,0.000177
RobustScaler__systolic_bp,9.9e-05
StandardScaler__alcohol_consumption_per_week,0.0
StandardScaler__screen_time_hours_per_day,0.0
StandardScaler__sleep_hours_per_day,0.0


##### Random Forest

In [None]:
# Random forest of 300 depth-limited trees, trained on all available cores.
rf_clf = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            random_state=42,
            n_jobs=-1,
            n_estimators=300,
            max_depth=8,
            min_samples_leaf=50,
            min_samples_split=100,
        ),
    ),
])

In [None]:
# Fit the random-forest pipeline and print its test-set metrics.
eval_model(
    pipeline=rf_clf,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

Classifer Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      8077
           1       1.00      0.87      0.93     11923

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000

ROC-AUC 0.9428577397883791

Confusion Matrix:
 [[ 8075     2]
 [ 1593 10330]]


In [None]:
# 5-fold ROC-AUC of the random-forest pipeline on the training split.
cv_score(pipeline=rf_clf)

array([0.94731882, 0.94511027, 0.94124876, 0.94019295, 0.94348451])

In [None]:
# Top feature importances of the fitted random forest.
get_feat_imp(pipeline=rf_clf)

Unnamed: 0,0
StandardScaler__hba1c,0.5004
StandardScaler__glucose_postprandial,0.244771
StandardScaler__glucose_fasting,0.109309
StandardScaler__glysemic_gap,0.101481
remainder__family_history_diabetes,0.014424
StandardScaler__homa_ir,0.007849
StandardScaler__age,0.004779
RobustScaler__physical_activity_minutes_per_week,0.002423
StandardScaler__obesity_risk_index,0.001916
StandardScaler__bmi,0.001657


##### XGBoost

In [None]:
# Gradient-boosted trees (XGBoost) with row and column subsampling.
xgb_clf = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        XGBClassifier(
            random_state=42,
            n_jobs=-1,
            n_estimators=300,
            learning_rate=0.1,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
        ),
    ),
])

In [None]:
# Fit the XGBoost pipeline and print its test-set metrics.
eval_model(
    pipeline=xgb_clf,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

Classifer Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      8077
           1       1.00      0.87      0.93     11923

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000

ROC-AUC 0.9408708458616637

Confusion Matrix:
 [[ 8062    15]
 [ 1592 10331]]


##### LightGBM

In [None]:
# Gradient-boosted trees (LightGBM): many shallow trees with a small step size.
lgbm_clf = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', lgb.LGBMClassifier(
            n_estimators=500,
            max_depth=5,
            learning_rate=0.01,
            subsample=0.8,
            # LightGBM ignores `subsample` unless bagging is enabled with a
            # positive frequency; resample the rows at every iteration.
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            # Silence the per-fit [Info] log lines that flood the cell output.
            verbose=-1
        ))
    ]
)

In [None]:
# Fit the LightGBM pipeline and print its test-set metrics.
eval_model(
    pipeline=lgbm_clf,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

[LightGBM] [Info] Number of positive: 48075, number of negative: 31925
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4079
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.600938 -> initscore=0.409373
[LightGBM] [Info] Start training from score 0.409373
Classifer Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      8077
           1       1.00      0.87      0.93     11923

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000

ROC-AUC 0.9451307127133329

Confusion Matrix:
 [[ 8077     0]
 [ 1596 10327]]


In [None]:
# 5-fold ROC-AUC of the LightGBM pipeline on the training split.
cv_score(pipeline=lgbm_clf)

[LightGBM] [Info] Number of positive: 38460, number of negative: 25540
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4066
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.600938 -> initscore=0.409373
[LightGBM] [Info] Start training from score 0.409373
[LightGBM] [Info] Number of positive: 38460, number of negative: 25540
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4049
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.600938 -> initscore=0.409373
[LightGBM] [Info] Start training from score 0.409373
[LightGBM] [Info

array([0.95059037, 0.94899206, 0.94275629, 0.94333427, 0.94669977])

##### CatBoost

In [None]:
# Gradient-boosted trees (CatBoost) with fixed, hand-picked hyperparameters.
cat_clf = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        CatBoostClassifier(
            random_state=42,
            verbose=0,  # no per-iteration training log
            iterations=500,
            learning_rate=0.01,
            depth=5,
        ),
    ),
])

In [None]:
# Fit the CatBoost pipeline and print its test-set metrics.
eval_model(
    pipeline=cat_clf,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

Classifer Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      8077
           1       1.00      0.87      0.93     11923

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000

ROC-AUC 0.9458749127004756

Confusion Matrix:
 [[ 8077     0]
 [ 1596 10327]]


In [None]:
# 5-fold ROC-AUC of the CatBoost pipeline on the training split.
cv_score(pipeline=cat_clf)

array([0.95121259, 0.94923906, 0.94399536, 0.94377731, 0.94721711])

##### Hyperparameter Tuning

In [None]:
from scipy.stats import randint, uniform

In [None]:
# Search space for the CatBoost step ('classifier__' targets that pipeline step).
# scipy conventions: randint(low, high) draws integers in [low, high - 1];
# uniform(loc, scale) draws floats in [loc, loc + scale].
cat_params = dict(
    classifier__iterations=randint(200, 600),      # 200 .. 599
    classifier__depth=randint(4, 8),               # 4 .. 7
    classifier__learning_rate=uniform(0.01, 0.1),  # 0.01 .. 0.11
    classifier__l2_leaf_reg=uniform(1, 10),        # 1 .. 11
    classifier__border_count=randint(32, 128),     # 32 .. 127
)

In [None]:
# Untuned CatBoost pipeline used as the base estimator for the random search;
# the searched hyperparameters are injected through `cat_params`.
cat_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(random_state=42, verbose=0)),
])

In [None]:
# Randomised search over `cat_params`: 10 sampled candidates, each scored by
# 5-fold cross-validated ROC-AUC, run on all available cores.
random_search = RandomizedSearchCV(
    cat_pipe,
    cat_params,
    scoring='roc_auc',
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the search on the training split only; the test split stays untouched.
random_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Hyperparameter combination selected by the search (keys keep the
# 'classifier__' step prefix used in cat_params).
best_params = random_search.best_params_
best_params

{'classifier__border_count': 107,
 'classifier__depth': 5,
 'classifier__iterations': 221,
 'classifier__l2_leaf_reg': np.float64(1.070663052197174),
 'classifier__learning_rate': np.float64(0.012306242504141576)}

In [None]:
# Pipeline exposed by the search as its best estimator.
best_cat_pipeline = random_search.best_estimator_

In [None]:
# Score the tuned pipeline on the held-out test split.
# Column 1 of predict_proba is used as the ROC-AUC score below.
y_prob = best_cat_pipeline.predict_proba(X_test)[:, 1]
y_pred = best_cat_pipeline.predict(X_test)

In [None]:
# Test-set metrics of the tuned CatBoost pipeline.
tuned_report = classification_report(y_test, y_pred)
print(tuned_report)
tuned_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", tuned_auc)

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      8077
           1       0.99      0.87      0.93     11923

    accuracy                           0.92     20000
   macro avg       0.91      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000

ROC-AUC: 0.9457789646081444


In [None]:
# Persist the tuned pipeline chosen by the random search. The original call
# saved `cat_clf`, the hand-configured baseline, under the "best" file name.
joblib.dump(best_cat_pipeline, "best_cat_model.pkl")

['best_cat_model.pkl']