In [16]:
from collections import Counter
from itertools import islice
import pickle

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (GridSearchCV, StratifiedKFold, KFold,
                                     cross_validate, train_test_split, cross_validate)
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.dummy import DummyClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

## Preprocessing  Pipeline
1. Lower case all column names
2. Columns to one hot encode:
    nominal_columns = ["highbp","highchol","cholcheck","smoker","stroke","heartdiseaseorattack","physactivity","fruits","veggies","hvyalcoholconsump","anyhealthcare","nodocbccost","diffwalk","sex"]
3. Columns to standardize:
    numerical_columns = ["bmi","age","income","menthlth","physhlth","education","genhlth"]
    



In [5]:
data = pd.read_csv("../data/raw/diabetes_binary_health_indicators_BRFSS2015.csv")
data.columns = data.columns.str.lower()

In [6]:
# Split data into train and test datasets. Cross validation will be performed on train dataset.
X = data.drop(columns=["diabetes_binary"])
y = data["diabetes_binary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2025, shuffle=True, stratify=y)

In [7]:
# Building Preprocessing Pipeline Function
def create_pipeline(model: tuple[str, BaseEstimator])->Pipeline:
    nominal_columns = ["highbp","highchol","cholcheck","smoker","stroke","heartdiseaseorattack","physactivity","fruits","veggies","hvyalcoholconsump","anyhealthcare","nodocbccost","diffwalk","sex"]
    numerical_columns = ["bmi","age","income","menthlth","physhlth","education","genhlth"]

    column_trans = ColumnTransformer([
            ('numerical', StandardScaler(), numerical_columns),
            ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), nominal_columns)
        ])

    pipe = Pipeline([("transformer", column_trans),model])
    return pipe

# Candidate Model Evaluation
def model_metrics(model, y_test, y_pred):

    # positive_label = '1'
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,zero_division=np.nan)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred,zero_division=np.nan)
    roc_area = roc_auc_score(np.array(y_test, dtype='float64'), np.array(y_pred, dtype='float64'))

    result_df = pd.DataFrame({'Model': [model], 'Accuracy': [accuracy], 'Precision': [precision], 'Recall': [recall], 'F1': [f1], 'ROC AUC': [roc_area]})
    return result_df

log_pipe = create_pipeline(('model', LogisticRegression(random_state=2024)))
rf_pipe = create_pipeline(('model', RandomForestClassifier(random_state=2024)))
svc_pipe = create_pipeline(('model', SVC(random_state=2024)))
knn = create_pipeline(('model', KNeighborsClassifier()))
gbc = create_pipeline(('model', GradientBoostingClassifier(random_state=2024)))
xgb = create_pipeline(('model', XGBClassifier(random_state=2024)))
xgb

In [8]:
# Split train dataset futher into train and validate dataset

X_train_dev, X_test_validate, y_train_dev, y_test_validate = train_test_split(X_train, y_train, test_size=0.2, random_state=2025, shuffle=True, stratify=y_train)
print(X_train_dev.shape)

(182649, 21)


In [None]:
# Candidate Model Fit / Predict
rf_pipe.fit(X_train_dev, y_train_dev)
log_pipe.fit(X_train_dev, y_train_dev)
svc_pipe.fit(X_train_dev, y_train_dev)
knn.fit(X_train_dev, y_train_dev)
gbc.fit(X_train_dev, y_train_dev)
xgb.fit(X_train_dev, y_train_dev)

rf_pred = rf_pipe.predict(X_test_validate)
log_pred = log_pipe.predict(X_test_validate)
svc_pred = svc_pipe.predict(X_test_validate)
knn_pred = knn.predict(X_test_validate)
gbc_pred = gbc.predict(X_test_validate)
xgb_pred = xgb.predict(X_test_validate)

In [16]:
actual = y_test_validate.values

rf_metrics = model_metrics('Random Forest Classifier', actual, rf_pred)
log_metrics = model_metrics('Logistic Regression', actual, log_pred)
svc_metrics = model_metrics('Support Vector Classifier', actual, svc_pred)
knn_metrics = model_metrics('KNN Classifier', actual, knn_pred)
gbc_metrics = model_metrics('Gradient Boosting Classifier', actual, gbc_pred)
xgb_metrics = model_metrics('XGB Classifier', actual, xgb_pred)

results = pd.concat([rf_metrics, log_metrics, svc_metrics, knn_metrics, gbc_metrics, xgb_metrics]).reset_index(drop=True)
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC
0,Random Forest Classifier,0.859405,0.487042,0.17133,0.253488,0.57106
1,Logistic Regression,0.863259,0.53224,0.153097,0.237793,0.565658
2,Support Vector Classifier,0.865208,0.642366,0.073405,0.131753,0.533394
3,KNN Classifier,0.847229,0.40577,0.207796,0.274844,0.579268
4,Gradient Boosting Classifier,0.866522,0.57189,0.166929,0.258426,0.57335
5,XGB Classifier,0.866018,0.561492,0.175102,0.266954,0.576483


In [17]:
# While accuracy is relatively high at 84% minimum for all models, it is important to highlight that there is significant class imbalance
# Create a dummy classifier that always predicts the most frequent class (all zeros in this case)
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_dev, y_train_dev)
dummy_pred = dummy_clf.predict(X_test_validate)
dummy_metrics = model_metrics('Dummy Classifier (Most Frequent)', y_test_validate, dummy_pred)

# Add dummy classifier results to the overall results
results_with_dummy = pd.concat([dummy_metrics, results]).reset_index(drop=True).sort_values(by=["Accuracy","Precision"], ascending=False)
results_with_dummy

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC
5,Gradient Boosting Classifier,0.866522,0.57189,0.166929,0.258426,0.57335
6,XGB Classifier,0.866018,0.561492,0.175102,0.266954,0.576483
3,Support Vector Classifier,0.865208,0.642366,0.073405,0.131753,0.533394
2,Logistic Regression,0.863259,0.53224,0.153097,0.237793,0.565658
0,Dummy Classifier (Most Frequent),0.860675,,0.0,0.0,0.5
1,Random Forest Classifier,0.859405,0.487042,0.17133,0.253488,0.57106
4,KNN Classifier,0.847229,0.40577,0.207796,0.274844,0.579268


## Attempt to handle Label Imbalance
Strategies Explored:
- weight handling
- under sampling
- over sampling
- SMOTE

The above strategies will be explored for XGBoost only as running it for all models is costly

In [9]:
# Weight
class_counts = Counter(y_train_dev)
total_samples = len(y_train_dev)
scale_pos_weight = class_counts[0] / class_counts[1] 

actual = y_test_validate.values
xgb_weighted = create_pipeline(('model', XGBClassifier(random_state=2024, scale_pos_weight = scale_pos_weight)))
xgb_weighted.fit(X_train_dev, y_train_dev)
xgb_pred_weighted = xgb_weighted.predict(X_test_validate)
xgb_metrics_weight = model_metrics('XGB Classifier', actual, xgb_pred_weighted)
xgb_metrics_weight

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC
0,XGB Classifier,0.728402,0.310373,0.776957,0.443557,0.748749


In [10]:
# Under Sampling
rus = RandomUnderSampler(random_state=2024)
X_under, y_under = rus.fit_resample(X_train_dev, y_train_dev)

xgb_under = create_pipeline(('model', XGBClassifier(random_state=2024)))
xgb_under.fit(X_under, y_under)
xgb_pred_under = xgb_under.predict(X_test_validate)
xgb_metrics_under  = model_metrics('XGB Classifier', actual, xgb_pred_under)
xgb_metrics_under 

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC
0,XGB Classifier,0.715503,0.301206,0.789374,0.436032,0.746459


In [11]:
# Over Sampling
sm = SMOTE(random_state=42)
X_over, y_over = sm.fit_resample(X_train_dev, y_train_dev)

xgb_over = create_pipeline(('model', XGBClassifier(random_state=2024)))
xgb_over.fit(X_over, y_over)
xgb_pred_over = xgb_over.predict(X_test_validate)
xgb_metrics_over = model_metrics('XGB Classifier', actual, xgb_pred_over)
xgb_metrics_over

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC
0,XGB Classifier,0.855178,0.465569,0.26674,0.339163,0.608587


In [12]:
xgb_params = {
    'model__n_estimators': [100],
    'model__max_depth': [4, 5, 6],
    'model__learning_rate': [0.05, 0.1, 0.2],
    'model__colsample_bytree':[0.8, 1.0],
    'model__subsample':[0.8, 1.0],
    'under_sample__random_state': [2025]
}

nominal_columns = ["highbp","highchol","cholcheck","smoker","stroke","heartdiseaseorattack","physactivity",
                    "fruits","veggies","hvyalcoholconsump","anyhealthcare","nodocbccost","diffwalk","sex"]
numerical_columns = ["bmi","age","income","menthlth","physhlth","education","genhlth"]

column_trans = ColumnTransformer([
    ('numerical', StandardScaler(), numerical_columns),
    ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), nominal_columns)
])

xgb_pipe = ImbPipeline([
    ("transformer", column_trans),
    ("under_sample", RandomUnderSampler()),
    ('model', XGBClassifier())
])

cv = KFold(n_splits=10, shuffle=True, random_state=2024)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
search = GridSearchCV(xgb_pipe, xgb_params, scoring=scoring, cv=cv, n_jobs=4, refit='precision', verbose=3)
xgb_result = search.fit(X_train_dev, y_train_dev)
print(xgb_result.best_params_)
print(pd.DataFrame(xgb_result.cv_results_).head(10))

Fitting 10 folds for each of 36 candidates, totalling 360 fits
{'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 100, 'model__subsample': 0.8, 'under_sample__random_state': 2025}
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       1.578107      0.148315         0.255181        0.005510   
1       1.287156      0.041921         0.250677        0.012672   
2       1.491000      0.074129         0.263242        0.011871   
3       1.313951      0.065037         0.260984        0.016857   
4       1.605237      0.053937         0.297465        0.014887   
5       1.397598      0.064275         0.284414        0.024747   
6       1.258444      0.034981         0.249671        0.016536   
7       1.141923      0.038337         0.246175        0.021882   
8       1.372594      0.049955         0.261856        0.013840   
9       1.214107      0.047823         0.263725        0.017386   

   param_model__colsample

In [13]:
xgb_params = {
    'model__n_estimators': [130, 150, 170],
    'model__max_depth': [3, 4, 5],
    'model__learning_rate': [0.01,0.05, 0.08],
    'model__colsample_bytree':[0.4, 0.5, 0.6],
    'model__subsample':[0.4, 0.5, 0.6],
}

cv = KFold(n_splits=10, shuffle=True, random_state=2024)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
search = GridSearchCV(xgb_pipe, xgb_params, scoring=scoring, cv=cv, n_jobs=4, refit='precision', verbose=3)
xgb_result = search.fit(X_train_dev, y_train_dev)
print(xgb_result.best_params_)

Fitting 10 folds for each of 243 candidates, totalling 2430 fits
{'model__colsample_bytree': 0.4, 'model__learning_rate': 0.08, 'model__max_depth': 3, 'model__n_estimators': 130, 'model__subsample': 0.4}


In [14]:
xgb_params = {key.split("__")[1]:value for key, value in xgb_result.best_params_.items() if key.startswith("model__")}
under_params = {key.split("__")[1]:value for key, value in xgb_result.best_params_.items() if key.startswith("under_sample__")}

In [15]:
xgb_model = XGBClassifier(**xgb_params)

nominal_columns = ["highbp","highchol","cholcheck","smoker","stroke","heartdiseaseorattack","physactivity",
                    "fruits","veggies","hvyalcoholconsump","anyhealthcare","nodocbccost","diffwalk","sex"]
numerical_columns = ["bmi","age","income","menthlth","physhlth","education","genhlth"]

column_trans = ColumnTransformer([
    ('numerical', StandardScaler(), numerical_columns),
    ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), nominal_columns)
])
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

xgb_pipe = ImbPipeline([
    ("transformer", column_trans),
    ("undersample", RandomUnderSampler(**under_params)),
    ('model', xgb_model)
])
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2024)
scores = cross_validate(estimator = xgb_pipe, X= X_test_validate, y = y_test_validate, scoring=scoring,cv= cv, verbose=3)

[CV] END  accuracy: (test=0.735) f1: (test=0.469) precision: (test=0.325) recall: (test=0.837) roc_auc: (test=0.847) total time=   0.2s
[CV] END  accuracy: (test=0.710) f1: (test=0.426) precision: (test=0.294) recall: (test=0.772) roc_auc: (test=0.808) total time=   0.2s
[CV] END  accuracy: (test=0.728) f1: (test=0.451) precision: (test=0.314) recall: (test=0.803) roc_auc: (test=0.838) total time=   0.2s
[CV] END  accuracy: (test=0.731) f1: (test=0.451) precision: (test=0.315) recall: (test=0.792) roc_auc: (test=0.832) total time=   0.3s
[CV] END  accuracy: (test=0.710) f1: (test=0.429) precision: (test=0.296) recall: (test=0.781) roc_auc: (test=0.817) total time=   0.2s
[CV] END  accuracy: (test=0.719) f1: (test=0.434) precision: (test=0.302) recall: (test=0.772) roc_auc: (test=0.819) total time=   0.2s
[CV] END  accuracy: (test=0.724) f1: (test=0.440) precision: (test=0.307) recall: (test=0.780) roc_auc: (test=0.828) total time=   0.2s
[CV] END  accuracy: (test=0.719) f1: (test=0.439

In [19]:
# Using protocol=5 is recommended to reduce memory usage and make it faster to store and load any large NumPy array stored as a fitted attribute in the model. 
with open("../model/trained_model.pkl", "wb") as model:
    pickle.dump(xgb_pipe, model, 5)