In [6]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_validate, train_test_split)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

## Preprocessing  Pipeline
1. Lower case all column names
2. Columns to one hot encode:
    nominal_columns = ["highbp","highchol","cholcheck","smoker","stroke","heartdiseaseorattack","physactivity","fruits","veggies","hvyalcoholconsump","anyhealthcare","nodocbccost","diffwalk","sex"]
3. Columns to standardize:
    numerical_columns = ["bmi","age","income","menthlth","physhlth","education","genhlth"]
    



In [7]:
data = pd.read_csv("../data/raw/diabetes_binary_health_indicators_BRFSS2015.csv")
data.columns = data.columns.str.lower()

In [8]:
# Split data into train and test datasets. Cross validation will be performed on train dataset.
X = data.drop(columns=["diabetes_binary"])
y = data["diabetes_binary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2025, shuffle=True, stratify=y)

In [9]:
# Building Preprocessing Pipeline Function
def create_pipeline(model: tuple[str, BaseEstimator])->Pipeline:
    nominal_columns = ["highbp","highchol","cholcheck","smoker","stroke","heartdiseaseorattack","physactivity","fruits","veggies","hvyalcoholconsump","anyhealthcare","nodocbccost","diffwalk","sex"]
    numerical_columns = ["bmi","age","income","menthlth","physhlth","education","genhlth"]

    column_trans = ColumnTransformer([
            ('numerical', StandardScaler(), numerical_columns),
            ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), nominal_columns)
        ])

    pipe = Pipeline([("transformer", column_trans),model])
    return pipe

log_pipe = create_pipeline(('model', LogisticRegression(random_state=2024)))
rf_pipe = create_pipeline(('model', RandomForestClassifier(random_state=2024)))
svc_pipe = create_pipeline(('model', SVC(random_state=2024)))
knn = create_pipeline(('model', KNeighborsClassifier()))
gbc = create_pipeline(('model', GradientBoostingClassifier(random_state=2024)))
xgb = create_pipeline(('model', XGBClassifier(random_state=2024)))
xgb

In [None]:
# Split train dataset futher into train and validate dataset

X_train_dev, X_test_dev, y_train_validate, y_test_validate, = train_test_split(X_train, y_train, test_size=0.2, random_state=2025, shuffle=True, stratify=y_train)
print(X_train_dev.shape)

# Candidate Model Fit / Predict
rf_pipe.fit(X_train_dev, y_train_validate)
log_pipe.fit(X_train_dev, y_train_validate)
svc_pipe.fit(X_train_dev, y_train_validate)
knn.fit(X_train_dev, y_train_validate)
gbc.fit(X_train_dev, y_train_validate)
xgb.fit(X_train_dev, y_train_validate)

rf_pred = rf_pipe.predict(X_test_dev)
log_pred = log_pipe.predict(X_test_dev)
svc_pred = svc_pipe.predict(X_test_dev)
knn_pred = knn.predict(X_test_dev)
gbc_pred = gbc.predict(X_test_dev)
xgb_pred = xgb.predict(X_test_dev)

(182649, 21)


In [8]:
# Candidate Model Evaluation
def model_metrics(model, y_test, y_pred):

    # positive_label = '1'
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_area = roc_auc_score(np.array(y_test, dtype='float64'), np.array(y_pred, dtype='float64'))

    result_df = pd.DataFrame({'Model': [model], 'Accuracy': [accuracy], 'Precision': [precision], 'Recall': [recall], 'F1': [f1], 'ROC AUC': [roc_area]})
    return result_df

actual = y_test.values

rf_metrics = model_metrics('Random Forest Classifier', actual, rf_pred)
log_metrics = model_metrics('Logistic Regression', actual, log_pred)
svc_metrics = model_metrics('Support Vector Classifier', actual, svc_pred)
knn_metrics = model_metrics('KNN Classifier', actual, knn_pred)
gbc_metrics = model_metrics('Gradient Boosting Classifier', actual, gbc_pred)
xgb_metrics = model_metrics('XGB Classifier', actual, xgb_pred)

results = pd.concat([rf_metrics, log_metrics, svc_metrics, knn_metrics, gbc_metrics, xgb_metrics]).reset_index(drop=True)
results

NameError: name 'rf_pred' is not defined