In [1]:
# 0) SHARED SETUP
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    RocCurveDisplay, precision_recall_fscore_support
)

# One seed shared by the stochastic steps of this notebook.
RANDOM_STATE = 42

# --- Data: the class-balanced (50/50) binary BRFSS 2015 extract ---
TARGET = "Diabetes_binary"
df = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

# To work on the multi-class dataset instead, swap in these two lines:
# TARGET = "Diabetes_012"
# df = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")

# Label vector (cast to int) and feature matrix (every column except the label).
y = df[TARGET].astype(int)
X = df.drop(columns=TARGET)

# Stratified hold-out split: 80% for training, 20% kept aside as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

def evaluate(model, X_test, y_test, name="Model"):
    """Print core classification metrics and return them as a dict.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and, optionally,
        ``predict_proba``.
    X_test : features handed to ``model.predict`` / ``model.predict_proba``.
    y_test : true labels matching ``X_test``.
    name : str
        Label used in the printed header and stored under ``"model"``.

    Returns
    -------
    dict
        ``{"model": name, "roc_auc": <float or nan>, "f1_<label>": <float>, ...}``
        with one ``f1_<label>`` entry per distinct label found in ``y_test``.
    """
    y_pred = model.predict(X_test)

    # ROC-AUC is computed from column 1 of predict_proba. This is best-effort:
    # if the model has no predict_proba or the score cannot be computed, the
    # metric is reported as NaN instead of aborting the evaluation.
    try:
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
    except Exception:
        auc = np.nan

    print(f"\n{name} — Classification report")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    if not np.isnan(auc):
        print(f"ROC-AUC: {auc:.4f}")

    # Pin the label set explicitly. Without `labels=`, the per-class scores
    # cover the union of true and predicted labels, so a prediction of a class
    # absent from y_test would shift every following F1 onto the wrong key.
    labels = np.unique(y_test)
    f1_per_label = precision_recall_fscore_support(
        y_test, y_pred, labels=labels, average=None
    )[2]

    return {
        "model": name,
        "roc_auc": auc,
        **{f"f1_{label}": f1 for label, f1 in zip(labels, f1_per_label)},
    }


In [2]:
from sklearn.linear_model import LogisticRegression

# Baseline model: standardize the features, then fit a logistic regression.
# The scaler lives inside the pipeline so it is fitted on training data only.
logreg_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        solver="lbfgs",
        max_iter=2000,
        # `multi_class` is deliberately left at its default: scikit-learn >= 1.5
        # deprecates the argument (passing it, even as "auto", emits a
        # FutureWarning), and the default handles both the binary and the
        # Diabetes_012 multi-class target.
        class_weight=None    # set to 'balanced' for imbalanced (non-5050) data
    ))
])

logreg_pipe.fit(X_train, y_train)
logreg_metrics = evaluate(logreg_pipe, X_test, y_test, "Logistic Regression")





Logistic Regression — Classification report
              precision    recall  f1-score   support

           0     0.7551    0.7277    0.7411      7070
           1     0.7372    0.7639    0.7503      7069

    accuracy                         0.7458     14139
   macro avg     0.7461    0.7458    0.7457     14139
weighted avg     0.7461    0.7458    0.7457     14139

Confusion matrix:
 [[5145 1925]
 [1669 5400]]
ROC-AUC: 0.8232


In [3]:
# Tune the regularization strength C of the pipeline's "clf" step with
# 5-fold stratified cross-validation on the training split, scored by macro-F1.
param_grid = {
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__penalty": ["l2"],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
grid_lr = GridSearchCV(logreg_pipe, param_grid, scoring="f1_macro", cv=cv, n_jobs=-1)
grid_lr.fit(X_train, y_train)

# Show what the search selected, so the reader can tell whether tuning
# actually moved away from the baseline configuration.
print(f"Best params: {grid_lr.best_params_} (CV f1_macro = {grid_lr.best_score_:.4f})")

best_lr = grid_lr.best_estimator_
# Keep the tuned metrics (as done for the baseline) so they can be aggregated later.
logreg_tuned_metrics = evaluate(best_lr, X_test, y_test, "LogReg (Tuned)")
logreg_tuned_metrics  # last expression: still displayed as the cell's output





LogReg (Tuned) — Classification report
              precision    recall  f1-score   support

           0     0.7551    0.7277    0.7411      7070
           1     0.7372    0.7639    0.7503      7069

    accuracy                         0.7458     14139
   macro avg     0.7461    0.7458    0.7457     14139
weighted avg     0.7461    0.7458    0.7457     14139

Confusion matrix:
 [[5145 1925]
 [1669 5400]]
ROC-AUC: 0.8232


{'model': 'LogReg (Tuned)',
 'roc_auc': np.float64(0.8232188552404137),
 'f1_0': np.float64(0.7411408815903198),
 'f1_1': np.float64(0.7503126302626094)}