In [132]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import StandardScaler, OneHotEncoder
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline
from imblearn.over_sampling  import SMOTE
from imblearn.pipeline       import Pipeline as ImbPipeline
from xgboost                 import XGBClassifier
from sklearn.calibration     import CalibratedClassifierCV
from sklearn.metrics         import roc_auc_score, precision_recall_curve

In [133]:
# 1. Data load & basic cleaning
df = pd.read_csv("data/diabetes_prediction_dataset.csv")
df = df[df.gender != "Other"].copy()
df["is_male"] = (df.gender == "Male").astype(int)
df.drop("gender", axis=1, inplace=True)

# 2. Feature groups
num_cols = ["age", "bmi", "HbA1c_level", "blood_glucose_level"]
cat_cols = ["smoking_history"]
bin_cols = ["is_male", "hypertension", "heart_disease"]

preproc = ColumnTransformer([
    ("num", StandardScaler(),                num_cols),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
    ("bin", "passthrough",                   bin_cols)
], remainder="drop")

In [134]:
# 3. Train-test split
X = df.drop("diabetes", axis=1)
y = df["diabetes"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [135]:
# 4. SMOTE + XGBoost
smote = SMOTE(sampling_strategy=1.0, random_state=42)

xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    n_jobs=-1,
    random_state=42
)

imb_pipe = ImbPipeline([
    ("preprocessor", preproc),
    ("smote",        smote),
    ("clf",          xgb)
])

In [136]:
# 5. Isotonic calibration (5-fold CV)
clf = CalibratedClassifierCV(imb_pipe, cv=5, method="isotonic")
clf.fit(X_train, y_train)

# 6. Evaluation
proba = clf.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, proba))


ROC-AUC: 0.9781757536593912


In [137]:
# 7. Choose threshold for recall ≥ 0.85 and best precision
precision, recall, thr = precision_recall_curve(y_test, proba)
mask = np.where(recall[:-1] >= 0.85)[0]
opt_idx = mask[np.argmax(precision[mask])]
best_thr = thr[opt_idx]
print(f"Chosen threshold = {best_thr:.4f}  |  P={precision[opt_idx]:.3f}  R={recall[opt_idx]:.3f}")

Chosen threshold = 0.1443  |  P=0.598  R=0.850


In [138]:
# 8. Save artefacts
joblib.dump(clf, "models/diabetes_xgb_calibrated.joblib")
np.save("models/opt_threshold.npy", np.array([best_thr]), allow_pickle=False)
print("Artifacts saved: diabetes_xgb_calibrated.joblib , opt_threshold.npy")

Artifacts saved: diabetes_xgb_calibrated.joblib , opt_threshold.npy
