In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.preprocessing   import StandardScaler, OneHotEncoder
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline
from imblearn.over_sampling  import SMOTE
from imblearn.pipeline       import Pipeline as ImbPipeline
from xgboost                 import XGBClassifier
from sklearn.calibration     import CalibratedClassifierCV
from sklearn.metrics         import roc_auc_score, precision_recall_curve

In [3]:
# 1. Load the dataset and apply basic cleaning in a single chain:
#    - rows whose gender is "Other" are removed,
#    - gender is replaced by a 0/1 `is_male` indicator column.
df = (
    pd.read_csv("data/diabetes_prediction_dataset.csv")
    .loc[lambda frame: frame["gender"] != "Other"]
    .assign(is_male=lambda frame: (frame["gender"] == "Male").astype(int))
    .drop(columns="gender")
)

# 2. Feature groups, one list per preprocessing treatment
num_cols = ["age", "bmi", "HbA1c_level", "blood_glucose_level"]   # standardised
cat_cols = ["smoking_history"]                                    # one-hot encoded
bin_cols = ["is_male", "hypertension", "heart_disease"]           # passed through as-is

# Any column not listed in one of the groups above is discarded (remainder="drop").
preproc = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
        ("bin", "passthrough", bin_cols),
    ],
    remainder="drop",
)

In [4]:
# 3. Train-test split: hold out 20% of the rows, stratified on the label so
#    both partitions keep the same class proportions; the seed is fixed.
y = df["diabetes"]
X = df.drop(columns=["diabetes"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# Revision note carried over from the original cell: the earlier version had
# only the ("clf", xgb) step here; the code below is its replacement.

# Gradient-boosted classifier.
xgb = XGBClassifier(
    objective="binary:logistic",   # binary:logistic only!
    eval_metric="logloss",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    # scale_pos_weight is intentionally not set: SMOTE already balances the classes
    random_state=42,
    n_jobs=-1,
)

# Oversampler; sampling_strategy=1.0 requests a 1:1 minority-to-majority ratio.
smote = SMOTE(sampling_strategy=1.0, random_state=42)

# preprocessing -> oversampling -> classifier
imb_pipe = ImbPipeline(
    steps=[
        ("preprocessor", preproc),
        ("smote", smote),
        ("clf", xgb),   # the step name "clf" is acceptable
    ]
)


In [6]:
# 1. Wrap the SMOTE + XGBoost pipeline in a cross-validated probability
#    calibrator and fit it on the training partition only.
calibrated_clf = CalibratedClassifierCV(
    imb_pipe,
    method="isotonic",   # or "sigmoid"
    cv=5,
)
calibrated_clf.fit(X_train, y_train)

# Positive-class probabilities on the held-out test set. These are kept for
# evaluation only and are deliberately NOT used to tune the threshold below.
proba = calibrated_clf.predict_proba(X_test)[:, 1]

# 2. Choose the decision threshold.
#    The threshold is a tuned hyper-parameter, so it must not be selected on
#    X_test: doing so makes every test metric reported at that threshold
#    optimistically biased. Instead it is selected on out-of-fold predictions
#    for the training rows (each row is scored by a clone that never saw it).
#    cross_val_predict clones the estimator, so the fitted `calibrated_clf`
#    above is left untouched. Cost: five additional calibrated fits.
MIN_RECALL = 0.85   # lowest acceptable recall for the positive class

oof_proba = cross_val_predict(
    calibrated_clf, X_train, y_train, cv=5, method="predict_proba"
)[:, 1]

# NOTE: precision / recall / thr describe the out-of-fold training curve.
precision, recall, thr = precision_recall_curve(y_train, oof_proba)

# `thr` is one element shorter than `precision`/`recall`, hence recall[:-1].
# Among thresholds reaching MIN_RECALL, keep the one with the highest precision.
mask     = np.where(recall[:-1] >= MIN_RECALL)[0]
opt_idx  = mask[np.argmax(precision[mask])]
best_thr = thr[opt_idx]

# 3. Persist the model and the threshold ONCE.
#    Create the output directory first so a fresh checkout does not fail
#    with FileNotFoundError.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(calibrated_clf, "models/diabetes_xgb_calibrated.joblib")
np.save("models/opt_threshold.npy", np.array([best_thr]), allow_pickle=False)