## data loading

In [None]:
import pandas as pd
import numpy as np

# Load the poisoning dataset from Excel (a cleaned, wide-format albumin export dated 2025-11-06, judging by the file name).
df_albumin_clean = pd.read_excel('./Poisoning_Prediction/all_poisoning_data_wide_clean_albumin_20251106.xlsx')

In [2]:
# Categorical predictor columns. The order is significant: it is the column
# order used whenever the model matrix is assembled from this list.
features_categorical = [
    # Demographics and exposure type
    'Gender', 'Education Level', 'Type of Poisoning',
    # Medical history
    'Hypertension', 'Hyperlipidemia', 'Diabetes Mellitus',
    'Cerebrovascular Disease', 'Heart Disease', 'Allergy History', 'Cancer',
    # Poisoning status and lifestyle
    'Poisoning', 'degree of poisoning',
    'Smoking Status', 'Alcohol Consumption Status',
    # Symptoms
    'Shortness of Breath', 'Chest Pain', 'Cough', 'Pre-syncope',
    'Altered Consciousness or Syncope', 'Sore Throat', 'Fever', 'Fatigue',
    'Lower Limb Edema', 'Palpitations', 'Vomiting', 'Nausea', 'Weakness',
    'Headache',
    # Place of residence
    'Residence',
]

In [3]:
# Code -> English label for every categorical column. Most columns are plain
# Yes/No flags, so those are generated from name tuples; each flag still gets
# its own dict object. Key insertion order is the same as in the hand-written
# version of this table.
value_mappings_en = {
    "Gender": {1: "Male", 0: "Female"},
    "Education Level": {
        1: "Illiterate",
        2: "Primary School",
        3: "Junior High School",
        4: "Senior High School",
        5: "University Degree",
    },
    "Type of Poisoning": {
        1: "Industrial",
        2: "Pharmaceutical",
        3: "Pesticide",
        4: "Alcohol",
        0: "Uncertain",
    },
    # Medical-history flags
    **{
        flag: {1: "Yes", 0: "No"}
        for flag in (
            "Hypertension",
            "Hyperlipidemia",
            "Diabetes Mellitus",
            "Cerebrovascular Disease",
            "Heart Disease",
            "Allergy History",
            "Cancer",
            "Poisoning",
        )
    },
    "degree of poisoning": {0: "Undetermined", 1: "Low", 2: "Moderate", 3: "High"},
    # Lifestyle and symptom flags
    **{
        flag: {1: "Yes", 0: "No"}
        for flag in (
            "Smoking",
            "Alcohol Consumption Status",
            "Shortness of Breath",
            "Chest Pain",
            "Cough",
            "Pre-syncope",
            "Altered Mental Status or Syncope(AMS or Sync)",
            "Sore Throat",
            "Fever",
            "Fatigue",
            "Lower Limb Edema",
            "Palpitations",
            "Vomiting",
            "Nausea",
            "Weakness",
            "Headache",
        )
    },
    "Residence": {1: "Rural", 2: "Urban"},
    # Alternative spellings of two flags already listed above; both spellings are kept.
    **{
        flag: {1: "Yes", 0: "No"}
        for flag in ("Smoking Status", "Altered Consciousness or Syncope")
    },
}

In [None]:
## Inverse mapping (English label → numerical value)
# Work on a copy so the frame loaded from Excel stays untouched and this cell can be re-run.
df_mapped_wide = df_albumin_clean.copy()
for col in features_categorical:
    if col in value_mappings_en and col in df_mapped_wide.columns:
        inv_map = {v: k for k, v in value_mappings_en[col].items()}
        original_values = df_mapped_wide[col]
        df_mapped_wide[col] = original_values.map(inv_map)

        # Series.map yields NaN for every value that is not a key of inv_map
        # (typos, stray whitespace, already-numeric codes, ...). Report those
        # so that they are not mistaken for genuinely missing data; the mapped
        # column itself is left exactly as .map produced it.
        unmapped = original_values[original_values.notna() & df_mapped_wide[col].isna()]
        if not unmapped.empty:
            print(
                f"[warning] {col}: {len(unmapped)} value(s) not found in the mapping "
                f"were set to NaN: {sorted(unmapped.astype(str).unique())}"
            )

In [None]:
# Value counts (NaN counted as its own level) for every categorical column
# that is present in the mapped frame, keyed by column name.
category_distributions = {}

for col in features_categorical:
    if col not in df_mapped_wide.columns:
        continue
    counts = df_mapped_wide[col].value_counts(dropna=False)
    print(counts)
    category_distributions[col] = counts

In [None]:
# Class distribution of the two outcome columns, Outcome_other and Outcome
# (dropna=False so that missing outcomes are counted as well).
print(df_mapped_wide["Outcome_other"].value_counts(dropna=False))
print(df_mapped_wide["Outcome"].value_counts(dropna=False))


In [None]:
## Calculate missing proportions for continuous variables

# Continuous predictor columns (order matters: it is the column order of the
# model matrix). The grouping comments are only a reading aid.
features_continuous = [
    # Demographics, stay and vital signs
    'Age', 'Length of Stay', 'Weight',
    'Systolic Blood Pressure', 'Diastolic Blood Pressure',
    'Respiratory Rate', 'Heart Rate',
    # Blood count
    'White Blood Cell Count', 'Red Blood Cell Count', 'Hemoglobin Concentration',
    'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin',
    'Mean Corpuscular Hemoglobin Concentration',
    'Platelet Count', 'Mean Platelet Volume',
    # Biochemistry
    'Alanine Aminotransferase (ALT)', 'Total Bilirubin', 'Direct Bilirubin',
    'Lactate Dehydrogenase (LDH)', 'Urea', 'Serum Creatinine', 'Uric Acid',
    'Creatine Kinase (CK)', 'Creatine Kinase-MB Isoenzyme', 'Troponin I',
    'High-Sensitivity C-Reactive Protein (hs-CRP)', 'Homocysteine',
    # Electrolytes, coagulation and other laboratory values
    'Potassium', 'Sodium', 'Chloride', 'Carbon Dioxide',
    'Prothrombin Time', 'D-Dimer', 'Lactate',
    'Blood Cholinesterase Test Results',
    'Albumin (First Measurement)', 'Albumin (Last Measurement)',
    # Treatments
    'Number of Hemoperfusion Sessions', 'Number of Blood Purification Sessions',
    'Hyperbaric Oxygen Therapy Duration and Frequency',
    'Atropine Dosage', 'Long-acting Nitroglycerin Dosage', 'Pralidoxime Dosage',
]

# Per-column fraction of missing entries among the continuous features, then
# the same numbers as percentages (2 decimals), largest first.
missing_ratios = df_mapped_wide[features_continuous].isna().mean()
missing_summary = missing_ratios.mul(100).round(2).sort_values(ascending=False)
print(missing_summary)

In [None]:
# Continuous features whose missing rate is above 90%
high_missing_features = [name for name, ratio in missing_ratios.items() if ratio > 0.90]
for feat in high_missing_features:
    print(f"{feat}: {missing_ratios[feat]*100:.2f}%")

In [None]:

# Remove the high-missing continuous features from the feature list and from the frame.
print(len(features_continuous))
print(len(high_missing_features))
features_continuous = [feat for feat in features_continuous if feat not in high_missing_features]
print(len(features_continuous))

# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df_mapped_wide = df_mapped_wide.drop(columns=high_missing_features, errors="ignore")
print(df_mapped_wide.shape)

print('number of features：',len(features_categorical + features_continuous))

##### 5-fold cross-validation: within each fold, 1/8 of the training portion is held out as a validation set (i.e. 70% training set, 10% validation set, 20% test set)

## CatBoost

- Use the new fixed random seed together with the previously selected optimal hyperparameters (the same hyperparameters that were used for the mortality-prediction model)

In [None]:
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from catboost import CatBoostClassifier, Pool
import os

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times using a
    ``RandomState`` seeded with ``seed``. A resample whose labels hold fewer
    than two distinct values is skipped (metrics such as ROC AUC need both
    classes), so fewer than ``n_bootstrap`` scores may be collected.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predictions or scores, aligned with ``y_true``.
    metric_fn : callable
        Called as ``metric_fn(y_true_sample, y_pred_sample)``; returns a number.
    n_bootstrap : int
        Number of resampling rounds.
    seed : int
        Seed of the resampling generator.

    Returns
    -------
    tuple
        ``(mean, 2.5th percentile, 97.5th percentile)`` of the collected
        scores, or ``(nan, nan, nan)`` when the input is empty or no resample
        contained two distinct labels.
    """
    rng = np.random.RandomState(seed)
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    n_samples = len(y_true)

    # randint(0, 0, ...) would raise, so handle empty input explicitly.
    if n_samples == 0:
        return np.nan, np.nan, np.nan

    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    # Every resample was single-class: do not hand an empty list to
    # np.mean / np.percentile, report the result as undefined instead.
    if not scores:
        return np.nan, np.nan, np.nan

    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper

def train_catboost_5fold_cv_fixed(
    dataX,
    dataY,
    cat_features=None,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery/catboost_fixed_valid_test_5cv/',
    seed=42,
    early_stopping_rounds=30,
    params=None
):
    """Evaluate a fixed-hyperparameter CatBoost classifier with 5-fold cross-validation.

    Each outer fold of a shuffled ``KFold`` supplies the test split. The
    remaining rows are divided by a stratified ``train_test_split``
    (``test_size=1/8``) into a training split and a validation split; the
    validation split is passed to CatBoost as ``eval_set`` for early stopping.
    The positive class is re-weighted per fold with
    ``scale_pos_weight = #negatives / #positives`` of the training split.

    Test-split probabilities are written to ``fold_<k>_results.csv`` and,
    pooled over the folds, to ``all_folds_results.csv`` inside a sub-directory
    of ``save_path`` named after the hyperparameters. AUROC and AUPRC with
    bootstrap 95% intervals are computed on the pooled predictions.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Feature matrix; it is copied before any column is modified.
    dataY : array-like
        Binary labels (``1`` is counted as positive, ``0`` as negative).
    cat_features : list of str, optional
        Columns converted to strings and declared categorical to CatBoost.
    save_path : str
        Root output directory (created if needed).
    seed : int
        Seed shared by the outer KFold, the inner split and CatBoost.
    early_stopping_rounds : int
        Passed to ``CatBoostClassifier.fit``.
    params : dict, optional
        Extra ``CatBoostClassifier`` keyword arguments. ``None`` means
        ``{'depth': 5, 'iterations': 200, 'learning_rate': 0.05}``.

    Returns
    -------
    dict
        ``params``, ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``, ``AUPRC_CI``
        and ``AllResults`` (pooled per-row test predictions).
    """
    if params is None:
        # Built per call: the dict is handed back in the result, so a shared
        # default object could be modified by a caller and leak into later calls.
        params = {'depth': 5, 'iterations': 200, 'learning_rate': 0.05}

    os.makedirs(save_path, exist_ok=True)
    dataX = dataX.copy()
    if cat_features is not None:
        for c in cat_features:
            # NOTE(review): on pandas versions where astype(str) renders NaN as
            # the string "nan", the fillna below never fires and missing
            # categories are encoded as "nan" rather than "missing". Left
            # unchanged so the category encoding of existing results/models is
            # preserved - confirm which encoding is intended.
            dataX[c] = dataX[c].astype(str).fillna("missing")

    X = dataX
    y = np.array(dataY)

    # One output folder per hyperparameter set, e.g. "depth_5_iterations_200_learning_rate_0.05".
    param_name = "_".join([f"{k}_{v}" for k, v in params.items()])
    param_path = os.path.join(save_path, param_name)
    os.makedirs(param_path, exist_ok=True)

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        # Step1: train_val / test
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Step2: train / val (1/8 of the 80% train_val part = 10% of all rows)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        print(f"Seed {seed}, Fold={fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")
        print(f"  Train - Pos: {np.sum(y_train == 1)}, Neg: {np.sum(y_train == 0)}")
        print(f"  Val   - Pos: {np.sum(y_val == 1)},   Neg: {np.sum(y_val == 0)}")
        print(f"  Test  - Pos: {np.sum(y_test == 1)},  Neg: {np.sum(y_test == 0)}")

        # Class weight from the training split only; max(..., 1) avoids division by zero.
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos, 1)

        # Pool
        train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
        val_pool = Pool(X_val, label=y_val, cat_features=cat_features)
        test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

        model = CatBoostClassifier(
            **params,
            loss_function="Logloss",
            eval_metric="AUC",
            scale_pos_weight=scale_pos_weight,
            random_seed=seed,
            verbose=False
        )

        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=early_stopping_rounds, verbose=False)

        y_pred_prob = model.predict_proba(test_pool)[:, 1]

        # Build the per-fold table once; it is both saved and pooled.
        fold_results = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_csv = os.path.join(param_path, f"fold_{fold_idx}_results.csv")
        fold_results.to_csv(fold_csv, index=False)
        all_results.append(fold_results)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(param_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    return {
        "params": params,
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df
    }


In [None]:
# Evaluate CatBoost on every categorical + continuous predictor, with 'Outcome' as the label.
dataY = df_mapped_wide['Outcome']
dataX = df_mapped_wide[features_categorical + features_continuous]
results = train_catboost_5fold_cv_fixed(
    dataX=dataX,
    dataY=dataY,
    cat_features=features_categorical,
)
print(results)

In [None]:
### Save the optimal model 
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from catboost import CatBoostClassifier, Pool
import os

def find_best_threshold_by_youden(y_true, y_pred_prob):
    """Pick the ROC cut-off that maximises Youden's J (sensitivity + specificity - 1).

    Returns a 4-tuple ``(threshold, J, sensitivity, specificity)`` evaluated
    at the selected point of ``metrics.roc_curve``; ``np.argmax`` takes the
    first maximum when several points tie.
    """
    false_pos_rate, true_pos_rate, cutoffs = metrics.roc_curve(y_true, y_pred_prob)
    true_neg_rate = 1 - false_pos_rate
    j_statistic = true_pos_rate + true_neg_rate - 1
    pick = np.argmax(j_statistic)
    return cutoffs[pick], j_statistic[pick], true_pos_rate[pick], true_neg_rate[pick]

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times using a
    ``RandomState`` seeded with ``seed``. A resample whose labels hold fewer
    than two distinct values is skipped (metrics such as ROC AUC need both
    classes), so fewer than ``n_bootstrap`` scores may be collected.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predictions or scores, aligned with ``y_true``.
    metric_fn : callable
        Called as ``metric_fn(y_true_sample, y_pred_sample)``; returns a number.
    n_bootstrap : int
        Number of resampling rounds.
    seed : int
        Seed of the resampling generator.

    Returns
    -------
    tuple
        ``(mean, 2.5th percentile, 97.5th percentile)`` of the collected
        scores, or ``(nan, nan, nan)`` when the input is empty or no resample
        contained two distinct labels.
    """
    rng = np.random.RandomState(seed)
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    n_samples = len(y_true)

    # randint(0, 0, ...) would raise, so handle empty input explicitly.
    if n_samples == 0:
        return np.nan, np.nan, np.nan

    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    # Every resample was single-class: do not hand an empty list to
    # np.mean / np.percentile, report the result as undefined instead.
    if not scores:
        return np.nan, np.nan, np.nan

    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper

def train_catboost_5fold_cv_best_save(
    dataX,
    dataY,
    cat_features=None,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery_best_model/catboost/',
    seed=42,
    early_stopping_rounds=30,
    params=None
):
    """Run 5-fold CatBoost cross-validation, save every fold model and keep the best one.

    Splitting and class weighting per fold: a shuffled ``KFold`` supplies the
    test split, a stratified ``train_test_split`` (``test_size=1/8``) carves a
    validation split out of the rest, and ``scale_pos_weight`` is
    ``#negatives / #positives`` of the training split. The validation split
    serves as ``eval_set`` for early stopping.

    Every fold model is saved as ``best_model_fold_<k>.cbm``; the fold with the
    highest validation AUC is copied to ``best_overall_model.cbm``. Test-split
    probabilities are saved per fold and pooled (``all_folds_results.csv``).
    AUROC/AUPRC with bootstrap 95% intervals and the Youden-optimal threshold
    are computed on, and printed for, the pooled test predictions.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Feature matrix; it is copied before any column is modified.
    dataY : array-like
        Binary labels (``1`` is counted as positive, ``0`` as negative).
    cat_features : list of str, optional
        Columns converted to strings and declared categorical to CatBoost.
    save_path : str
        Root output directory (created if needed).
    seed : int
        Seed shared by the outer KFold, the inner split and CatBoost.
    early_stopping_rounds : int
        Passed to ``CatBoostClassifier.fit``.
    params : dict, optional
        Extra ``CatBoostClassifier`` keyword arguments. ``None`` means
        ``{'depth': 5, 'iterations': 200, 'learning_rate': 0.05}``.

    Returns
    -------
    dict
        ``params``, ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``, ``AUPRC_CI``,
        ``AllResults`` (pooled test predictions) and ``BestModelPath``.
    """
    import shutil  # local import: this cell's import block does not provide shutil

    if params is None:
        # Built per call: the dict is handed back in the result, so a shared
        # default object could be modified by a caller and leak into later calls.
        params = {'depth': 5, 'iterations': 200, 'learning_rate': 0.05}

    os.makedirs(save_path, exist_ok=True)
    dataX = dataX.copy()
    if cat_features is not None:
        for c in cat_features:
            # NOTE(review): on pandas versions where astype(str) renders NaN as
            # the string "nan", the fillna below never fires and missing
            # categories are encoded as "nan" rather than "missing". Left
            # unchanged so the encoding expected by already-saved models is
            # preserved - confirm which encoding is intended.
            dataX[c] = dataX[c].astype(str).fillna("missing")

    X = dataX
    y = np.array(dataY)

    # One output folder per hyperparameter set.
    param_name = "_".join([f"{k}_{v}" for k, v in params.items()])
    param_path = os.path.join(save_path, param_name)
    os.makedirs(param_path, exist_ok=True)

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    # Track the fold whose model scores the highest validation AUC.
    best_fold_auc = -np.inf
    best_fold_model_path = None
    best_fold_idx = None

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):

        # Step1: train_val / test
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Step2: train / val
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        print(f"Seed {seed}, Fold={fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")

        # Class weight from the training split only; max(..., 1) avoids division by zero.
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos, 1)

        # Pool
        train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
        val_pool = Pool(X_val, label=y_val, cat_features=cat_features)
        test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

        model = CatBoostClassifier(
            **params,
            loss_function="Logloss",
            eval_metric="AUC",
            scale_pos_weight=scale_pos_weight,
            random_seed=seed,
            verbose=False
        )

        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=early_stopping_rounds)

        # ---- AUC output: each prediction is computed once and reused below ----
        val_pred = model.predict_proba(val_pool)[:, 1]
        fold_val_auc = metrics.roc_auc_score(y_val, val_pred)
        print(f"Fold {fold_idx} - Validation AUC = {fold_val_auc:.4f}")

        y_pred_prob = model.predict_proba(test_pool)[:, 1]
        fold_test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
        print(f"Fold {fold_idx} - Test AUC = {fold_test_auc:.4f}")

        # ---- Save this fold's model ----
        fold_model_path = os.path.join(param_path, f"best_model_fold_{fold_idx}.cbm")
        model.save_model(fold_model_path)
        print(f"Fold {fold_idx} Optimal model saved: {fold_model_path}")

        # Strict ">" keeps the earliest fold when validation AUCs tie.
        if fold_val_auc > best_fold_auc:
            best_fold_auc = fold_val_auc
            best_fold_model_path = fold_model_path
            best_fold_idx = fold_idx

        # Build the per-fold table once; it is both saved and pooled.
        fold_results = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_csv = os.path.join(param_path, f"fold_{fold_idx}_results.csv")
        fold_results.to_csv(fold_csv, index=False)
        all_results.append(fold_results)

    # ---- Copy the best fold model next to the per-fold models ----
    best_overall_path = os.path.join(param_path, "best_overall_model.cbm")
    if best_fold_model_path is not None:
        shutil.copy(best_fold_model_path, best_overall_path)
        print(
            f"\n===== Five-fold optimal model: Fold {best_fold_idx}, "
            f"Validation AUC={best_fold_auc:.4f} ====="
        )

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(param_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    # The threshold is chosen on the pooled test predictions themselves.
    best_thresh, best_youden, best_sen, best_spec = find_best_threshold_by_youden(
        all_results_df["y_test"],
        all_results_df["y_pred"]
    )

    print(f"optimal threshold (cut-off) = {best_thresh:.4f}")
    print(f"Youden Index = {best_youden:.4f}")
    print(f"Sensitivity  = {best_sen:.4f}")
    print(f"Specificity  = {best_spec:.4f}")

    return {
        "params": params,
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df,
        "BestModelPath": best_overall_path
    }


In [14]:
# Class balance of the prediction target (value_counts leaves NaN out by default).
df_mapped_wide['Outcome'].value_counts()

Outcome
0    731
1    240
Name: count, dtype: int64

In [15]:
# Model matrix: all categorical columns followed by the retained continuous columns.
dataX = df_mapped_wide.loc[:, features_categorical + features_continuous]

In [None]:
# Train/save the CatBoost models without 'Residence' and 'Hypertension'; the
# categorical list is narrowed to the columns that remain in the matrix.
dataY = df_mapped_wide['Outcome']
dataX = df_mapped_wide[features_categorical + features_continuous].drop(
    columns=['Residence','Hypertension']
)
cat_features = [c for c in features_categorical if c in dataX.columns]
results = train_catboost_5fold_cv_best_save(
    dataX,
    dataY,
    cat_features=cat_features,
)

In [None]:
### Save the optimal model (Probabilistic Calibration Model) ###
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from catboost import CatBoostClassifier, Pool
import os
import joblib
from sklearn.calibration import CalibratedClassifierCV
import shutil

def find_best_threshold_by_youden(y_true, y_pred_prob):
    """Return the ROC operating point with the largest Youden index.

    The Youden index is ``sensitivity + specificity - 1``. The result is the
    tuple ``(threshold, youden_index, sensitivity, specificity)`` taken at the
    first maximising point of ``metrics.roc_curve``.
    """
    fpr, sensitivity, candidate_thresholds = metrics.roc_curve(y_true, y_pred_prob)
    spec = 1 - fpr
    youden = sensitivity + spec - 1
    top = np.argmax(youden)
    return candidate_thresholds[top], youden[top], sensitivity[top], spec[top]

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times using a
    ``RandomState`` seeded with ``seed``. A resample whose labels hold fewer
    than two distinct values is skipped (metrics such as ROC AUC need both
    classes), so fewer than ``n_bootstrap`` scores may be collected.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predictions or scores, aligned with ``y_true``.
    metric_fn : callable
        Called as ``metric_fn(y_true_sample, y_pred_sample)``; returns a number.
    n_bootstrap : int
        Number of resampling rounds.
    seed : int
        Seed of the resampling generator.

    Returns
    -------
    tuple
        ``(mean, 2.5th percentile, 97.5th percentile)`` of the collected
        scores, or ``(nan, nan, nan)`` when the input is empty or no resample
        contained two distinct labels.
    """
    rng = np.random.RandomState(seed)
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    n_samples = len(y_true)

    # randint(0, 0, ...) would raise, so handle empty input explicitly.
    if n_samples == 0:
        return np.nan, np.nan, np.nan

    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    # Every resample was single-class: do not hand an empty list to
    # np.mean / np.percentile, report the result as undefined instead.
    if not scores:
        return np.nan, np.nan, np.nan

    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper

def train_catboost_5fold_cv_best_save(
    dataX,
    dataY,
    cat_features=None,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery_best_model/catboost_calibration/',
    seed=42,
    early_stopping_rounds=30,
    params=None
):
    """Run 5-fold CatBoost cross-validation with sigmoid calibration and save the models.

    NOTE: this definition reuses the name of the uncalibrated function defined
    earlier in the notebook and replaces it from this cell onwards.

    Per fold: a shuffled ``KFold`` supplies the test split, a stratified
    ``train_test_split`` (``test_size=1/8``) carves a validation split out of
    the rest, and ``scale_pos_weight`` is ``#negatives / #positives`` of the
    training split. The CatBoost model is fitted with the validation split as
    ``eval_set``; a prefit ``CalibratedClassifierCV`` (sigmoid) is then fitted
    on that same validation split, and all reported probabilities come from
    the calibrated model.

    Every calibrated fold model is dumped with joblib to
    ``best_model_fold_<k>.pkl``; the fold with the highest validation AUC is
    copied to ``best_overall_model.pkl``. Test-split probabilities are saved
    per fold and pooled; AUROC/AUPRC with bootstrap 95% intervals and the
    Youden-optimal threshold are computed on the pooled test predictions.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Feature matrix; it is copied before any column is modified.
    dataY : array-like
        Binary labels (``1`` is counted as positive, ``0`` as negative).
    cat_features : list of str, optional
        Columns converted to strings and declared categorical to CatBoost.
    save_path : str
        Root output directory (created if needed).
    seed : int
        Seed shared by the outer KFold, the inner split and CatBoost.
    early_stopping_rounds : int
        Passed to ``CatBoostClassifier.fit``.
    params : dict, optional
        Extra ``CatBoostClassifier`` keyword arguments. ``None`` means
        ``{'depth': 5, 'iterations': 200, 'learning_rate': 0.05}``.

    Returns
    -------
    dict
        ``params``, ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``, ``AUPRC_CI``,
        ``AllResults`` (pooled test predictions) and ``BestModelPath``.
    """
    if params is None:
        # Built per call: the dict is handed back in the result, so a shared
        # default object could be modified by a caller and leak into later calls.
        params = {'depth': 5, 'iterations': 200, 'learning_rate': 0.05}

    os.makedirs(save_path, exist_ok=True)
    dataX = dataX.copy()
    if cat_features is not None:
        for c in cat_features:
            # NOTE(review): on pandas versions where astype(str) renders NaN as
            # the string "nan", the fillna below never fires and missing
            # categories are encoded as "nan" rather than "missing". Left
            # unchanged so the encoding expected by already-saved models is
            # preserved - confirm which encoding is intended.
            dataX[c] = dataX[c].astype(str).fillna("missing")
    X = dataX
    y = np.array(dataY)

    # One output folder per hyperparameter set.
    param_name = "_".join([f"{k}_{v}" for k, v in params.items()])
    param_path = os.path.join(save_path, param_name)
    os.makedirs(param_path, exist_ok=True)
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    # Track the fold whose calibrated model scores the highest validation AUC.
    best_fold_auc = -np.inf
    best_fold_model_path = None
    best_fold_idx = None

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        # Step1: train_val / test
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Step2: train / val
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        print(f"Seed {seed}, Fold={fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")

        # Class weight from the training split only; max(..., 1) avoids division by zero.
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos, 1)

        # Pools are only needed for fitting; prediction goes through the
        # calibrated wrapper, which receives the raw frames.
        train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
        val_pool = Pool(X_val, label=y_val, cat_features=cat_features)

        cat_model = CatBoostClassifier(
            **params,
            loss_function="Logloss",
            eval_metric="AUC",
            scale_pos_weight=scale_pos_weight,
            random_seed=seed,
            verbose=False
        )
        cat_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=early_stopping_rounds, verbose=False)

        # cv='prefit': the wrapper only learns the sigmoid mapping on
        # (X_val, y_val); the CatBoost model itself is not refitted.
        calib_model = CalibratedClassifierCV(
            estimator=cat_model,
            method='sigmoid',
            cv='prefit'
        )
        calib_model.fit(X_val, y_val)

        y_pred_prob = calib_model.predict_proba(X_test)[:, 1]

        # Validation AUC (measured on the split the calibrator was fitted on)
        val_pred = calib_model.predict_proba(X_val)[:, 1]
        fold_val_auc = metrics.roc_auc_score(y_val, val_pred)
        fold_test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
        print(f"Fold {fold_idx} - Validation AUC = {fold_val_auc:.4f}")
        print(f"Fold {fold_idx} - Test AUC       = {fold_test_auc:.4f}")

        # ---- Save the calibrated model of this fold ----
        fold_model_path = os.path.join(param_path, f"best_model_fold_{fold_idx}.pkl")
        joblib.dump(calib_model, fold_model_path)
        print(f"Fold {fold_idx} Calibration model saved: {fold_model_path}")

        # Strict ">" keeps the earliest fold when validation AUCs tie.
        if fold_val_auc > best_fold_auc:
            best_fold_auc = fold_val_auc
            best_fold_model_path = fold_model_path
            best_fold_idx = fold_idx

        # Build the per-fold table once; it is both saved and pooled.
        fold_results = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_csv = os.path.join(param_path, f"fold_{fold_idx}_results.csv")
        fold_results.to_csv(fold_csv, index=False)
        all_results.append(fold_results)

    best_overall_path = os.path.join(param_path, "best_overall_model.pkl")
    if best_fold_model_path is not None:
        shutil.copy(best_fold_model_path, best_overall_path)
        print(f"Model saved to: {best_overall_path}")

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(param_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    # The threshold is chosen on the pooled test predictions themselves.
    best_thresh, best_youden, best_sen, best_spec = find_best_threshold_by_youden(
        all_results_df["y_test"],
        all_results_df["y_pred"]
    )

    print(f"optimal threshold (cut-off) = {best_thresh:.4f}")
    print(f"Youden Index = {best_youden:.4f}")
    print(f"Sensitivity  = {best_sen:.4f}")
    print(f"Specificity  = {best_spec:.4f}")

    return {
        "params": params,
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df,
        "BestModelPath": best_overall_path
    }


In [None]:

# Train/save the calibrated CatBoost models without 'Residence' and
# 'Hypertension'; the categorical list is narrowed to the remaining columns.
dataY = df_mapped_wide['Outcome']
dataX = df_mapped_wide[features_categorical + features_continuous].drop(
    columns=['Residence','Hypertension']
)
cat_features = [c for c in features_categorical if c in dataX.columns]
results = train_catboost_5fold_cv_best_save(
    dataX,
    dataY,
    cat_features=cat_features,
)

In [None]:
### Probabilistic calibrator using sklearn ###
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from catboost import CatBoostClassifier, Pool
import os

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times using a
    ``RandomState`` seeded with ``seed``. A resample whose labels hold fewer
    than two distinct values is skipped (metrics such as ROC AUC need both
    classes), so fewer than ``n_bootstrap`` scores may be collected.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predictions or scores, aligned with ``y_true``.
    metric_fn : callable
        Called as ``metric_fn(y_true_sample, y_pred_sample)``; returns a number.
    n_bootstrap : int
        Number of resampling rounds.
    seed : int
        Seed of the resampling generator.

    Returns
    -------
    tuple
        ``(mean, 2.5th percentile, 97.5th percentile)`` of the collected
        scores, or ``(nan, nan, nan)`` when the input is empty or no resample
        contained two distinct labels.
    """
    rng = np.random.RandomState(seed)
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    n_samples = len(y_true)

    # randint(0, 0, ...) would raise, so handle empty input explicitly.
    if n_samples == 0:
        return np.nan, np.nan, np.nan

    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    # Every resample was single-class: do not hand an empty list to
    # np.mean / np.percentile, report the result as undefined instead.
    if not scores:
        return np.nan, np.nan, np.nan

    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper

def train_catboost_5fold_cv_calibration(
    dataX,
    dataY,
    cat_features=None,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery_calibration/catboost/',
    seed=42,
    early_stopping_rounds=30,
    params=None
):
    """Evaluate a sigmoid-calibrated CatBoost classifier with 5-fold cross-validation.

    Per fold: a shuffled ``KFold`` supplies the test split, a stratified
    ``train_test_split`` (``test_size=1/8``) carves a validation split out of
    the rest, and ``scale_pos_weight`` is ``#negatives / #positives`` of the
    training split. The CatBoost model is fitted with the validation split as
    ``eval_set``; a prefit ``CalibratedClassifierCV`` (sigmoid) is then fitted
    on that same validation split and used to score the test split.

    Calibrated test probabilities are written to ``fold_<k>_results.csv`` and,
    pooled, to ``all_folds_results.csv`` inside a sub-directory of
    ``save_path`` named after the hyperparameters. AUROC and AUPRC with
    bootstrap 95% intervals are computed on the pooled predictions. No model
    file is written by this function.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Feature matrix; it is copied before any column is modified.
    dataY : array-like
        Binary labels (``1`` is counted as positive, ``0`` as negative).
    cat_features : list of str, optional
        Columns converted to strings and declared categorical to CatBoost.
    save_path : str
        Root output directory (created if needed).
    seed : int
        Seed shared by the outer KFold, the inner split and CatBoost.
    early_stopping_rounds : int
        Passed to ``CatBoostClassifier.fit``.
    params : dict, optional
        Extra ``CatBoostClassifier`` keyword arguments. ``None`` means
        ``{'depth': 5, 'iterations': 200, 'learning_rate': 0.05}``.

    Returns
    -------
    dict
        ``params``, ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``, ``AUPRC_CI``
        and ``AllResults`` (pooled per-row test predictions).
    """
    # Imported once per call, not once per fold; kept local because this
    # cell's import block does not provide it.
    from sklearn.calibration import CalibratedClassifierCV

    if params is None:
        # Built per call: the dict is handed back in the result, so a shared
        # default object could be modified by a caller and leak into later calls.
        params = {'depth': 5, 'iterations': 200, 'learning_rate': 0.05}

    os.makedirs(save_path, exist_ok=True)
    dataX = dataX.copy()
    if cat_features is not None:
        for c in cat_features:
            # NOTE(review): on pandas versions where astype(str) renders NaN as
            # the string "nan", the fillna below never fires and missing
            # categories are encoded as "nan" rather than "missing". Left
            # unchanged so previously reported results stay reproducible -
            # confirm which encoding is intended.
            dataX[c] = dataX[c].astype(str).fillna("missing")

    X = dataX
    y = np.array(dataY)

    # One output folder per hyperparameter set.
    param_name = "_".join([f"{k}_{v}" for k, v in params.items()])
    param_path = os.path.join(save_path, param_name)
    os.makedirs(param_path, exist_ok=True)

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        # Step1: train_val / test
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Step2: train / val
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        print(f"Seed {seed}, Fold={fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")
        print(f"  Train - Pos: {np.sum(y_train == 1)}, Neg: {np.sum(y_train == 0)}")
        print(f"  Val   - Pos: {np.sum(y_val == 1)},   Neg: {np.sum(y_val == 0)}")
        print(f"  Test  - Pos: {np.sum(y_test == 1)},  Neg: {np.sum(y_test == 0)}")

        # Class weight from the training split only; max(..., 1) avoids division by zero.
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos, 1)
        print('scale_pos_weight:',scale_pos_weight)

        # Pools are only needed for fitting; prediction goes through the
        # calibrated wrapper, which receives the raw frames.
        train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
        val_pool = Pool(X_val, label=y_val, cat_features=cat_features)

        cat_model = CatBoostClassifier(
            **params,
            loss_function="Logloss",
            eval_metric="AUC",
            scale_pos_weight=scale_pos_weight,
            random_seed=seed,
            verbose=False
        )
        cat_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=early_stopping_rounds, verbose=False)

        # cv='prefit': the wrapper only learns the sigmoid mapping on
        # (X_val, y_val); the CatBoost model itself is not refitted.
        calib_model = CalibratedClassifierCV(
            estimator=cat_model,
            method='sigmoid',
            cv='prefit'
        )
        calib_model.fit(X_val, y_val)
        y_pred_prob = calib_model.predict_proba(X_test)[:, 1]

        # Build the per-fold table once; it is both saved and pooled.
        fold_results = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_csv = os.path.join(param_path, f"fold_{fold_idx}_results.csv")
        fold_results.to_csv(fold_csv, index=False)
        all_results.append(fold_results)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(param_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    return {
        "params": params,
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df
    }


In [None]:
# Evaluate the calibrated CatBoost pipeline on the full feature set.
# NOTE(review): unlike the two "best_save" runs, 'Residence' and 'Hypertension'
# are not dropped here - confirm that this is intended.
dataY = df_mapped_wide['Outcome']
dataX = df_mapped_wide[features_categorical + features_continuous]
results = train_catboost_5fold_cv_calibration(
    dataX,
    dataY,
    cat_features=features_categorical,
)

## XGBoost

Use the new fixed random seed together with the previously selected optimal hyperparameters (the same hyperparameters that were used for the mortality-prediction model)


In [None]:
import numpy as np  
import pandas as pd 
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
import xgboost as xgb
import os

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times using a
    ``RandomState`` seeded with ``seed``. A resample whose labels hold fewer
    than two distinct values is skipped (metrics such as ROC AUC need both
    classes), so fewer than ``n_bootstrap`` scores may be collected.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predictions or scores, aligned with ``y_true``.
    metric_fn : callable
        Called as ``metric_fn(y_true_sample, y_pred_sample)``; returns a number.
    n_bootstrap : int
        Number of resampling rounds.
    seed : int
        Seed of the resampling generator.

    Returns
    -------
    tuple
        ``(mean, 2.5th percentile, 97.5th percentile)`` of the collected
        scores, or ``(nan, nan, nan)`` when the input is empty or no resample
        contained two distinct labels.
    """
    rng = np.random.RandomState(seed)
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    n_samples = len(y_true)

    # randint(0, 0, ...) would raise, so handle empty input explicitly.
    if n_samples == 0:
        return np.nan, np.nan, np.nan

    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    # Every resample was single-class: do not hand an empty list to
    # np.mean / np.percentile, report the result as undefined instead.
    if not scores:
        return np.nan, np.nan, np.nan

    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper

def train_xgboost_5fold_cv_fixed(
    dataX,
    dataY,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery/xgboost_fixed_valid_test_5cv/',
    seed=42,
    early_stopping_rounds=30,
    params={'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}
):
    """Evaluate a fixed-hyperparameter XGBoost booster with 5-fold cross-validation.

    Each shuffled KFold iteration holds out one fold as the test set and
    splits the remaining rows into train (7/8) and validation (1/8,
    stratified). The booster is trained on the train split with early
    stopping monitored on the validation AUC, then scored on the test fold.
    Per-fold and pooled predictions are written as CSV files under
    ``save_path/<param string>/``.

    Args:
        dataX: Feature DataFrame, row-aligned with ``dataY``.
        dataY: Labels; positives are counted as ``== 1``, negatives as ``== 0``.
        save_path: Root output directory (created if missing).
        seed: Seed for KFold, the train/validation split and XGBoost.
        early_stopping_rounds: Forwarded to ``xgb.train``.
        params: Must contain ``learning_rate`` and ``max_depth``;
            ``n_estimators`` (default 200) is used as the boosting-round cap.

    Returns:
        dict with ``params``, ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``,
        ``AUPRC_CI`` (bootstrap estimates over the pooled test predictions)
        and ``AllResults`` (pooled per-sample predictions).
    """
    # Private copy: the default dict is shared across calls and is handed
    # back to the caller in the result, so it must not be the same object.
    params = dict(params)

    os.makedirs(save_path, exist_ok=True)
    X = dataX.copy()
    y = np.array(dataY)

    param_name = "_".join([f"{k}_{v}" for k, v in params.items()])
    param_path = os.path.join(save_path, param_name)
    os.makedirs(param_path, exist_ok=True)
    print(f"\n===== fixed params: {params} =====")

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        # Step1: train_val / test
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Step2: train / val
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        print(f"Seed {seed}, Fold={fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")
        print(f"  Train - Pos: {np.sum(y_train == 1)}, Neg: {np.sum(y_train == 0)}")
        print(f"  Val   - Pos: {np.sum(y_val == 1)},   Neg: {np.sum(y_val == 0)}")
        print(f"  Test  - Pos: {np.sum(y_test == 1)},  Neg: {np.sum(y_test == 0)}")

        # Re-weight positives by the negative/positive ratio of the training split.
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos, 1)

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval   = xgb.DMatrix(X_val, label=y_val)
        dtest  = xgb.DMatrix(X_test, label=y_test)

        xgb_params = {
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "scale_pos_weight": scale_pos_weight,
            "seed": seed,
            "verbosity": 0,
            "learning_rate": params["learning_rate"],
            "max_depth": params["max_depth"]
        }

        # early stopping
        evals = [(dval, "validation")]
        model = xgb.train(
            xgb_params,
            dtrain,
            num_boost_round=params.get("n_estimators", 200),
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )

        # The native Booster.predict scores with every trained tree by
        # default, including the rounds added after the best validation
        # score. Restrict prediction to the best iteration so that early
        # stopping actually takes effect on the test predictions.
        best_iteration = getattr(model, "best_iteration", None)
        if best_iteration is not None:
            y_pred_prob = model.predict(dtest, iteration_range=(0, best_iteration + 1))
        else:
            y_pred_prob = model.predict(dtest)

        fold_df = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_df.to_csv(os.path.join(param_path, f"fold_{fold_idx}_results.csv"), index=False)
        all_results.append(fold_df)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(param_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    # Metrics are bootstrapped over the pooled out-of-fold predictions.
    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    return {
        "params": params,
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df
    }


In [None]:
# Assemble the model inputs: all categorical + continuous feature columns,
# with the 'Outcome' column as the label vector.
dataX = df_mapped_wide[features_categorical + features_continuous]
dataY = df_mapped_wide['Outcome'] 
# Run the fixed-hyperparameter XGBoost 5-fold CV with its default settings.
results = train_xgboost_5fold_cv_fixed(dataX, dataY)
# Note: the returned dict also carries the pooled prediction table under "AllResults".
print(results)

In [None]:
### Fixed Hyperparameter + 5-fold XGBoost + Calibration (Training Set Fit + Validation Set Calibration)
import numpy as np  
import pandas as pd 
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
import os

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times. Resamples whose
    labels contain fewer than two distinct values are skipped, because
    ranking metrics such as AUROC are undefined for a single class.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of scores, row-aligned with ``y_true``.
        metric_fn: Callable ``metric_fn(y_true, y_pred) -> float``.
        n_bootstrap: Number of resamples to draw.
        seed: Seed for the ``np.random.RandomState`` used for resampling.

    Returns:
        Tuple ``(mean, lower, upper)``: the mean of the bootstrap scores and
        their 2.5th / 97.5th percentiles.

    Raises:
        ValueError: If the inputs are empty, differ in length, or no resample
            contained at least two distinct labels.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if n_samples != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_samples} and {len(y_pred)}"
        )
    if n_samples == 0:
        raise ValueError("bootstrap_metric_ci requires at least one sample")

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        # Skip degenerate resamples that contain a single class only.
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    # Without this guard an all-skipped run ends in NaN or an opaque numpy error.
    if not scores:
        raise ValueError(
            "no bootstrap resample contained at least two distinct labels; "
            "the metric cannot be estimated"
        )
    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper

def train_xgboost_5fold_cv_calibrated(
    dataX,
    dataY,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery_calibration/xgboost/',
    seed=42,
    early_stopping_rounds=30,
    params={'learning_rate':0.05, 'max_depth':5, 'n_estimators':200}
):
    """5-fold CV of a fixed-hyperparameter XGBClassifier with sigmoid calibration.

    Each shuffled KFold iteration holds out one fold as the test set and
    splits the remaining rows into train (7/8) and validation (1/8,
    stratified). The classifier is fit on the train split; a prefit
    ``CalibratedClassifierCV`` (sigmoid) is then fit on the validation split
    and used to score the test fold. Per-fold and pooled predictions are
    written as CSV files under ``save_path/<param string>/``.

    Args:
        dataX: Feature DataFrame, row-aligned with ``dataY``.
        dataY: Labels; positives are counted as ``== 1``, negatives as ``== 0``.
        save_path: Root output directory (created if missing).
        seed: Seed for KFold, the train/validation split and XGBoost.
        early_stopping_rounds: Early-stopping patience monitored on the
            validation AUC. Pass ``None`` (or 0) to train all
            ``n_estimators`` rounds. Passed via the estimator constructor,
            which needs xgboost >= 1.6.
        params: Must contain ``learning_rate``, ``max_depth`` and
            ``n_estimators``.

    Returns:
        dict with ``params``, ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``,
        ``AUPRC_CI`` (bootstrap estimates over the pooled test predictions)
        and ``AllResults`` (pooled per-sample predictions).
    """
    # Private copy: the default dict is shared across calls and is handed
    # back to the caller in the result, so it must not be the same object.
    params = dict(params)

    os.makedirs(save_path, exist_ok=True)
    X = dataX.copy()
    y = np.array(dataY)

    param_name = "_".join([f"{k}_{v}" for k, v in params.items()])
    param_path = os.path.join(save_path, param_name)
    os.makedirs(param_path, exist_ok=True)
    print(f"\n===== fixed params: {params} =====")

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        # ---------------------- train_val / test ----------------------
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        # train / val
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        print(f"Fold={fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")
        print(f"  Train - Pos: {np.sum(y_train==1)}, Neg: {np.sum(y_train==0)}")
        print(f"  Val   - Pos: {np.sum(y_val==1)},   Neg: {np.sum(y_val==0)}")
        print(f"  Test  - Pos: {np.sum(y_test==1)},  Neg: {np.sum(y_test==0)}")

        # Re-weight positives by the negative/positive ratio of the training split.
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos,1)

        model_kwargs = dict(
            objective='binary:logistic',
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth'],
            n_estimators=params['n_estimators'],
            scale_pos_weight=scale_pos_weight,
            random_state=seed,
            verbosity=0,
            use_label_encoder=False
        )
        # Honour the early_stopping_rounds argument, which was previously
        # accepted but never used. Recent xgboost releases take it (and the
        # monitored metric) on the estimator rather than in fit().
        if early_stopping_rounds:
            model_kwargs.update(
                early_stopping_rounds=early_stopping_rounds,
                eval_metric='auc'
            )
        base_model = XGBClassifier(**model_kwargs)

        base_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # cv='prefit': the booster is not refit; only the sigmoid mapping is
        # learned from the validation split.
        calib_model = CalibratedClassifierCV(
            estimator=base_model,
            method='sigmoid',
            cv='prefit' 
        )
        calib_model.fit(X_val, y_val)
        y_pred_prob = calib_model.predict_proba(X_test)[:, 1]

        fold_df = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_df.to_csv(os.path.join(param_path, f"fold_{fold_idx}_results.csv"), index=False)
        all_results.append(fold_df)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(param_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    # Metrics are bootstrapped over the pooled out-of-fold predictions.
    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    return {
        "params": params,
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df
    }


In [None]:
# Assemble the model inputs: all categorical + continuous feature columns,
# with the 'Outcome' column as the label vector.
dataX = df_mapped_wide[features_categorical + features_continuous]
dataY = df_mapped_wide['Outcome'] 
# Run the calibrated XGBoost 5-fold CV with its default settings.
results = train_xgboost_5fold_cv_calibrated(dataX, dataY)
# Note: the returned dict also carries the pooled prediction table under "AllResults".
print(results)

## RF

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
import os

def train_randomforest_5fold_cv_fixed(
    dataX,
    dataY,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery/randomforest_fixed_valid_test_5cv/',
    seed=42
):
    """Evaluate a fixed-hyperparameter random forest with 5-fold cross-validation.

    The forest uses n_estimators=200 and max_depth=5 (see ``fixed_params``).
    Each shuffled KFold iteration holds out one fold as the test set and
    splits the remaining rows into train (7/8) and validation (1/8,
    stratified). Only the train split is used for fitting; the validation
    split is set aside and not used here. Per-fold and pooled predictions
    are written as CSV files under ``save_path``.

    Args:
        dataX: Feature DataFrame, row-aligned with ``dataY``.
        dataY: Labels; positives are counted as ``== 1``, negatives as ``== 0``.
        save_path: Output directory (created if missing).
        seed: Seed for KFold, the train/validation split and the forest.

    Returns:
        Tuple ``(results_summary, all_results_df)``. ``results_summary`` holds
        the mean per-fold AUROC/AUPRC with a 95% Student-t interval across
        folds; ``all_results_df`` holds the pooled per-sample predictions.
    """
    os.makedirs(save_path, exist_ok=True)
    X = dataX.copy()
    y = np.array(dataY)

    fixed_params = {
        "n_estimators": 200,
        "max_depth": 5,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "max_features": "sqrt"
    }

    print(f"\n===== fixed params: {fixed_params} =====")

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    aurocs, auprcs = [], []
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\n===== Fold {fold_idx} =====")
        # Step1: 80% train_val, 20% test
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Step2: train_val / val (the validation part is held out but unused)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        # class weight: up-weight positives by the negative/positive ratio
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos, 1)
        class_weight = {0: 1.0, 1: scale_pos_weight}

        model = RandomForestClassifier(
            random_state=seed,
            n_jobs=-1,
            class_weight=class_weight,
            **fixed_params
        )
        model.fit(X_train, y_train)

        y_pred_prob = model.predict_proba(X_test)[:, 1]
        auroc = metrics.roc_auc_score(y_test, y_pred_prob)
        auprc = metrics.average_precision_score(y_test, y_pred_prob)
        aurocs.append(auroc)
        auprcs.append(auprc)

        print(f"[Fold {fold_idx}] AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        fold_df = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_df.to_csv(os.path.join(save_path, f"fold_{fold_idx}_results.csv"), index=False)
        all_results.append(fold_df)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(save_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    def mean_ci_interval(data):
        """Return (mean, lower, upper) of a 95% interval over the fold scores."""
        from scipy.stats import t as student_t

        n = len(data)
        mean = np.mean(data)
        std = np.std(data, ddof=1)
        # With only a handful of folds the normal quantile (1.96) gives an
        # interval that is too narrow; use the Student-t critical value.
        half_width = student_t.ppf(0.975, df=n - 1) * std / np.sqrt(n)
        return round(mean, 4), round(mean - half_width, 4), round(mean + half_width, 4)

    auroc_mean, auroc_lower, auroc_upper = mean_ci_interval(aurocs)
    auprc_mean, auprc_lower, auprc_upper = mean_ci_interval(auprcs)

    print(f"AUROC: Mean={auroc_mean:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={auprc_mean:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    results_summary = {
        "AUROC_mean": auroc_mean,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": auprc_mean,
        "AUPRC_CI": (auprc_lower, auprc_upper)
    }

    return results_summary, all_results_df


In [None]:
# Assemble the model inputs: all categorical + continuous feature columns,
# with the 'Outcome' column as the label vector.
dataX = df_mapped_wide[features_categorical + features_continuous]
dataY = df_mapped_wide['Outcome'] 
# Note: this function returns a (summary dict, pooled predictions DataFrame) tuple,
# so `results` is a tuple here, unlike the dict returned by the XGBoost runs.
results = train_randomforest_5fold_cv_fixed(dataX, dataY)
print(results)

In [None]:
### probability calibration ####
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
import os

def train_randomforest_5fold_cv_calibrated(
    dataX,
    dataY,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery_calibration/randomforest/',
    seed=42
):
    """5-fold CV of a fixed-hyperparameter random forest with sigmoid calibration.

    Each shuffled KFold iteration holds out one fold as the test set and
    splits the remaining rows into train (7/8) and validation (1/8,
    stratified). The forest is fit on the train split; a prefit
    ``CalibratedClassifierCV`` (sigmoid) is then fit on the validation split
    and used to score the test fold. Per-fold and pooled predictions are
    written as CSV files under ``save_path``.

    Args:
        dataX: Feature DataFrame, row-aligned with ``dataY``.
        dataY: Labels; positives are counted as ``== 1``, negatives as ``== 0``.
        save_path: Output directory (created if missing).
        seed: Seed for KFold, the train/validation split and the forest.

    Returns:
        Tuple ``(results_summary, all_results_df)``. ``results_summary`` holds
        the mean per-fold AUROC/AUPRC with a 95% Student-t interval across
        folds; ``all_results_df`` holds the pooled per-sample predictions.
    """
    os.makedirs(save_path, exist_ok=True)
    X = dataX.copy()
    y = np.array(dataY)

    fixed_params = {
        "n_estimators": 200,
        "max_depth": 5,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "max_features": "sqrt"
    }

    print(f"\n===== Use fixed parameters: {fixed_params} =====")

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    aurocs, auprcs = [], []
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\n===== Fold {fold_idx} =====")
        X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
        y_train_val, y_test = y[train_val_index], y[test_index]

        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        # Up-weight positives by the negative/positive ratio of the training split.
        num_pos = np.sum(y_train == 1)
        num_neg = np.sum(y_train == 0)
        scale_pos_weight = num_neg / max(num_pos, 1)
        class_weight = {0: 1.0, 1: scale_pos_weight}

        base_model = RandomForestClassifier(
            random_state=seed,
            n_jobs=-1,
            class_weight=class_weight,
            **fixed_params
        )
        base_model.fit(X_train, y_train)

        # cv='prefit': the forest is not refit; only the sigmoid mapping is
        # learned from the validation split.
        calib_model = CalibratedClassifierCV(
            estimator=base_model,
            method='sigmoid',
            cv='prefit' 
        )
        calib_model.fit(X_val, y_val)

        y_pred_prob = calib_model.predict_proba(X_test)[:, 1]
        auroc = metrics.roc_auc_score(y_test, y_pred_prob)
        auprc = metrics.average_precision_score(y_test, y_pred_prob)
        aurocs.append(auroc)
        auprcs.append(auprc)
        print(f"[Fold {fold_idx}] AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        fold_df = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_df.to_csv(os.path.join(save_path, f"fold_{fold_idx}_results.csv"), index=False)
        all_results.append(fold_df)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(save_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    def mean_ci_interval(data):
        """Return (mean, lower, upper) of a 95% interval over the fold scores."""
        from scipy.stats import t as student_t

        n = len(data)
        mean = np.mean(data)
        std = np.std(data, ddof=1)
        # With only a handful of folds the normal quantile (1.96) gives an
        # interval that is too narrow; use the Student-t critical value.
        half_width = student_t.ppf(0.975, df=n - 1) * std / np.sqrt(n)
        return round(mean, 4), round(mean - half_width, 4), round(mean + half_width, 4)

    auroc_mean, auroc_lower, auroc_upper = mean_ci_interval(aurocs)
    auprc_mean, auprc_lower, auprc_upper = mean_ci_interval(auprcs)
    print(f"AUROC: Mean={auroc_mean:.4f}, 95% CI=({auroc_lower:.4f},{auroc_upper:.4f})")
    print(f"AUPRC: Mean={auprc_mean:.4f}, 95% CI=({auprc_lower:.4f},{auprc_upper:.4f})")

    results_summary = {
        "AUROC_mean": auroc_mean,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": auprc_mean,
        "AUPRC_CI": (auprc_lower, auprc_upper)
    }
    return results_summary, all_results_df


In [23]:
# Label distribution check (missing values counted too); the saved output shows 731 rows with 0 and 240 with 1.
df_mapped_wide['Outcome'].value_counts(dropna=False)

Outcome
0    731
1    240
Name: count, dtype: int64

In [None]:
# Assemble the model inputs: all categorical + continuous feature columns,
# with the 'Outcome' column as the label vector.
dataX = df_mapped_wide[features_categorical + features_continuous]
dataY = df_mapped_wide['Outcome']
# Note: this function returns a (summary dict, pooled predictions DataFrame) tuple,
# so `results` is a tuple here.
results = train_randomforest_5fold_cv_calibrated(dataX, dataY)
print(results)

## LR

In [None]:
import numpy as np 
import pandas as pd 
from sklearn import metrics 
from sklearn.model_selection import KFold, train_test_split 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LogisticRegression 
import os 
from sklearn.preprocessing import StandardScaler
 
def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times. Resamples whose
    labels contain fewer than two distinct values are skipped, because
    ranking metrics such as AUROC are undefined for a single class.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of scores, row-aligned with ``y_true``.
        metric_fn: Callable ``metric_fn(y_true, y_pred) -> float``.
        n_bootstrap: Number of resamples to draw.
        seed: Seed for the ``np.random.RandomState`` used for resampling.

    Returns:
        Tuple ``(mean, lower, upper)``: the mean of the bootstrap scores and
        their 2.5th / 97.5th percentiles.

    Raises:
        ValueError: If the inputs are empty, differ in length, or no resample
            contained at least two distinct labels.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if n_samples != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_samples} and {len(y_pred)}"
        )
    if n_samples == 0:
        raise ValueError("bootstrap_metric_ci requires at least one sample")

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        # Skip degenerate resamples that contain a single class only.
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    # Without this guard an all-skipped run ends in NaN or an opaque numpy error.
    if not scores:
        raise ValueError(
            "no bootstrap resample contained at least two distinct labels; "
            "the metric cannot be estimated"
        )
    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper
 
def train_lr_5fold_cv(
    dataX,
    dataY,
    num_features,
    cat_features,
    save_path='./Poisoning_Prediction/ML/predict_non-recovery/lr_valid_test_5cv/',
    seed=42
):
    """Evaluate a class-weighted logistic regression with 5-fold cross-validation.

    Each shuffled KFold iteration holds out one fold as the test set and
    splits the remaining rows into train (7/8) and validation (1/8,
    stratified). Median imputation, standardisation and one-hot encoding are
    fit on the train split of each fold only and then applied to the test
    fold, so no test-fold statistics leak into preprocessing. The validation
    split is set aside and not used here. Per-fold and pooled predictions
    are written as CSV files under ``save_path``.

    Args:
        dataX: Feature DataFrame containing ``num_features`` and ``cat_features``.
        dataY: Labels; positives are counted as ``== 1``, negatives as ``== 0``.
        num_features: Names of the numeric columns (imputed + scaled).
        cat_features: Names of the categorical columns (one-hot encoded).
        save_path: Output directory (created if missing).
        seed: Seed for KFold and the train/validation split.

    Returns:
        dict with ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``, ``AUPRC_CI``
        (bootstrap estimates over the pooled test predictions) and
        ``AllResults`` (pooled per-sample predictions).
    """
    os.makedirs(save_path, exist_ok=True)
    X = dataX.copy()
    y = np.array(dataY)

    # ===================== feature preprocessing =====================
    X_num_all = X[num_features]
    # Fill missing categories BEFORE casting to str: casting first turns NaN
    # into the literal string 'nan', which makes a later fillna a no-op.
    X_cat_all = X[cat_features].astype(object)
    X_cat_all = X_cat_all.where(X_cat_all.notna(), 'missing').astype(str)

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Split row positions rather than frames so the raw columns can be
        # sliced per split; the resulting partition is the same for a given
        # seed and stratification vector.
        train_pos, val_pos, y_train, y_val = train_test_split(
            train_val_index, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        # Fit every preprocessing step on the training rows of this fold only.
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

        X_train = np.hstack([
            scaler.fit_transform(imputer.fit_transform(X_num_all.iloc[train_pos])),
            encoder.fit_transform(X_cat_all.iloc[train_pos])
        ])
        X_test = np.hstack([
            scaler.transform(imputer.transform(X_num_all.iloc[test_index])),
            encoder.transform(X_cat_all.iloc[test_index])
        ])

        print(f"Fold {fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")
        print(f"final feature dimension: {X_train.shape[1]}")

        n_pos = np.sum(y_train == 1)
        n_neg = np.sum(y_train == 0)
        print(f"  Train - Pos: {n_pos}, Neg: {n_neg}")
        print(f"  Test  - Pos: {np.sum(y_test==1)}, Neg: {np.sum(y_test==0)}")

        # ===================== manual setting class_weight =====================
        w_pos = n_neg / max(n_pos, 1)
        class_weight = {0: 1.0, 1: w_pos}
        print(f"  use class_weight = {class_weight}")

        model = LogisticRegression(
            max_iter=1000,
            solver='lbfgs',
            class_weight=class_weight
        )
        model.fit(X_train, y_train)

        y_pred_prob = model.predict_proba(X_test)[:, 1]

        fold_df = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_df.to_csv(os.path.join(save_path, f"fold_{fold_idx}_results.csv"), index=False)
        all_results.append(fold_df)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(save_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    # Metrics are bootstrapped over the pooled out-of-fold predictions.
    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"\n===== Logistic Regression 结果 =====")
    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f}, {auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f}, {auprc_upper:.4f})")

    return {
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df
    }


In [None]:
# Assemble the model inputs: all categorical + continuous feature columns,
# with the 'Outcome' column as the label vector.
dataX = df_mapped_wide[features_categorical + features_continuous]
dataY = df_mapped_wide['Outcome']
# The LR routine needs to know which columns to scale (numeric) and which to one-hot encode (categorical).
results = train_lr_5fold_cv(dataX, dataY,num_features=features_continuous, cat_features=features_categorical)
# Note: the returned dict also carries the pooled prediction table under "AllResults".
print(results)

In [None]:
### Logistic regression with probability calibration (fit on the train split, sigmoid calibration on the validation split)

import numpy as np 
import pandas as pd 
from sklearn import metrics 
from sklearn.model_selection import KFold, train_test_split 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LogisticRegression 
import os 
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV

def train_lr_5fold_cv_calibrated(
    dataX, 
    dataY, 
    num_features, 
    cat_features, 
    save_path='./Poisoning_Prediction/ML/predict_non-recovery_calibration/lr/',
    seed=42
):
    """5-fold CV of a class-weighted logistic regression with sigmoid calibration.

    Each shuffled KFold iteration holds out one fold as the test set and
    splits the remaining rows into train (7/8) and validation (1/8,
    stratified). Median imputation, standardisation and one-hot encoding are
    fit on the train split of each fold only, then applied to the validation
    and test rows, so no held-out statistics leak into preprocessing. The
    model is fit on the train split; a prefit ``CalibratedClassifierCV``
    (sigmoid) is fit on the validation split and used to score the test fold.
    Per-fold and pooled predictions are written as CSV files under
    ``save_path``.

    Args:
        dataX: Feature DataFrame containing ``num_features`` and ``cat_features``.
        dataY: Labels; positives are counted as ``== 1``, negatives as ``== 0``.
        num_features: Names of the numeric columns (imputed + scaled).
        cat_features: Names of the categorical columns (one-hot encoded).
        save_path: Output directory (created if missing).
        seed: Seed for KFold and the train/validation split.

    Returns:
        dict with ``AUROC_mean``, ``AUROC_CI``, ``AUPRC_mean``, ``AUPRC_CI``
        (bootstrap estimates over the pooled test predictions) and
        ``AllResults`` (pooled per-sample predictions).
    """
    os.makedirs(save_path, exist_ok=True)
    X = dataX.copy()
    y = np.array(dataY)

    X_num_all = X[num_features]
    # Fill missing categories BEFORE casting to str: casting first turns NaN
    # into the literal string 'nan', which makes a later fillna a no-op.
    X_cat_all = X[cat_features].astype(object)
    X_cat_all = X_cat_all.where(X_cat_all.notna(), 'missing').astype(str)

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    all_results = []

    for fold_idx, (train_val_index, test_index) in enumerate(kf.split(X), start=1):
        y_train_val, y_test = y[train_val_index], y[test_index]

        # Split row positions rather than frames so the raw columns can be
        # sliced per split; the resulting partition is the same for a given
        # seed and stratification vector.
        train_pos, val_pos, y_train, y_val = train_test_split(
            train_val_index, y_train_val,
            test_size=1/8, random_state=seed, stratify=y_train_val
        )

        # Fit every preprocessing step on the training rows of this fold only.
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

        X_train = np.hstack([
            scaler.fit_transform(imputer.fit_transform(X_num_all.iloc[train_pos])),
            encoder.fit_transform(X_cat_all.iloc[train_pos])
        ])

        def apply_preprocessing(positions):
            """Transform the given rows with the preprocessing fitted on this fold's train split."""
            return np.hstack([
                scaler.transform(imputer.transform(X_num_all.iloc[positions])),
                encoder.transform(X_cat_all.iloc[positions])
            ])

        X_val = apply_preprocessing(val_pos)
        X_test = apply_preprocessing(test_index)

        print(f"Fold {fold_idx}: Train={len(y_train)}, Val={len(y_val)}, Test={len(y_test)}")
        print(f"final feature dimension: {X_train.shape[1]}")

        # Up-weight positives by the negative/positive ratio of the training split.
        n_pos = np.sum(y_train == 1)
        n_neg = np.sum(y_train == 0)
        class_weight = {0: 1.0, 1: n_neg / max(n_pos, 1)}
        print(f"  class_weight = {class_weight}")

        base_model = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight=class_weight)
        base_model.fit(X_train, y_train)

        # cv='prefit': the regression is not refit; only the sigmoid mapping
        # is learned from the validation split.
        calib_model = CalibratedClassifierCV(base_model, method='sigmoid', cv='prefit')
        calib_model.fit(X_val, y_val)

        y_pred_prob = calib_model.predict_proba(X_test)[:, 1]

        fold_df = pd.DataFrame({
            "fold": fold_idx,
            "y_test": y_test,
            "y_pred": y_pred_prob
        })
        fold_df.to_csv(os.path.join(save_path, f"fold_{fold_idx}_results.csv"), index=False)
        all_results.append(fold_df)

    all_results_df = pd.concat(all_results, ignore_index=True)
    all_csv = os.path.join(save_path, "all_folds_results.csv")
    all_results_df.to_csv(all_csv, index=False)

    # Metrics are bootstrapped over the pooled out-of-fold predictions.
    mean_auroc, auroc_lower, auroc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.roc_auc_score
    )
    mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
        all_results_df["y_test"], all_results_df["y_pred"], metrics.average_precision_score
    )

    print(f"\n===== Logistic Regression =====")
    print(f"AUROC: Mean={mean_auroc:.4f}, 95% CI=({auroc_lower:.4f}, {auroc_upper:.4f})")
    print(f"AUPRC: Mean={mean_auprc:.4f}, 95% CI=({auprc_lower:.4f}, {auprc_upper:.4f})")

    return {
        "AUROC_mean": mean_auroc,
        "AUROC_CI": (auroc_lower, auroc_upper),
        "AUPRC_mean": mean_auprc,
        "AUPRC_CI": (auprc_lower, auprc_upper),
        "AllResults": all_results_df
    }




In [32]:
# Label distribution check (missing values counted too); the saved output shows 731 rows with 0 and 240 with 1.
df_mapped_wide['Outcome'].value_counts(dropna=False)

Outcome
0    731
1    240
Name: count, dtype: int64

In [None]:
# Assemble the model inputs: all categorical + continuous feature columns,
# with the 'Outcome' column as the label vector.
dataX = df_mapped_wide[features_categorical + features_continuous]
dataY = df_mapped_wide['Outcome']
# The calibrated LR routine needs to know which columns to scale (numeric) and which to one-hot encode (categorical).
results = train_lr_5fold_cv_calibrated(dataX, dataY,num_features=features_continuous, cat_features=features_categorical)
# Note: the returned dict also carries the pooled prediction table under "AllResults".
print(results)