<a href="https://colab.research.google.com/github/AMA-anam/predictive-models--MDR-YEM/blob/main/MDR_YEM_AMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Import Libraries**

Install and import all necessary Python libraries for data analysis, modeling, and interpretation.

In [1]:
!pip install optuna \
             numpy \
             scikit-learn==1.5 \
             umap-learn==0.5.7 \
             seaborn --upgrade \
             shap \
             lime \
             xgboost==1.6.1 \
             catboost \
             statsmodels \
             mlxtend \
             joblib

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting numpy
  Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn==1.5
  Downloading scikit_learn-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting umap-learn==0.5.7
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xgboost==1.6.1
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.

**2. Data Splitting (Train / Validation / Test Sets)**  
Split the dataset into training, validation, and test subsets to ensure unbiased evaluation

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def fully_decoupled_split(data, target_column, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):

    if target_column not in data.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame columns: {data.columns.tolist()}")


    y = data[target_column].values
    X = data.drop(columns=[target_column]).values
    feature_names = data.drop(columns=[target_column]).columns.tolist()


    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        train_size=train_size,
        stratify=y,
        random_state=random_state
    )

    remaining_ratio = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        train_size=remaining_ratio,
        stratify=y_temp,
        random_state=random_state
    )


    train_df = pd.DataFrame(X_train, columns=feature_names)
    train_df[target_column] = y_train

    val_df = pd.DataFrame(X_val, columns=feature_names)
    val_df[target_column] = y_val

    test_df = pd.DataFrame(X_test, columns=feature_names)
    test_df[target_column] = y_test


    assert train_df.index.tolist() == list(range(len(train_df)))
    assert val_df.index.tolist() == list(range(len(val_df)))
    assert test_df.index.tolist() == list(range(len(test_df)))

    return train_df, val_df, test_df


original_data = pd.read_csv("Original data.csv")

for col in ['Gender', 'Healthcare Sector', 'Institution Type', 'Bacteria type']:
    print(f"{col}:")
    if col in original_data.columns:
        orig_dist = original_data[col].value_counts(normalize=True) * 100
        for category, pct in orig_dist.items():
            print(f"  {category}: {pct:.2f}%")
    else:
        print("  Column not found in data.")


train_df, val_df, test_df = fully_decoupled_split(
    data=original_data,
    target_column="MDR status",
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=42
)


train_df.to_csv("train_set.csv", index=False)
val_df.to_csv("validation_set.csv", index=False)
test_df.to_csv("test_set.csv", index=False)

print("Split completed with complete index decoupling:")
print(f"Train: {len(train_df)} rows, index: {train_df.index.tolist()[:5]}...")
print(f"Val: {len(val_df)} rows, index: {val_df.index.tolist()[:5]}...")
print(f"Test: {len(test_df)} rows, index: {test_df.index.tolist()[:5]}...")

Gender:
  F: 60.86%
  M: 39.14%
Healthcare Sector:
  Private: 67.45%
  Governmental: 32.55%
Institution Type:
  Lab: 71.57%
  Hospital: 28.43%
Bacteria type:
  E. coli: 69.41%
  Klebsiella Spp: 16.20%
  Pseudomonas Spp: 14.39%
Split completed with complete index decoupling:
Train: 1785 rows, index: [0, 1, 2, 3, 4]...
Val: 382 rows, index: [0, 1, 2, 3, 4]...
Test: 383 rows, index: [0, 1, 2, 3, 4]...


**3. Data Preprocessing & Model Performance (Training Set)**  
Clean, encode, scale, and preprocess features, then train candidate models on the training set

In [5]:

from __future__ import annotations

import os
import warnings
import logging
import traceback
import joblib
import json
import gc
import sys
from pathlib import Path
from itertools import combinations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Sequence, Tuple, Callable
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, chi2, bootstrap, t
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests
import shap
import optuna
from optuna.samplers import TPESampler
from optuna.exceptions import TrialPruned
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import (
    roc_auc_score, average_precision_score, recall_score, f1_score,
    accuracy_score, cohen_kappa_score, matthews_corrcoef, brier_score_loss,
    log_loss, roc_curve, precision_recall_curve, confusion_matrix,
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.special import expit as sigmoid_func
from scipy.special import expit
from statsmodels.stats.proportion import proportion_confint
from scipy.special import expit, logit


try:
    import statsmodels.api as sm
    STATSMODELS_AVAILABLE = True
except ImportError:
    STATSMODELS_AVAILABLE = False
    warnings.warn("statsmodels not available - some statistical functions will be limited")

try:
    from statsmodels.stats.multitest import multipletests
    MULTITEST_AVAILABLE = True
except ImportError:
    MULTITEST_AVAILABLE = False
    warnings.warn("statsmodels.stats.multitest not available - multiple comparison corrections will be limited")

try:
    from statsmodels.stats.contingency_tables import mcnemar
    MCNEMAR_AVAILABLE = True
except ImportError:
    MCNEMAR_AVAILABLE = False
    warnings.warn("statsmodels.stats.contingency_tables not available - McNemar test will be limited")

try:
    from mlxtend.evaluate import mcnemar_table
    MLXTEND_AVAILABLE = True
except ImportError:
    MLXTEND_AVAILABLE = False
    warnings.warn("mlxtend not available - McNemar table functionality will be limited")

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, brier_score_loss,
    matthews_corrcoef, cohen_kappa_score, roc_curve,
    precision_recall_curve, balanced_accuracy_score, log_loss, confusion_matrix
)




os.environ["OMP_NUM_THREADS"] = "1"
os.environ["KMP_WARNINGS"] = "0"
warnings.filterwarnings("ignore")



@dataclass(frozen=True)
class Config:


    random_state: int = 42
    n_jobs_parallel: int = field(default_factory=lambda: int(os.cpu_count() or 4))


    n_outer_folds: int = 5
    n_inner_folds: int = 5

    n_trials_optuna: int = 10
    early_stopping_rounds: int = 30
    eval_metric_objective: str = "logloss"


    calibration_cv_folds: int = 5
    calibration_method: str = 'sigmoid'
    n_bins_calibration: int = 10

    n_features_select: int = 10
    shap_weight: float = 0.5
    tree_weight: float = 0.5
    max_samples_shap: int = 100


    n_bootstrap: int = 1000
    confidence_level: float = 0.95


    train_csv_path: str = "train_set.csv"
    target_column: str = "MDR status"
    output_dir: Path = field(default_factory=lambda: Path("nested_cv_output"))


    plotting_settings: Dict[str, Any] = field(default_factory=lambda: {
        'figsize': (3.5, 3.0),
        'dpi': 450,
        'style': 'seaborn-v0_8',
        'colors': sns.color_palette("colorblind").as_hex()
    })

    def __post_init__(self) -> None:

        output_base = self.output_dir / self.eval_metric_objective
        object.__setattr__(self, 'output_dir', output_base)
        output_base.mkdir(exist_ok=True, parents=True)
        for sub in ["models", "plots", "metrics", "logs", "best_params"]:
            (output_base / sub).mkdir(exist_ok=True, parents=True)

    @classmethod
    def from_env(cls) -> "Config":

        cfg_dict = {}
        for key, default_value in cls.__annotations__.items():
            env_var = os.getenv(key.upper())
            if env_var is not None:

                if isinstance(default_value, type):
                     if default_value is Path:
                         cfg_dict[key] = Path(env_var)

                elif isinstance(default_value, bool):
                    cfg_dict[key] = env_var.lower() in {"1", "true", "yes"}
                elif isinstance(default_value, int):
                    cfg_dict[key] = int(env_var)
                elif isinstance(default_value, float):
                    cfg_dict[key] = float(env_var)
                elif isinstance(default_value, str):
                    cfg_dict[key] = env_var
        return cls(**cfg_dict)

    def to_dict(self) -> Dict[str, Any]:

        return {
            'random_state': self.random_state,
            'optimization': {'inner_cv_folds': self.n_inner_folds, 'n_trials': self.n_trials_optuna, 'optuna_n_jobs': 1},
            'feature_selection': {'n_features': self.n_features_select, 'shap_weight': self.shap_weight, 'tree_weight': self.tree_weight, 'max_samples_shap': self.max_samples_shap},
            'evaluation': {'outer_cv_folds': self.n_outer_folds, 'calibration_cv_folds': self.calibration_cv_folds, 'calibration_method': self.calibration_method, 'n_bins': self.n_bins_calibration},
            'early_stopping': {'rounds': self.early_stopping_rounds, 'metric': self.eval_metric_objective},
            'output_dirs': {'models': self.output_dir / "models", 'plots': self.output_dir / "plots", 'metrics': self.output_dir / "metrics"},
            'plotting': self.plotting_settings,
            'data': {'train_csv': self.train_csv_path, 'target': self.target_column}
        }



def setup_logger(cfg: Config) -> logging.Logger:

    log = logging.getLogger("nested_cv")
    if log.hasHandlers():
        return log
    log.setLevel(logging.INFO)


    log_path = cfg.output_dir / "logs" / "pipeline.log"
    fh = logging.FileHandler(log_path)
    fh.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
    log.addHandler(fh)


    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    log.addHandler(ch)

    return log



def get_feature_names(pre: ColumnTransformer, X: pd.DataFrame) -> List[str]:

    names = []
    for nm, pipe, cols in pre.transformers_:
        if nm == 'cat':
            try:

                cats = pipe.named_steps['onehot'].categories_
                for c, cl in zip(cols, cats):
                    names += [f"{c}_{v}" for v in cl]
            except Exception:
                names += [f"cat_{c}" for c in cols]
        elif nm == 'num':
            names += list(cols)
    return names

def create_preprocessor(X: pd.DataFrame) -> ColumnTransformer:

    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    num_cols = X.select_dtypes(include=np.number).columns.tolist()

    cat_pipe = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
    num_pipe = Pipeline([('scale', StandardScaler())])

    return ColumnTransformer([
        ('cat', cat_pipe, cat_cols),
        ('num', num_pipe, num_cols)
    ], remainder='drop')

def _decision_to_proba(model: Any, X: pd.DataFrame) -> np.ndarray:

    try:
        scores = model.decision_function(X)
        scores = np.asarray(scores).ravel()
        return sigmoid_func(scores)
    except Exception as e:
        warnings.warn(f"Could not get probabilities from decision_function: {e}")
        return model.predict(X).astype(float)

def z_test_standard(y_true, y_prob):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10:
        return np.nan, np.nan


    numerator = np.sum(y_true - y_prob)


    denominator = np.sqrt(np.sum(y_prob * (1 - y_prob)))

    if denominator < 1e-8:
        return np.nan, np.nan


    z_stat = numerator / denominator
    p_value = 2 * (1 - norm.cdf(abs(z_stat)))

    return z_stat, p_value




class HybridFeatureSelector:

    def __init__(self, cfg: Dict[str, Any]):
        self.n_features = cfg['feature_selection']['n_features']
        self.shap_weight = cfg['feature_selection']['shap_weight']
        self.tree_weight = cfg['feature_selection']['tree_weight']
        self.max_samples_shap = cfg['feature_selection']['max_samples_shap']
        self.random_state = cfg['random_state']
        self.selected_features_ = None

    def _calculate_shap_importances(self, X: pd.DataFrame, model: Any) -> pd.Series:

        try:
            explainer = shap.TreeExplainer(model)
            X_samp = X.sample(min(self.max_samples_shap, len(X)), random_state=self.random_state)
            X_samp = X_samp.apply(pd.to_numeric, errors='coerce').fillna(0)
            shap_vals = explainer.shap_values(X_samp)

            if isinstance(shap_vals, list):
                shap_arr = np.abs(shap_vals[1]) if len(shap_vals) > 1 else np.abs(shap_vals[0])
            else:
                shap_arr = np.abs(shap_vals)

            return pd.Series(shap_arr.mean(axis=0), index=X_samp.columns)
        except Exception:
            logging.warning("SHAP calculation failed. Using uniform importances.")
            return pd.Series(1.0, index=X.columns)

    def fit(self, X: pd.DataFrame, y: pd.Series, model: Any) -> 'HybridFeatureSelector':

        if not hasattr(model, 'fit'):
            raise ValueError("Model must be fitted before feature selection")


        tree_models_supporting_shap = (XGBClassifier, lgb.LGBMClassifier, CatBoostClassifier)
        if isinstance(model, tree_models_supporting_shap) and hasattr(model, 'feature_importances_'):
            shap_imp = self._calculate_shap_importances(X, model)
        else:
            shap_imp = pd.Series(1.0, index=X.columns)


        if hasattr(model, 'feature_importances_'):
            tree_imp = pd.Series(model.feature_importances_, index=X.columns)
        elif hasattr(model, 'coef_'):
            coef = model.coef_
            if coef.ndim > 1: coef = coef[0]
            tree_imp = pd.Series(np.abs(coef), index=X.columns)
        else:
            tree_imp = pd.Series(1.0, index=X.columns)


        shap_imp, tree_imp = shap_imp.align(tree_imp, fill_value=0)

        if shap_imp.sum() == 0 and tree_imp.sum() == 0:
            combined = pd.Series(np.ones(len(X.columns)), index=X.columns)
        else:
            shap_rank = shap_imp.rank(ascending=False, method='min')
            tree_rank = tree_imp.rank(ascending=False, method='min')
            combined = self.shap_weight * shap_rank + self.tree_weight * tree_rank


        self.selected_features_ = combined.nsmallest(self.n_features).index.tolist()
        return self


    def transform(self, X: pd.DataFrame) -> pd.DataFrame:

        if self.selected_features_ is None:
            raise ValueError("Selector not fitted yet. Call fit() first.")


        sorted_features = sorted(self.selected_features_)


        return X[sorted_features]

    def fit_transform(self, X: pd.DataFrame, y: pd.Series, model: Any) -> pd.DataFrame:

        return self.fit(X, y, model).transform(X)




def calculate_ece(y_true, y_prob, n_bins=10, method='uniform'):
    """
    Robust Expected Calibration Error (ECE) calculation.
    Handles duplicate bin edges and edge-case probabilities correctly.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    # Remove invalid probabilities (NaNs, Infs)
    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]

    if len(y_true) == 0:
        return np.nan

    # Ensure probabilities are clipped to [0, 1]
    y_prob = np.clip(y_prob, 0.0, 1.0)

    if method == 'quantile':
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.percentile(y_prob, quantiles * 100)
        # FIX 1: Remove duplicate edges to prevent zero-width bins
        bin_edges = np.unique(bin_edges)
    else:
        # Uniform bins
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)

    # Determine actual number of bins (may be less than n_bins if duplicates existed)
    actual_n_bins = len(bin_edges) - 1
    if actual_n_bins == 0:
        return 0.0

    ece = 0.0
    total_samples = len(y_true)

    for i in range(actual_n_bins):
        # FIX 2: Correct semi-open intervals [a, b)
        if i == actual_n_bins - 1:
            # Last bin includes the upper edge (inclusive)
            mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:
            # Other bins are exclusive on the right
            mask = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask])
            avg_true = np.mean(y_true[mask])
            ece += (n_in_bin / total_samples) * np.abs(avg_pred - avg_true)

    return float(ece)


def calculate_mce(y_true, y_prob, n_bins=10, method='uniform'):
    """
    Robust Maximum Calibration Error (MCE) calculation.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]

    if len(y_true) == 0:
        return np.nan

    y_prob = np.clip(y_prob, 0.0, 1.0)

    if method == 'quantile':
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.percentile(y_prob, quantiles * 100)
        # FIX 1: Remove duplicate edges
        bin_edges = np.unique(bin_edges)
    else:
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)

    actual_n_bins = len(bin_edges) - 1
    if actual_n_bins == 0:
        return 0.0

    max_error = 0.0

    for i in range(actual_n_bins):
        # FIX 2: Correct semi-open intervals
        if i == actual_n_bins - 1:
            mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:
            mask = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask])
            avg_true = np.mean(y_true[mask])
            error = abs(avg_pred - avg_true)
            if error > max_error:
                max_error = error

    return float(max_error)

def classification_extended_metrics(y_true, y_pred):

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    return {
        'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp,
        'NPV': npv, 'PPV': ppv, 'FPR': fpr, 'FNR': fnr
    }



def calibration_slope_intercept(y_true, y_prob, method='logistic'):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob) | np.isnan(y_true))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan

    y_prob = np.clip(y_prob, 1e-6, 1 - 1e-6)
    X_base = logit(y_prob) if method == 'logistic' else y_prob
    X = sm.add_constant(X_base.reshape(-1, 1))

    try:
        mod = sm.GLM(y_true, X, family=sm.families.Binomial())
        res = mod.fit(disp=0)
        intercept, slope = float(res.params[0]), float(res.params[1])
        return intercept, slope
    except Exception:
        return np.nan, np.nan


def calibration_slope_intercept_ci(y_true, y_prob, n_bootstrap=100, alpha=0.05, seed=42):


    intercept_point, slope_point = calibration_slope_intercept(y_true, y_prob)

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    rng = np.random.default_rng(seed)
    slopes, intercepts = [], []

    for _ in range(n_bootstrap):
        try:
            indices = rng.choice(len(y_true), len(y_true), replace=True)
            if len(np.unique(y_true[indices])) < 2:
                continue

            int_b, slp_b = calibration_slope_intercept(y_true[indices], y_prob[indices])
            if not (np.isnan(slp_b) or np.isnan(int_b)):
                slopes.append(slp_b)
                intercepts.append(int_b)
        except Exception:
            continue

    if slopes and intercepts:
        slope_ci = (np.percentile(slopes, 100 * alpha/2), np.percentile(slopes, 100 * (1 - alpha/2)))
        intercept_ci = (np.percentile(intercepts, 100 * alpha/2), np.percentile(intercepts, 100 * (1 - alpha/2)))
    else:

        slope_ci = (np.nan, np.nan)
        intercept_ci = (np.nan, np.nan)

    return intercept_point, slope_point, intercept_ci, slope_ci



def ci95(mean, std, n):
    if n < 2:
        return np.nan, np.nan
    tval = t.ppf(0.975, df=n-1)
    err = tval * std / np.sqrt(n)
    return mean - err, mean + err

def cohens_d_from_summary(mean1, std1, mean2, std2):
    pooled_std = np.sqrt((std1**2 + std2**2) / 2)
    if pooled_std == 0:
        return np.nan
    return (mean1 - mean2) / pooled_std

def interpret_effect_size(d):
    abs_d = abs(d)
    if np.isnan(abs_d):
        return "Unknown"
    elif abs_d < 0.2:
        return "Negligible"
    elif abs_d < 0.5:
        return "Small"
    elif abs_d < 0.8:
        return "Medium"
    else:
        return "Large"

def pairwise_pvalue(mean1, std1, n1, mean2, std2, n2):
    se = np.sqrt((std1**2 / n1) + (std2**2 / n2))
    if se == 0:
        return np.nan
    t_stat = (mean1 - mean2) / se
    df_num = (std1**2 / n1 + std2**2 / n2) ** 2
    df_den = ((std1**2 / n1) ** 2) / (n1 - 1) + ((std2**2 / n2) ** 2) / (n2 - 1)
    if df_den == 0:
        return np.nan
    df = df_num / df_den
    p = 2 * t.sf(abs(t_stat), df)
    return p



def hosmer_lemeshow_test_advanced(y_true, y_prob, n_bins=10, min_expected_freq=5):


    if not STATSMODELS_AVAILABLE:
        warnings.warn("statsmodels not available - returning NaN for Hosmer-Lemeshow test")
        return np.nan, np.nan

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 20 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan


    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
    try:
        df['bin'] = pd.qcut(df['y_prob'], n_bins, labels=False, duplicates='drop')
    except ValueError:
        df['bin'] = np.floor(df['y_prob'] * n_bins).astype(int)
        df.loc[df['bin'] == n_bins, 'bin'] = n_bins - 1


    summary = df.groupby('bin').agg(
        observed=('y_true', 'sum'),
        expected=('y_prob', 'sum'),
        n_total=('y_true', 'size')
    ).reset_index()


    while True:
        sparse_bins = summary[summary['expected'] < min_expected_freq]
        if sparse_bins.empty or len(summary) <= 2:
            break


        merge_idx = sparse_bins.index[0]
        if merge_idx == 0:
            summary.loc[1, ['observed', 'expected', 'n_total']] += summary.loc[0, ['observed', 'expected', 'n_total']]
            summary = summary.drop(0).reset_index(drop=True)
        else:
            summary.loc[merge_idx - 1, ['observed', 'expected', 'n_total']] += summary.loc[merge_idx, ['observed', 'expected', 'n_total']]
            summary = summary.drop(merge_idx).reset_index(drop=True)


    g = len(summary)
    summary['variance'] = summary['expected'] * (1 - summary['expected'] / summary['n_total'])
    hl_statistic = ((summary['observed'] - summary['expected'])**2 / (summary['variance'] + 1e-8)).sum()


    df_hl = g - 2
    if df_hl <= 0:
        return hl_statistic, np.nan

    p_value = 1 - chi2.cdf(hl_statistic, df_hl)

    return hl_statistic, p_value



def bootstrap_metrics(y_true, y_proba, y_pred):

    METRIC_FN = {
        'roc_auc_bs': roc_auc_score,
        'pr_auc_bs': average_precision_score,
        'recall_bs': recall_score,
        'f1_bs': f1_score,
        'accuracy_bs': accuracy_score,
        'kappa_bs': cohen_kappa_score,
        'mcc_bs': matthews_corrcoef,
        'brier_bs': brier_score_loss,
        'logloss_bs': log_loss
    }

    metrics = {}
    try:
        for name, fn in METRIC_FN.items():

            if 'auc' in name or 'brier' in name or 'logloss' in name:
                if len(np.unique(y_true)) > 1:
                    metrics[name] = fn(y_true, y_proba)
                else:
                    metrics[name] = np.nan
            else:
                if len(np.unique(y_true)) > 1:
                    metrics[name] = fn(y_true, y_pred)
                else:
                    metrics[name] = np.nan
    except Exception as e:
        print(f"Error in bootstrap_metrics: {e}")

        for name in METRIC_FN.keys():
            metrics[name] = np.nan

    return metrics


CALIBRATION_STATS_KEYS = [
    'calibration_intercept', 'calibration_slope', 'z_statistic', 'z_p_value',
    'hl_statistic', 'hl_p_value', 'calibration_intercept_ci_low',
    'calibration_intercept_ci_high', 'calibration_slope_ci_low',
    'calibration_slope_ci_high', 'ece_uniform', 'mce'
]

def collect_metrics(y_true, y_proba, y_pred):

    res = {
        'roc_auc': roc_auc_score(y_true, y_proba),
        'pr_auc': average_precision_score(y_true, y_proba),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'kappa': cohen_kappa_score(y_true, y_pred),
        'mcc': matthews_corrcoef(y_true, y_pred),
        'accuracy': accuracy_score(y_true, y_pred),
        'brier': brier_score_loss(y_true, y_proba),
        'logloss': log_loss(y_true, y_proba),
    }

    try:

        n_bootstrap = CONFIG.get('n_bootstrap', 1000)
        n_bins = CONFIG['evaluation']['n_bins']


        res['ece_uniform'] = calculate_ece(y_true, y_proba, n_bins=n_bins, method='uniform')


        res['mce_quantile'] = calculate_mce(y_true, y_proba, n_bins=n_bins, method='quantile')


        intercept, slope, intercept_ci, slope_ci = calibration_slope_intercept_ci(
            y_true, y_proba, n_bootstrap=n_bootstrap
        )
        res.update({
            'calibration_intercept': intercept,
            'calibration_slope': slope,
            'calibration_intercept_ci_low': intercept_ci[0],
            'calibration_intercept_ci_high': intercept_ci[1],
            'calibration_slope_ci_low': slope_ci[0],
            'calibration_slope_ci_high': slope_ci[1]
        })

        hl_stat, hl_p = hosmer_lemeshow_test_advanced(y_true, y_proba, n_bins=n_bins)
        res['hl_statistic'] = hl_stat
        res['hl_p_value'] = hl_p

        z_stat, z_p = z_test_standard(y_true, y_proba)
        res['z_statistic'] = z_stat
        res['z_p_value'] = z_p

    except Exception as e:
        warnings.warn(f"Statistical calibration analysis failed: {e}. Filling with NaN.")


        for key in CALIBRATION_STATS_KEYS:
            res[key] = np.nan


    extended_metrics = classification_extended_metrics(y_true, y_pred)
    res.update(extended_metrics)

    return res

def classification_extended_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    return {
        'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp,
        'NPV': npv, 'PPV': ppv, 'FPR': fpr, 'FNR': fnr
    }

def calibration_curve_fixed_bins(y_true, y_prob, n_bins=5):

    edges = np.linspace(0., 1. + 1e-8, n_bins + 1)
    idx = np.digitize(y_prob, edges) - 1
    prob_true, prob_pred = np.zeros(n_bins), np.zeros(n_bins)
    for b in range(n_bins):
        m = idx == b
        if m.any():
            prob_true[b] = y_true[m].mean()
            prob_pred[b] = y_prob[m].mean()
        else:
            prob_true[b] = np.nan
            prob_pred[b] = 0.5 * (edges[b] + edges[b+1])
    return prob_true, prob_pred



def calculate_effect_sizes_and_tests(summaries, output_dir):

    os.makedirs(output_dir, exist_ok=True)
    models = [s['model'] for s in summaries]

    numeric_cols = [k for k in summaries[0].keys() if k.endswith('_mean')]
    metrics = [k[:-5] for k in numeric_cols]

    stats_records = []
    for s in summaries:
        for metric in metrics:

            mean_key = f"{metric}_mean"
            std_key = f"{metric}_std"

            if mean_key in s and std_key in s:
                mean = s[mean_key]
                std = s[std_key]
                n_folds = s.get("n_folds", CONFIG['evaluation']['outer_cv_folds'])
                ci_lo, ci_hi = ci95(mean, std, n_folds)
                stats_records.append({
                    'Metric': metric, 'Model': s['model'], 'Mean': mean, 'Std': std,
                    'N_Folds': n_folds, '95%CI_low': ci_lo, '95%CI_high': ci_hi
                })
            else:
                logging.warning(f"Missing mean or std for metric '{metric}' in model '{s['model']}'")

    stats_df = pd.DataFrame(stats_records)
    stats_df.to_csv(os.path.join(output_dir, "cv_metric_summary.csv"), index=False)

    pairwise_results = []
    for metric in metrics:

        metric_values = {s['model']: s.get(f"{metric}_mean", np.nan) for s in summaries}
        metric_stds = {s['model']: s.get(f"{metric}_std", np.nan) for s in summaries}
        metric_n_folds = {s['model']: s.get("n_folds", CONFIG['evaluation']['outer_cv_folds']) for s in summaries}

        for m1, m2 in combinations(models, 2):
            mean1, std1, n1 = metric_values[m1], metric_stds[m1], metric_n_folds[m1]
            mean2, std2, n2 = metric_values[m2], metric_stds[m2], metric_n_folds[m2]

            d = cohens_d_from_summary(mean1, std1, mean2, std2)
            pval = pairwise_pvalue(mean1, std1, n1, mean2, std2, n2)

            pairwise_results.append({
                "Metric": metric, "Model_A": m1, "Model_B": m2,
                "Diff": mean1 - mean2 if not (np.isnan(mean1) or np.isnan(mean2)) else np.nan,
                "Cohens_d": d,
                "Cohens_d_interpretation": interpret_effect_size(d),
                "p_value": pval
            })

    effect_df = pd.DataFrame(pairwise_results)
    effect_df.to_csv(os.path.join(output_dir, "cv_pairwise_effects_pvalues.csv"), index=False)
    print(f"CV-based statistical analysis saved to {output_dir}/")
    return stats_df, effect_df

def perform_statistical_tests(plot_data, threshold=0.5, auc_bootstrap_n=1000, verbose=True):

    models = list(plot_data.keys())
    results = {}
    for i, model1 in enumerate(models):
        for model2 in models[i+1:]:
            try:
                y1, p1 = plot_data[model1]['y'], plot_data[model1]['p']
                y2, p2 = plot_data[model2]['y'], plot_data[model2]['p']
                if not np.array_equal(y1, y2):
                    if verbose: print(f"Sample mismatch: {model1} vs {model2} skipped")
                    continue
                auc_diff = roc_auc_score(y1, p1) - roc_auc_score(y1, p2)
                auc_diff_pval = bootstrap_auc_difference_test(y1, p1, p2, n_bootstrap=auc_bootstrap_n)
                pred1 = (p1 >= threshold).astype(int)
                pred2 = (p2 >= threshold).astype(int)
                mcnemar_pval = perform_mcnemar_test(y1, pred1, pred2)
                results[f"{model1}_vs_{model2}"] = {
                    'auc_difference': auc_diff,
                    'auc_difference_pval': auc_diff_pval,
                    'mcnemar_pval': mcnemar_pval
                }
            except KeyError as e:
                if verbose: print(f"KeyError for {model1} or {model2}: {e}")
                continue
    return results

def bootstrap_auc_difference_test(y_true, y_prob1, y_prob2, n_bootstrap=1000, random_state=42):

    rng = np.random.default_rng(random_state)
    observed_diff = roc_auc_score(y_true, y_prob1) - roc_auc_score(y_true, y_prob2)
    n = len(y_true)
    bootstrap_diffs = []
    for _ in range(n_bootstrap):
        idx = rng.choice(n, n, replace=True)
        y_boot, p1_boot, p2_boot = y_true[idx], y_prob1[idx], y_prob2[idx]
        if len(np.unique(y_boot)) > 1:
            diff = roc_auc_score(y_boot, p1_boot) - roc_auc_score(y_boot, p2_boot)
            bootstrap_diffs.append(diff)
    if not bootstrap_diffs: return np.nan
    bootstrap_diffs = np.array(bootstrap_diffs)
    p_value = 2 * min((bootstrap_diffs >= observed_diff).mean(), (bootstrap_diffs <= observed_diff).mean())
    return p_value

def perform_mcnemar_test(y_true, y_pred1, y_pred2):
    if not MLXTEND_AVAILABLE:
        logging.warning("mlxtend not available - using basic McNemar implementation")
        n01 = np.sum((y_pred1 == 0) & (y_pred2 == 1) & (y_true == y_pred2))
        n10 = np.sum((y_pred1 == 1) & (y_pred2 == 0) & (y_true == y_pred1))
        if n01 + n10 == 0:
            return 1.0
        statistic = (abs(n01 - n10) - 1) ** 2 / (n01 + n10)
        from scipy.stats import chi2
        return 1 - chi2.cdf(statistic, 1)

    try:
        table = mcnemar_table(y_target=y_true, y_model1=y_pred1, y_model2=y_pred2)
        result = mcnemar(table, exact=False, correction=True)
        return result.pvalue
    except Exception as e:
        logging.warning(f"McNemar test failed: {e}")
        return np.nan

def apply_multiple_comparisons_correction(test_results_df, p_value_columns=None, alpha=0.05, method='both'):

    df_corrected = test_results_df.copy()
    if p_value_columns is None:
        p_value_columns = [col for col in df_corrected.columns
                           if 'pval' in col.lower() or 'p_value' in col.lower()]
    for col in p_value_columns:
        p_values = df_corrected[col].dropna().values
        valid_indices = df_corrected[col].notna()
        if len(p_values) == 0:
            continue
        if method in ['bonferroni', 'both']:
            alpha_bonferroni = alpha / len(p_values)
            bonferroni_corrected = np.minimum(p_values * len(p_values), 1.0)
            df_corrected.loc[valid_indices, f'{col}_bonferroni'] = bonferroni_corrected
            df_corrected.loc[valid_indices, f'{col}_bonferroni_sig'] = bonferroni_corrected < alpha
            df_corrected.loc[valid_indices, f'{col}_bonferroni_alpha'] = alpha_bonferroni
        if method in ['fdr', 'both']:
            _, pvals_fdr, _, _ = multipletests(p_values, alpha=alpha, method='fdr_bh')
            df_corrected.loc[valid_indices, f'{col}_fdr'] = pvals_fdr
            df_corrected.loc[valid_indices, f'{col}_fdr_sig'] = pvals_fdr < alpha
    return df_corrected

def print_correction_summary(df_corrected, original_alpha=0.05):

    print("\n 📊  Multiple Comparisons Correction Summary:")
    print("=" * 50)
    p_cols = [col for col in df_corrected.columns if 'pval' in col.lower() and not any(x in col for x in ['bonferroni', 'fdr'])]
    for col in p_cols:
        if col in df_corrected.columns:
            original_sig = (df_corrected[col] < original_alpha).sum()
            total_tests = df_corrected[col].notna().sum()
            print(f"\n{col.upper()}:")
            print(f"  Original significant results (p < {original_alpha}): {original_sig}/{total_tests}")
            if f'{col}_bonferroni_sig' in df_corrected.columns:
                bonf_sig = df_corrected[f'{col}_bonferroni_sig'].sum()
                bonf_alpha = df_corrected[f'{col}_bonferroni_alpha'].iloc[0] if f'{col}_bonferroni_alpha' in df_corrected.columns else 'N/A'
                print(f"  Bonferroni significant (α = {bonf_alpha:.4f}): {bonf_sig}/{total_tests}")
            if f'{col}_fdr_sig' in df_corrected.columns:
                fdr_sig = df_corrected[f'{col}_fdr_sig'].sum()
                print(f"  FDR significant (α = {original_alpha}): {fdr_sig}/{total_tests}")



class HybridFeatureSelector:

    def __init__(self, cfg: Dict[str, Any]):

        self.n_features = cfg['feature_selection']['n_features']
        self.shap_weight = cfg['feature_selection']['shap_weight']
        self.tree_weight = cfg['feature_selection']['tree_weight']
        self.max_samples_shap = cfg['feature_selection']['max_samples_shap']
        self.random_state = cfg['random_state']
        self.selected_features_ = None

    def _calculate_shap_importances(self, X, model):
        try:
            explainer = shap.TreeExplainer(model)
            X_samp = X.sample(min(self.max_samples_shap, len(X)), random_state=CONFIG['random_state'])


            X_samp = X_samp.apply(pd.to_numeric, errors='coerce').fillna(0)

            shap_vals = explainer.shap_values(X_samp)


            if isinstance(shap_vals, list):
                shap_arr = np.abs(shap_vals[1]) if len(shap_vals) > 1 else np.abs(shap_vals[0])
            else:
                shap_arr = np.abs(shap_vals)

            return pd.Series(shap_arr.mean(axis=0), index=X_samp.columns)
        except Exception as e:
            logging.warning(f"SHAP calculation failed: {e}. Using uniform importances.")
            logging.warning(traceback.format_exc())
            return pd.Series(1.0, index=X.columns)

    def fit(self, X, y, model):

        if not hasattr(model, 'fit'):
            raise ValueError("Model must be fitted before feature selection")


        tree_models_supporting_shap = (XGBClassifier, lgb.LGBMClassifier, CatBoostClassifier)
        if isinstance(model, tree_models_supporting_shap) and hasattr(model, 'feature_importances_'):
            shap_imp = self._calculate_shap_importances(X, model)
        else:

            shap_imp = pd.Series(1.0, index=X.columns)

        if hasattr(model, 'feature_importances_'):
            tree_imp = pd.Series(model.feature_importances_, index=X.columns)
        elif hasattr(model, 'coef_'):

            coef = model.coef_
            if coef.ndim > 1:
                coef = coef[0]
            tree_imp = pd.Series(np.abs(coef), index=X.columns)
        else:

            tree_imp = pd.Series(1.0, index=X.columns)


        shap_imp, tree_imp = shap_imp.align(tree_imp, fill_value=0)


        if shap_imp.sum() == 0 and tree_imp.sum() == 0:
            combined = pd.Series(np.ones(len(X.columns)), index=X.columns)
        else:

            if shap_imp.sum() > 0:
                shap_imp = shap_imp / shap_imp.sum()
            if tree_imp.sum() > 0:
                tree_imp = tree_imp / tree_imp.sum()


            shap_rank = shap_imp.rank(ascending=False)
            tree_rank = tree_imp.rank(ascending=False)
            combined = self.shap_weight * shap_rank + self.tree_weight * tree_rank


        self.selected_features_ = combined.nsmallest(self.n_features).index.tolist()
        return self

    def transform(self, X):
        if self.selected_features_ is None:
            raise ValueError("Selector not fitted yet. Call fit() first.")
        return X[self.selected_features_]

    def fit_transform(self, X, y, model):
        return self.fit(X, y, model).transform(X)



class NestedCVPipeline:

    def __init__(self, X: pd.DataFrame, y: pd.Series, cfg: Dict[str, Any]):
        self.X = X
        self.y = y
        self.cfg = cfg
        self.models = {
            'LogisticRegression': LogisticRegression,
            'RandomForest': RandomForestClassifier,
            'XGBoost': XGBClassifier,
            'LightGBM': lgb.LGBMClassifier,
            'CatBoost': CatBoostClassifier,
            'SVM': SVC
        }
        self.log = setup_logger(Config.from_env())

    def _get_model_params(self, model_name: str, trial: optuna.Trial) -> Dict[str, Any]:

        params: Dict[str, Any] = {"random_state": self.cfg['random_state']}
        is_boosting_model = model_name in ['XGBoost', 'LightGBM', 'CatBoost']

        if is_boosting_model:
            param_name = 'iterations' if model_name == 'CatBoost' else 'n_estimators'
            params[param_name] = 2000
            params['early_stopping_rounds'] = trial.suggest_int('early_stopping_rounds', 10, self.cfg['early_stopping']['rounds'])

        if model_name == 'XGBoost':
            params.update({
                'max_depth': trial.suggest_int('max_depth', 3, 6),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
                'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                'n_jobs': 1,
                'random_state': self.cfg['random_state'],
                'enable_categorical': False,
                'tree_method': 'hist',
                'verbosity': 0
            })
        elif model_name == 'RandomForest':
            params.update({
                'n_estimators': trial.suggest_int('n_estimators', 50, 400),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
                'n_jobs': 1, 'random_state': self.cfg['random_state']
            })
        elif model_name == 'LogisticRegression':
            penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
            solver = trial.suggest_categorical(
                'solver', ['liblinear', 'saga', 'lbfgs', 'newton-cg', 'sag']
            )
            if (
                (penalty == 'l1' and solver not in ['liblinear', 'saga']) or
                (penalty == 'l2' and solver not in ['liblinear', 'saga', 'lbfgs', 'newton-cg', 'sag']) or
                (penalty == 'elasticnet' and solver != 'saga')
            ):
                raise optuna.TrialPruned()
            params.update({
                'C': trial.suggest_float('C', 0.001, 100.0, log=True),
                'penalty': penalty, 'solver': solver,
                'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
                'max_iter': 2000, 'random_state': CONFIG['random_state']
            })
            if penalty == 'elasticnet':
                params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.1, 0.9)
        elif model_name == 'CatBoost':
            bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli'])
            params.update({
                'depth': trial.suggest_int('depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 1000, log=True),
                'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
                'bootstrap_type': bootstrap_type,
                'auto_class_weights': trial.suggest_categorical('auto_class_weights', [None, 'Balanced']),
                'border_count': trial.suggest_int('border_count', 32, 128),
                'random_state': self.cfg['random_state'], 'logging_level': 'Silent'
            })
            if bootstrap_type == 'Bernoulli':
                params['subsample'] = trial.suggest_float('subsample', 0.6, 1.0)
            elif bootstrap_type == 'Bayesian':
                params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)

        elif model_name == 'SVM':
            kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
            params.update({
                'C': trial.suggest_float('C', 0.01, 10.0, log=True),
                'kernel': kernel, 'probability': True,
                'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
                'random_state': self.cfg['random_state']
            })
            if kernel == 'rbf':
                params['gamma'] = trial.suggest_categorical('gamma', ['scale', 'auto'])
        elif model_name == 'LightGBM':
            params.update({
                'max_depth': trial.suggest_int('max_depth', 3, 6),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
                'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10.0, log=True),
                'n_jobs': 1, 'random_state': self.cfg['random_state'], 'verbosity': -1
            })
        return params

    def _create_model(self, model_name, params):
        return self.models[model_name](**params)



    def _preprocess_to_df(self, pre: ColumnTransformer, X_df_raw: pd.DataFrame, feat_names: List[str]) -> pd.DataFrame:

        X_tx = pre.transform(X_df_raw)

        if hasattr(X_tx, 'toarray'):
             X_tx = X_tx.toarray()


        current_cols_count = X_tx.shape[1]


        if current_cols_count != len(feat_names):
             temp_cols = [f'f{i}' for i in range(current_cols_count)]
             temp_df = pd.DataFrame(X_tx, columns=temp_cols, index=X_df_raw.index)


             try:
                 temp_df = pd.DataFrame(X_tx, columns=feat_names, index=X_df_raw.index)
             except ValueError:

                 placeholder_cols = [f'_col_{i}' for i in range(current_cols_count)]
                 temp_df = pd.DataFrame(X_tx, columns=placeholder_cols, index=X_df_raw.index)

                 aligned_df = temp_df.reindex(columns=feat_names, fill_value=0.0)
                 return aligned_df

             return temp_df
        else:

             return pd.DataFrame(X_tx, columns=feat_names, index=X_df_raw.index)



    def _preprocess_for_xgboost(self, X_train: pd.DataFrame, X_test: pd.DataFrame, pre: ColumnTransformer, feat_names: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:


        X_train_df = self._preprocess_to_df(pre, X_train, feat_names)
        X_test_df = self._preprocess_to_df(pre, X_test, feat_names)


        X_train_processed = X_train_df.apply(pd.to_numeric, errors='coerce').fillna(0)
        X_test_processed = X_test_df.apply(pd.to_numeric, errors='coerce').fillna(0)

        return X_train_processed, X_test_processed

    def _optimize_inner(self, X_train: pd.DataFrame, y_train: pd.Series, model_name: str, pre: ColumnTransformer, feat_names: List[str]) -> Dict[str, Any]:

        def objective(trial: optuna.Trial) -> float:
            params = self._get_model_params(model_name, trial)
            is_boosting = model_name in ['XGBoost', 'LightGBM', 'CatBoost']
            early_stopping_rounds = params.pop('early_stopping_rounds', 30) if is_boosting else None


            eval_metric = None
            if model_name == 'XGBoost':
                eval_metric = 'logloss'
            elif model_name == 'LightGBM':
                eval_metric = 'binary_logloss'
            elif model_name == 'CatBoost':
                eval_metric = 'Logloss'

            if eval_metric and is_boosting:
                params['eval_metric'] = eval_metric



            sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                         random_state=self.cfg['random_state'])
            for tr_idx, val_idx in sss.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]


            if len(np.unique(y_tr)) < 2 or len(np.unique(y_val)) < 2:
                raise optuna.TrialPruned()

            X_tr_df = self._preprocess_to_df(pre, X_tr, feat_names)
            X_val_df = self._preprocess_to_df(pre, X_val, feat_names)


            if model_name == 'XGBoost':
                X_tr_df = X_tr_df.apply(pd.to_numeric, errors='coerce').fillna(0)
                X_val_df = X_val_df.apply(pd.to_numeric, errors='coerce').fillna(0)


            base = self._create_model(model_name, params)

            fit_args = {}
            if is_boosting:
                fit_args['eval_set'] = [(X_val_df, y_val)]
                fit_args['early_stopping_rounds'] = early_stopping_rounds
                fit_args['verbose'] = False

            if model_name == 'LightGBM':
                try:
                    import lightgbm as lgb
                    callbacks = [lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False), lgb.log_evaluation(0)]
                    fit_args['callbacks'] = callbacks
                    fit_args['eval_metric'] = params.get('eval_metric', 'logloss')
                    base.fit(X_tr_df, y_tr, **fit_args)
                except Exception:
                    base.fit(X_tr_df, y_tr)
            elif model_name == 'XGBoost':
               try:
                   base.fit(
                       X_tr_df, y_tr,
                       eval_set=fit_args['eval_set'],
                       early_stopping_rounds=fit_args['early_stopping_rounds'],
                       verbose=False
                   )
               except Exception:
                   base.fit(X_tr_df, y_tr)
            elif model_name == 'CatBoost':
                try:

                    base.fit(X_tr_df, y_tr, eval_set=fit_args['eval_set'], early_stopping_rounds=fit_args['early_stopping_rounds'], verbose=fit_args['verbose'])
                except Exception:
                    base.fit(X_tr_df, y_tr)
            else:
                base.fit(X_tr_df, y_tr)


            selector = HybridFeatureSelector(self.cfg)
            selector.fit(X_tr_df, y_tr, base)
            X_tr_sel = selector.transform(X_tr_df)
            X_val_sel = selector.transform(X_val_df)


            final_model = self._create_model(model_name, params)

            fit_args = {}

            is_boosting = model_name in ['XGBoost', 'LightGBM', 'CatBoost']
            if is_boosting:
                early_stopping_rounds = params.get('early_stopping_rounds', 30)
                fit_args['early_stopping_rounds'] = early_stopping_rounds
                fit_args['verbose'] = False

            if model_name == 'LightGBM' or model_name == 'XGBoost' or model_name == 'CatBoost':
                fit_args['eval_set'] = [(X_val_sel, y_val)]


            if model_name == 'LightGBM':
                final_model.fit(X_tr_sel, y_tr, callbacks=fit_args.get('callbacks'), eval_set=fit_args['eval_set'], eval_metric=fit_args.get('eval_metric'))
            elif model_name == 'XGBoost':
                final_model.fit(X_tr_sel, y_tr, eval_set=fit_args['eval_set'], early_stopping_rounds=early_stopping_rounds, verbose=False)
            elif model_name == 'CatBoost':
                final_model.fit(X_tr_sel, y_tr, eval_set=fit_args['eval_set'], early_stopping_rounds=early_stopping_rounds, verbose=False)
            else:
                final_model.fit(X_tr_sel, y_tr)



            if hasattr(final_model, 'predict_proba'):
                y_prob = final_model.predict_proba(X_val_sel)[:, 1]
            else:
                y_prob = _decision_to_proba(final_model, X_val_sel)


            try:
                logloss = log_loss(y_val, y_prob)
                return -logloss
            except ValueError:
                return -999.0


        study = optuna.create_study(
            direction='maximize',
            study_name=f'opt_{model_name}',
            sampler=TPESampler(seed=self.cfg['random_state'])
        )

        study.optimize(
            objective,
            n_trials=self.cfg['optimization']['n_trials'],
            n_jobs=1,
            timeout=None
        )


        best_params = study.best_params
        is_boosting = model_name in ['XGBoost', 'LightGBM', 'CatBoost']

        if is_boosting:
            param_name = 'iterations' if model_name == 'CatBoost' else 'n_estimators'
            best_params[param_name] = 2000


            best_params.pop('early_stopping_rounds', None)
            best_params.pop('eval_metric', None)
            best_params['random_state'] = self.cfg['random_state']


        self.log.info(f"       Validating best parameters with {self.cfg['optimization']['inner_cv_folds']}-fold CV...")
        cv_scores = self._validate_params_with_cv(X_train, y_train, model_name, best_params.copy(), pre, feat_names)
        self.log.info(f"       Inner CV AUC performance: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")

        return best_params

    def _validate_params_with_cv(self, X_train: pd.DataFrame, y_train: pd.Series, model_name: str, params: Dict[str, Any], pre: ColumnTransformer, feat_names: List[str]) -> List[float]:

        inner_cv = StratifiedKFold(
            n_splits=self.cfg['optimization']['inner_cv_folds'],
            shuffle=True,
            random_state=self.cfg['random_state']
        )
        cv_scores = []
        cv_params = params.copy()
        cv_params.pop('early_stopping_rounds', None)

        for inner_fold, (tr_idx, val_idx) in enumerate(inner_cv.split(X_train, y_train)):
            X_tr_fold, X_val_fold = X_train.iloc[tr_idx], X_train.iloc[val_idx]
            y_tr_fold, y_val_fold = y_train.iloc[tr_idx], y_train.iloc[val_idx]


            X_tr_df = self._preprocess_to_df(pre, X_tr_fold, feat_names)
            X_val_df = self._preprocess_to_df(pre, X_val_fold, feat_names)

            if model_name == 'XGBoost':
                X_tr_df = X_tr_df.apply(pd.to_numeric, errors='coerce').fillna(0)
                X_val_df = X_val_df.apply(pd.to_numeric, errors='coerce').fillna(0)


            base_for_selector = self._create_model(model_name, cv_params)
            base_for_selector.fit(X_tr_df, y_tr_fold)

            selector = HybridFeatureSelector(self.cfg)
            selector.fit(X_tr_df, y_tr_fold, base_for_selector)


            X_tr_sel = selector.transform(X_tr_df)
            X_val_sel = selector.transform(X_val_df)



            final_model = self._create_model(model_name, cv_params)
            final_model.fit(X_tr_sel, y_tr_fold)

            if hasattr(final_model, 'predict_proba'):
                y_prob = final_model.predict_proba(X_val_sel)[:, 1]
            else:
                y_prob = _decision_to_proba(final_model, X_val_sel)

            cv_scores.append(roc_auc_score(y_val_fold, y_prob))

        return cv_scores

    def _train_calibrate(self, X_tr: pd.DataFrame, y_tr: pd.Series, model_name: str, params: Dict[str, Any], pre: ColumnTransformer, feat_names: List[str]):

        if model_name == 'XGBoost':
            X_tr_df, _ = self._preprocess_for_xgboost(X_tr, X_tr, pre, feat_names)
        else:
            X_tr_df = self._preprocess_to_df(pre, X_tr, feat_names)

        params.pop('early_stopping_rounds', None)
        params.pop('eval_metric', None)

        base_for_selector = self._create_model(model_name, params)
        base_for_selector.fit(X_tr_df, y_tr)

        selector = HybridFeatureSelector(self.cfg)
        selector.fit(X_tr_df, y_tr, base_for_selector)
        X_tr_sel = selector.transform(X_tr_df)

        base_model = self._create_model(model_name, params)

        if model_name == 'SVM':

            base_model.set_params(probability=True)

        calibrated = CalibratedClassifierCV(
            base_model,
            method=self.cfg['evaluation']['calibration_method'],
            cv=self.cfg['evaluation']['calibration_cv_folds']
        )
        calibrated.fit(X_tr_sel, y_tr)
        return calibrated, selector


    def run_pipeline(self):

        self.log.info("Starting nested cross-validation pipeline...")
        outer = StratifiedKFold(self.cfg['evaluation']['outer_cv_folds'], shuffle=True, random_state=self.cfg['random_state'])
        summaries, plot_data = [], {}

        for model_name in self.models:
            self.log.info(f"=== Processing {model_name} ===")
            fold_metrics, all_y, all_p = [], [], []
            model_params = {}

            for fold_id, (tr_idx, te_idx) in enumerate(outer.split(self.X, self.y), 1):

                try:
                    X_tr, X_te = self.X.iloc[tr_idx], self.X.iloc[te_idx]
                    y_tr, y_te = self.y.iloc[tr_idx], self.y.iloc[te_idx]

                    if len(np.unique(y_tr)) < 2 or len(np.unique(y_te)) < 2:
                        self.log.warning(f"Fold {fold_id}: single-class – skipped")
                        continue
                    pre = create_preprocessor(X_tr).fit(X_tr)
                    feat_names = get_feature_names(pre, X_tr)


                    best_params = self._optimize_inner(X_tr, y_tr, model_name, pre, feat_names)


                    calibrated, selector = self._train_calibrate(X_tr, y_tr, model_name, best_params.copy(), pre, feat_names)


                    if model_name == 'XGBoost':
                        X_te_df, _ = self._preprocess_for_xgboost(X_te, X_te, pre, feat_names)
                    else:
                        X_te_df = self._preprocess_to_df(pre, X_te, feat_names)
                    X_te_sel = selector.transform(X_te_df)

                    y_proba = calibrated.predict_proba(X_te_sel)[:, 1]
                    y_pred = (y_proba >= 0.5).astype(int)

                    m = collect_metrics(y_te.to_numpy(), y_proba, y_pred)
                    fold_metrics.append(m)


                    artifact_path = self.cfg['output_dirs']['models'] / f"{model_name}_fold{fold_id}_artifact.pkl"
                    joblib.dump({'calibrated_model': calibrated, 'preprocessor': pre, 'selector': selector, 'best_params': best_params}, artifact_path)

                    all_y.append(y_te.to_numpy())
                    all_p.append(y_proba)
                    gc.collect()

                except Exception:
                    self.log.error(f"Catastrophic error in {model_name} fold {fold_id}:\n{traceback.format_exc()}")
                    continue

            if fold_metrics:
                df_fold = pd.DataFrame(fold_metrics)
                numeric_cols = df_fold.select_dtypes(include=[np.number]).columns
                summary = {f"{c}_mean": df_fold[c].mean() for c in numeric_cols}
                summary.update({f"{c}_std": df_fold[c].std() for c in numeric_cols})
                summary['model'] = model_name
                summary['n_folds'] = len(fold_metrics)
                summaries.append(summary)

                plot_data[model_name] = {'y': np.concatenate(all_y), 'p': np.concatenate(all_p)}


                try:
                    pre_full = create_preprocessor(self.X).fit(self.X)
                    feat_names_full = get_feature_names(pre_full, self.X)
                    best_full = self._optimize_inner(self.X, self.y, model_name, pre_full, feat_names_full)
                    model_params['final_model'] = best_full

                    param_path = self.cfg['output_dirs']['models'].parent / "best_params" / f'{model_name}_params.json'
                    with open(param_path, 'w') as f:
                        json.dump(model_params, f, indent=4)

                    calibrated_full, selector_full = self._train_calibrate(self.X, self.y, model_name, best_full.copy(), pre_full, feat_names_full)

                    final_path = self.cfg['output_dirs']['models'] / f"{model_name}_final_calibrated_model.pkl"
                    joblib.dump({'calibrated_model': calibrated_full, 'preprocessor': pre_full, 'selector': selector_full,
                                 'metadata': {'selected_features': selector_full.selected_features_, 'model_name': model_name, 'best_params': best_full}}, final_path)

                    self.log.info(f"Final model {model_name} complete and saved.")
                except Exception:
                    self.log.error(f"Error training final {model_name}:\n{traceback.format_exc()}")
            else:
                self.log.warning(f"{model_name} failed completely - skipping final analysis.")


        if summaries:
            self.log.info("Performing statistical analysis...")


            metrics_df = pd.DataFrame(summaries)
            metrics_df.to_csv(self.cfg['output_dirs']['metrics'] / 'final_metrics.csv', index=False)


            global CONFIG
            CONFIG = self.cfg


            _, effect_df = calculate_effect_sizes_and_tests(summaries, output_dir=CONFIG['output_dirs']['metrics'])
            effect_df['Comparison'] = effect_df['Model_A'] + '_vs_' + effect_df['Model_B']

            statistical_tests_dict = perform_statistical_tests(plot_data)
            test_records = [{'Comparison': comp, **tests} for comp, tests in statistical_tests_dict.items()]
            test_df = pd.DataFrame(test_records)
            test_df.to_csv(os.path.join(CONFIG['output_dirs']['metrics'], 'prediction_statistical_tests.csv'), index=False)

            print("\n 🔧  Applying multiple comparisons corrections...")
            test_df_corrected = test_df.copy()
            if not effect_df.empty:
                effect_df_corrected = apply_multiple_comparisons_correction(effect_df, p_value_columns=['p_value'], alpha=0.05, method='both')
                effect_df_corrected.to_csv(os.path.join(CONFIG['output_dirs']['metrics'], 'cv_pairwise_effects_corrected.csv'), index=False)
                print_correction_summary(effect_df_corrected)

            if not test_df.empty:
                test_df_corrected = apply_multiple_comparisons_correction(test_df, p_value_columns=['auc_difference_pval', 'mcnemar_pval'], alpha=0.05, method='both')
                test_df_corrected.to_csv(os.path.join(CONFIG['output_dirs']['metrics'], 'prediction_statistical_tests_corrected.csv'), index=False)
                print_correction_summary(test_df_corrected)


            print("\n 🎨 Generating plots...")
            self._make_plots(summaries, plot_data, effect_df, test_df_corrected)
            print(f"All plots saved to {CONFIG['output_dirs']['plots']}/")


            print("\n 📈  Statistical Summary of AUC Differences (Bootstrap Test):")
            if not test_df_corrected.empty:
                pval_col = 'auc_difference_pval_fdr' if 'auc_difference_pval_fdr' in test_df_corrected.columns else 'auc_difference_pval'
                sig_tests = test_df_corrected[test_df_corrected[pval_col] < 0.05]

                if not sig_tests.empty:
                    print(f"Significant differences (FDR p < 0.05):")
                    for _, row in sig_tests.iterrows():
                        print(f"   {row['Comparison']}: ΔAUC={row['auc_difference']:.3f}, p-fdr={row[pval_col]:.3f}")
                else:
                    print("No significant AUC differences found between models (FDR p < 0.05).")
            else:
                print("No test results available to summarize.")
        else:
            print("\n 📊  No model summaries available for statistical analysis and plotting.")

        return summaries, plot_data


    def _make_plots(self, summaries, plot_data, effect_df, test_df):

        try:
            matplotlib.rcParams.update({
                'font.family': 'DejaVu Sans', 'font.size': 6, 'axes.titlesize': 6,
                'axes.labelsize': 6, 'xtick.labelsize': 6, 'ytick.labelsize': 6,
                'legend.fontsize': 6, 'figure.dpi': CONFIG['plotting']['dpi']
            })
            colors = sns.color_palette("colorblind")


            plt.figure(figsize=(3.5, 3.5))
            for i, (m, d) in enumerate(plot_data.items()):
                pt, pp = calibration_curve_fixed_bins(d['y'], d['p'], CONFIG['evaluation']['n_bins'])
                plt.plot(pp, pt, 'o-', label=m, color=colors[i % len(colors)],
                         linewidth=0.7, markersize=2, markeredgewidth=0.7)
            plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated', linewidth=0.7)
            plt.xlabel('Mean Predicted Probability'); plt.ylabel('Fraction of Positives')
            plt.title("Calibration Curves (Reliability Diagram)"); plt.legend(loc='best')
            plt.grid(True, alpha=0.5); plt.tight_layout()
            plt.savefig(os.path.join(CONFIG['output_dirs']['plots'], 'calibration_all_models.png'))
            plt.close()


            dfm = pd.DataFrame(summaries)
            metrics = ['roc_auc_mean', 'pr_auc_mean', 'f1_mean', 'accuracy_mean', 'kappa_mean', 'mcc_mean']


            metrics = [m for m in metrics if m in dfm.columns]
            if len(metrics) > 0:
                dfr = dfm[['model'] + metrics].copy()
                for c in metrics:
                    dfr[c] = (dfr[c] - dfr[c].min()) / (dfr[c].max() - dfr[c].min() + 1e-9)

                angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist() + [0]
                fig, ax = plt.subplots(subplot_kw={'polar': True}, figsize=(3.5, 3.5))
                matplotlib.rcParams['font.family'] = 'DejaVu Sans'

                for i, row in dfr.iterrows():
                    vals = row[metrics].tolist() + [row[metrics[0]]]
                    ax.plot(angles, vals, label=row['model'], color=colors[i % len(colors)], linewidth=0.8)
                    ax.fill(angles, vals, alpha=0.1, color=colors[i % len(colors)])

                metric_labels = [m.replace('_mean', '').replace('_', ' ').upper() for m in metrics]
                ax.set_thetagrids(np.degrees(angles[:-1]), metric_labels, fontsize=6)
                ax.set_ylim(0, 1)
                ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
                ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], fontsize=5)
                ax.grid(True, alpha=0.3, linewidth=0.5)
                ax.legend(
                    loc='upper center',
                    bbox_to_anchor=(0.5, -0.15),
                    fontsize=6,
                    frameon=False,
                    borderaxespad=0,
                    ncol=3
                )
                plt.title('Model Performance Radar', fontsize=7, pad=20)
                plt.tight_layout()
                plt.savefig(os.path.join(CONFIG['output_dirs']['plots'], 'radar_chart.png'), dpi=450, bbox_inches='tight', facecolor='white', edgecolor='none')
                plt.close()
            else:
                logging.warning("Skipping radar chart: Not enough metrics available.")



            dfm = pd.DataFrame(summaries)
            if 'model' not in dfm.columns or 'roc_auc_mean' not in dfm.columns or 'pr_auc_mean' not in dfm.columns:

                logging.warning("Cannot align plot legends: Model summary data is missing required columns (model, roc_auc_mean, pr_auc_mean).")
                return


            mean_auc_lookup = dfm.set_index('model')['roc_auc_mean'].to_dict()
            mean_ap_lookup = dfm.set_index('model')['pr_auc_mean'].to_dict()


            plt.figure(figsize=(3.5, 3.5))
            for i, (m, d) in enumerate(plot_data.items()):

                fpr, tpr, _ = roc_curve(d['y'], d['p'])


                mean_auc = mean_auc_lookup.get(m, np.nan)


                label_text = f"{m} (AUC={mean_auc:.3f})" if not np.isnan(mean_auc) else f"{m} (AUC=N/A)"

                plt.plot(fpr, tpr, label=label_text, color=colors[i % len(colors)], linewidth=0.7)

            plt.plot([0, 1], [0, 1], 'k--', label='Chance', linewidth=0.7)
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('ROC Curves')
            plt.legend(loc='lower right')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig(os.path.join(CONFIG['output_dirs']['plots'], 'roc_curves.png'))
            plt.close()


            plt.figure(figsize=(3.5, 3.5))
            for i, (m, d) in enumerate(plot_data.items()):

                precision, recall, _ = precision_recall_curve(d['y'], d['p'])


                mean_ap = mean_ap_lookup.get(m, np.nan)


                label_text = f"{m} (AP={mean_ap:.3f})" if not np.isnan(mean_ap) else f"{m} (AP=N/A)"

                plt.plot(recall, precision, label=label_text, color=colors[i % len(colors)], linewidth=0.7)

            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision-Recall Curves')
            plt.legend(loc='best')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig(os.path.join(CONFIG['output_dirs']['plots'], 'pr_curves.png'))
            plt.close()


            if not effect_df.empty:
                heatmap_df = effect_df.pivot(index='Metric', columns='Comparison', values='Cohens_d')
                plt.figure(figsize=(max(6, len(heatmap_df.columns)*0.5), max(4, len(heatmap_df.index)*0.4)))
                sns.heatmap(heatmap_df, annot=True, fmt=".2f", cmap='RdBu_r', vmin=-1.5, vmax=1.5,
                            cbar_kws={'label': "Cohen's d Effect Size"},
                            annot_kws={'fontsize': 6, 'fontname': 'DejaVu Sans'},
                            linewidths=0.5, linecolor='lightgrey', square=False)
                plt.title("Effect Sizes (Cohen's d) Between Models Across CV Folds", fontsize=6, pad=8)
                plt.ylabel('', fontsize=6); plt.xlabel('', fontsize=6)
                plt.xticks(rotation=45, ha='right', fontsize=6, fontname='DejaVu Sans')
                plt.yticks(rotation=0, fontsize=6, fontname='DejaVu Sans')
                plt.tight_layout()
                plt.savefig(os.path.join(CONFIG['output_dirs']['plots'], 'effect_sizes_heatmap.png'))
                plt.close()
            else:
                logging.warning("Skipping effect size heatmap: effect_df is empty.")


            if not test_df.empty:
                fig, ax = plt.subplots(figsize=(max(6, len(test_df)*0.5), 4.0))


                pval_col = 'auc_difference_pval_fdr' if 'auc_difference_pval_fdr' in test_df.columns else 'auc_difference_pval'
                p_values = test_df[pval_col]

                bar_colors = ['darkred' if p < 0.001 else 'red' if p < 0.01 else 'orange' if p < 0.05 else 'lightblue' for p in p_values]
                bars = ax.bar(test_df['Comparison'], test_df['auc_difference'], color=bar_colors, edgecolor='black', linewidth=0.7)
                ax.axhline(y=0, color='black', linestyle='-', alpha=0.7, linewidth=0.7)

                for i, p_val in enumerate(p_values):
                    height = bars[i].get_height()
                    significance = '***' if p_val < 0.001 else '**' if p_val < 0.01 else '*' if p_val < 0.05 else ''
                    if significance:
                        ax.text(bars[i].get_x() + bars[i].get_width()/2., height + (0.01 if height >= 0 else -0.01),
                                significance, ha='center', va='bottom' if height >= 0 else 'top', fontsize=6, fontname='DejaVu Sans')

                ax.set_ylabel('AUC Difference\n(Model A - Model B)', fontsize=6, fontname='DejaVu Sans')
                ax.set_xlabel('Model Comparisons', fontsize=6, fontname='DejaVu Sans')
                ax.set_title(f'Statistical Significance of AUC Differences (Bootstrap Test, FDR corrected)', fontsize=6)
                plt.xticks(rotation=45, ha='left', fontsize=6, fontname='DejaVu Sans')
                legend_elements = [
                    plt.Rectangle((0,0),1,1, color='darkred', label='p < 0.001 (***)'),
                    plt.Rectangle((0,0),1,1, color='red', label='p < 0.01 (**)'),
                    plt.Rectangle((0,0),1,1, color='orange', label='p < 0.05 (*)'),
                    plt.Rectangle((0,0),1,1, color='lightblue', label='NS (p ≥ 0.05)')]
                ax.legend(handles=legend_elements, loc='upper right', title='Significance Level', title_fontsize=6, frameon=False)
                plt.grid(True, alpha=0.2, axis='y', linewidth=0.5)
                ax.set_axisbelow(True)
                plt.tight_layout()
                plt.savefig(os.path.join(CONFIG['output_dirs']['plots'], 'statistical_significance.png'))
                plt.close()
            else:
                logging.warning("Skipping statistical significance bar plot: test_df is empty.")

        except Exception as e:
            logging.error(f"Error during plotting in _make_plots: {e}\n{traceback.format_exc()}")
            print(f"       ❌   Error during plotting: {e}")





if __name__ == '__main__':
    try:

        cfg_obj = Config.from_env()
        CONFIG = cfg_obj.to_dict()
        log = setup_logger(cfg_obj)

        log.info("Loading training data...")

        train_csv_path = CONFIG['data']['train_csv']
        target_col = CONFIG['data']['target']

        if not Path(train_csv_path).exists():
            log.critical(f"Training data file '{train_csv_path}' not found.")
            sys.exit(1)

        df_train = pd.read_csv(train_csv_path)
        if target_col not in df_train.columns:
            log.critical(f"Target column '{target_col}' not found in training data.")
            sys.exit(1)

        X_train = df_train.drop(columns=[target_col])
        y_train = df_train[target_col].astype(int)

        log.info(f"Data shape: {X_train.shape} | Class balance:\n{y_train.value_counts(normalize=True)}")

        pipeline = NestedCVPipeline(X_train, y_train, CONFIG)
        summaries, plot_data = pipeline.run_pipeline()

        if summaries and plot_data:
            log.info("=== Pipeline Complete ===")
        else:
            log.warning("Pipeline finished but no models completed successfully.")

    except Exception:
        log.critical(f"A critical error occurred:\n{traceback.format_exc()}")
        sys.exit(1)

[I 2025-12-09 16:52:10,705] A new study created in memory with name: opt_LogisticRegression
[I 2025-12-09 16:52:10,771] Trial 0 finished with value: -0.5973404148633357 and parameters: {'penalty': 'l2', 'solver': 'sag', 'C': 1.0129197956845732, 'class_weight': None}. Best is trial 0 with value: -0.5973404148633357.
[I 2025-12-09 16:52:10,775] Trial 1 pruned. 
[I 2025-12-09 16:52:10,856] Trial 2 finished with value: -0.5974324177366267 and parameters: {'penalty': 'l2', 'solver': 'newton-cg', 'C': 0.37253938395788866, 'class_weight': None}. Best is trial 0 with value: -0.5973404148633357.
[I 2025-12-09 16:52:10,976] Trial 3 finished with value: -0.5972367747695909 and parameters: {'penalty': 'l1', 'solver': 'saga', 'C': 2.6373339933815254, 'class_weight': None}. Best is trial 3 with value: -0.5972367747695909.
[I 2025-12-09 16:52:11,050] Trial 4 finished with value: -0.6760530766001178 and parameters: {'penalty': 'elasticnet', 'solver': 'saga', 'C': 0.008399864445957504, 'class_weight': 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1000:	learn: 0.3932013	total: 3.37s	remaining: 3.36s
1001:	learn: 0.3931727	total: 3.37s	remaining: 3.36s
1002:	learn: 0.3930890	total: 3.37s	remaining: 3.35s
1003:	learn: 0.3930468	total: 3.38s	remaining: 3.35s
1004:	learn: 0.3929821	total: 3.38s	remaining: 3.35s
1005:	learn: 0.3928601	total: 3.38s	remaining: 3.34s
1006:	learn: 0.3927549	total: 3.39s	remaining: 3.34s
1007:	learn: 0.3927357	total: 3.39s	remaining: 3.34s
1008:	learn: 0.3926929	total: 3.39s	remaining: 3.33s
1009:	learn: 0.3926649	total: 3.4s	remaining: 3.33s
1010:	learn: 0.3925282	total: 3.4s	remaining: 3.33s
1011:	learn: 0.3925055	total: 3.4s	remaining: 3.32s
1012:	learn: 0.3924399	total: 3.41s	remaining: 3.32s
1013:	learn: 0.3924039	total: 3.41s	remaining: 3.31s
1014:	learn: 0.3922819	total: 3.41s	remaining: 3.31s
1015:	learn: 0.3922479	total: 3.42s	remaining: 3.31s
1016:	learn: 0.3921858	total: 3.42s	remaining: 3.31s
1017:	learn: 0.3921446	total: 3.42s	r

[I 2025-12-09 17:19:01,963] A new study created in memory with name: opt_CatBoost
[I 2025-12-09 17:19:14,464] Trial 0 finished with value: -0.5073328322115354 and parameters: {'early_stopping_rounds': 17, 'bootstrap_type': 'Bayesian', 'depth': 7, 'learning_rate': 0.0020513382630874496, 'l2_leaf_reg': 2.937538457632828, 'min_data_in_leaf': 3, 'auto_class_weights': None, 'border_count': 100, 'bagging_temperature': 0.020584494295802447}. Best is trial 0 with value: -0.5073328322115354.
[I 2025-12-09 17:19:18,718] Trial 1 finished with value: -0.5251408891590298 and parameters: {'early_stopping_rounds': 30, 'bootstrap_type': 'Bayesian', 'depth': 4, 'learning_rate': 0.002327067708383781, 'l2_leaf_reg': 8.17949947521167, 'min_data_in_leaf': 27, 'auto_class_weights': None, 'border_count': 91, 'bagging_temperature': 0.13949386065204183}. Best is trial 0 with value: -0.5073328322115354.
[I 2025-12-09 17:19:38,824] Trial 2 finished with value: -0.524381722404791 and parameters: {'early_stopping_

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1000:	learn: 0.3990063	total: 1.91s	remaining: 1.91s
1001:	learn: 0.3989559	total: 1.91s	remaining: 1.9s
1002:	learn: 0.3988929	total: 1.91s	remaining: 1.9s
1003:	learn: 0.3987584	total: 1.92s	remaining: 1.9s
1004:	learn: 0.3986495	total: 1.92s	remaining: 1.9s
1005:	learn: 0.3985675	total: 1.92s	remaining: 1.9s
1006:	learn: 0.3984129	total: 1.92s	remaining: 1.9s
1007:	learn: 0.3983639	total: 1.92s	remaining: 1.89s
1008:	learn: 0.3982317	total: 1.93s	remaining: 1.89s
1009:	learn: 0.3980968	total: 1.93s	remaining: 1.89s
1010:	learn: 0.3980042	total: 1.93s	remaining: 1.89s
1011:	learn: 0.3979221	total: 1.93s	remaining: 1.89s
1012:	learn: 0.3978175	total: 1.93s	remaining: 1.88s
1013:	learn: 0.3977439	total: 1.94s	remaining: 1.88s
1014:	learn: 0.3976144	total: 1.94s	remaining: 1.88s
1015:	learn: 0.3973992	total: 1.94s	remaining: 1.88s
1016:	learn: 0.3973023	total: 1.94s	remaining: 1.88s
1017:	learn: 0.3972104	total: 1.94s	rema

[I 2025-12-09 17:22:02,182] A new study created in memory with name: opt_CatBoost
[I 2025-12-09 17:22:13,522] Trial 0 finished with value: -0.5404144626643712 and parameters: {'early_stopping_rounds': 17, 'bootstrap_type': 'Bayesian', 'depth': 7, 'learning_rate': 0.0020513382630874496, 'l2_leaf_reg': 2.937538457632828, 'min_data_in_leaf': 3, 'auto_class_weights': None, 'border_count': 100, 'bagging_temperature': 0.020584494295802447}. Best is trial 0 with value: -0.5404144626643712.
[I 2025-12-09 17:22:17,798] Trial 1 finished with value: -0.5610049576965327 and parameters: {'early_stopping_rounds': 30, 'bootstrap_type': 'Bayesian', 'depth': 4, 'learning_rate': 0.002327067708383781, 'l2_leaf_reg': 8.17949947521167, 'min_data_in_leaf': 27, 'auto_class_weights': None, 'border_count': 91, 'bagging_temperature': 0.13949386065204183}. Best is trial 0 with value: -0.5404144626643712.
[I 2025-12-09 17:22:38,109] Trial 2 finished with value: -0.5551501452842929 and parameters: {'early_stopping

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1000:	learn: 0.2788437	total: 10.8s	remaining: 10.8s
1001:	learn: 0.2788006	total: 10.8s	remaining: 10.8s
1002:	learn: 0.2787738	total: 10.8s	remaining: 10.8s
1003:	learn: 0.2787269	total: 10.8s	remaining: 10.8s
1004:	learn: 0.2786722	total: 10.9s	remaining: 10.7s
1005:	learn: 0.2786144	total: 10.9s	remaining: 10.7s
1006:	learn: 0.2785721	total: 10.9s	remaining: 10.7s
1007:	learn: 0.2785353	total: 10.9s	remaining: 10.7s
1008:	learn: 0.2784801	total: 10.9s	remaining: 10.7s
1009:	learn: 0.2784457	total: 10.9s	remaining: 10.7s
1010:	learn: 0.2784113	total: 10.9s	remaining: 10.7s
1011:	learn: 0.2783601	total: 10.9s	remaining: 10.7s
1012:	learn: 0.2783135	total: 10.9s	remaining: 10.7s
1013:	learn: 0.2782551	total: 11s	remaining: 10.7s
1014:	learn: 0.2782196	total: 11s	remaining: 10.6s
1015:	learn: 0.2781831	total: 11s	remaining: 10.6s
1016:	learn: 0.2781222	total: 11s	remaining: 10.6s
1017:	learn: 0.2780880	total: 11s	remainin

[I 2025-12-09 17:31:40,985] A new study created in memory with name: opt_CatBoost
[I 2025-12-09 17:31:54,039] Trial 0 finished with value: -0.4911030886235302 and parameters: {'early_stopping_rounds': 17, 'bootstrap_type': 'Bayesian', 'depth': 7, 'learning_rate': 0.0020513382630874496, 'l2_leaf_reg': 2.937538457632828, 'min_data_in_leaf': 3, 'auto_class_weights': None, 'border_count': 100, 'bagging_temperature': 0.020584494295802447}. Best is trial 0 with value: -0.4911030886235302.
[I 2025-12-09 17:31:58,204] Trial 1 finished with value: -0.507040174145569 and parameters: {'early_stopping_rounds': 30, 'bootstrap_type': 'Bayesian', 'depth': 4, 'learning_rate': 0.002327067708383781, 'l2_leaf_reg': 8.17949947521167, 'min_data_in_leaf': 27, 'auto_class_weights': None, 'border_count': 91, 'bagging_temperature': 0.13949386065204183}. Best is trial 0 with value: -0.4911030886235302.
[I 2025-12-09 17:32:18,831] Trial 2 finished with value: -0.4954890020347475 and parameters: {'early_stopping_

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1000:	learn: 0.3674427	total: 1.99s	remaining: 1.98s
1001:	learn: 0.3673057	total: 1.99s	remaining: 1.98s
1002:	learn: 0.3671789	total: 1.99s	remaining: 1.98s
1003:	learn: 0.3671003	total: 1.99s	remaining: 1.98s
1004:	learn: 0.3670321	total: 1.99s	remaining: 1.97s
1005:	learn: 0.3669117	total: 2s	remaining: 1.97s
1006:	learn: 0.3668474	total: 2s	remaining: 1.97s
1007:	learn: 0.3667688	total: 2s	remaining: 1.97s
1008:	learn: 0.3666880	total: 2s	remaining: 1.97s
1009:	learn: 0.3666318	total: 2.01s	remaining: 1.97s
1010:	learn: 0.3665640	total: 2.01s	remaining: 1.97s
1011:	learn: 0.3665032	total: 2.01s	remaining: 1.96s
1012:	learn: 0.3664507	total: 2.01s	remaining: 1.96s
1013:	learn: 0.3663543	total: 2.02s	remaining: 1.96s
1014:	learn: 0.3661711	total: 2.02s	remaining: 1.96s
1015:	learn: 0.3660470	total: 2.02s	remaining: 1.96s
1016:	learn: 0.3659152	total: 2.02s	remaining: 1.96s
1017:	learn: 0.3658004	total: 2.02s	remaining:

[I 2025-12-09 17:34:49,171] A new study created in memory with name: opt_CatBoost
[I 2025-12-09 17:35:02,674] Trial 0 finished with value: -0.5134221946182849 and parameters: {'early_stopping_rounds': 17, 'bootstrap_type': 'Bayesian', 'depth': 7, 'learning_rate': 0.0020513382630874496, 'l2_leaf_reg': 2.937538457632828, 'min_data_in_leaf': 3, 'auto_class_weights': None, 'border_count': 100, 'bagging_temperature': 0.020584494295802447}. Best is trial 0 with value: -0.5134221946182849.
[I 2025-12-09 17:35:07,609] Trial 1 finished with value: -0.5289736829954994 and parameters: {'early_stopping_rounds': 30, 'bootstrap_type': 'Bayesian', 'depth': 4, 'learning_rate': 0.002327067708383781, 'l2_leaf_reg': 8.17949947521167, 'min_data_in_leaf': 27, 'auto_class_weights': None, 'border_count': 91, 'bagging_temperature': 0.13949386065204183}. Best is trial 0 with value: -0.5134221946182849.
[I 2025-12-09 17:35:27,182] Trial 2 finished with value: -0.5235314916704897 and parameters: {'early_stopping

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1000:	learn: 0.3945262	total: 3.52s	remaining: 3.51s
1001:	learn: 0.3944195	total: 3.52s	remaining: 3.5s
1002:	learn: 0.3943633	total: 3.52s	remaining: 3.5s
1003:	learn: 0.3943198	total: 3.52s	remaining: 3.5s
1004:	learn: 0.3942161	total: 3.53s	remaining: 3.49s
1005:	learn: 0.3941609	total: 3.54s	remaining: 3.49s
1006:	learn: 0.3941180	total: 3.54s	remaining: 3.5s
1007:	learn: 0.3940633	total: 3.55s	remaining: 3.5s
1008:	learn: 0.3939856	total: 3.56s	remaining: 3.5s
1009:	learn: 0.3938878	total: 3.56s	remaining: 3.49s
1010:	learn: 0.3937887	total: 3.57s	remaining: 3.49s
1011:	learn: 0.3937346	total: 3.57s	remaining: 3.49s
1012:	learn: 0.3936604	total: 3.58s	remaining: 3.48s
1013:	learn: 0.3935833	total: 3.58s	remaining: 3.48s
1014:	learn: 0.3934924	total: 3.58s	remaining: 3.48s
1015:	learn: 0.3934569	total: 3.59s	remaining: 3.47s
1016:	learn: 0.3934142	total: 3.59s	remaining: 3.47s
1017:	learn: 0.3933971	total: 3.59s	rema

[I 2025-12-09 17:39:01,468] A new study created in memory with name: opt_CatBoost
[I 2025-12-09 17:39:14,955] Trial 0 finished with value: -0.5387427652907288 and parameters: {'early_stopping_rounds': 17, 'bootstrap_type': 'Bayesian', 'depth': 7, 'learning_rate': 0.0020513382630874496, 'l2_leaf_reg': 2.937538457632828, 'min_data_in_leaf': 3, 'auto_class_weights': None, 'border_count': 100, 'bagging_temperature': 0.020584494295802447}. Best is trial 0 with value: -0.5387427652907288.
[I 2025-12-09 17:39:19,653] Trial 1 finished with value: -0.5589278080913197 and parameters: {'early_stopping_rounds': 30, 'bootstrap_type': 'Bayesian', 'depth': 4, 'learning_rate': 0.002327067708383781, 'l2_leaf_reg': 8.17949947521167, 'min_data_in_leaf': 27, 'auto_class_weights': None, 'border_count': 91, 'bagging_temperature': 0.13949386065204183}. Best is trial 0 with value: -0.5387427652907288.
[I 2025-12-09 17:39:41,007] Trial 2 finished with value: -0.5415650827911972 and parameters: {'early_stopping

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1000:	learn: 0.3967092	total: 4.93s	remaining: 4.92s
1001:	learn: 0.3966736	total: 4.93s	remaining: 4.91s
1002:	learn: 0.3966390	total: 4.93s	remaining: 4.9s
1003:	learn: 0.3966047	total: 4.94s	remaining: 4.9s
1004:	learn: 0.3965708	total: 4.95s	remaining: 4.9s
1005:	learn: 0.3965151	total: 4.96s	remaining: 4.9s
1006:	learn: 0.3964658	total: 4.96s	remaining: 4.89s
1007:	learn: 0.3964309	total: 4.97s	remaining: 4.89s
1008:	learn: 0.3963862	total: 4.97s	remaining: 4.89s
1009:	learn: 0.3963517	total: 4.98s	remaining: 4.88s
1010:	learn: 0.3962049	total: 4.98s	remaining: 4.87s
1011:	learn: 0.3961718	total: 4.99s	remaining: 4.87s
1012:	learn: 0.3961464	total: 4.99s	remaining: 4.86s
1013:	learn: 0.3960113	total: 4.99s	remaining: 4.85s
1014:	learn: 0.3959448	total: 5s	remaining: 4.85s
1015:	learn: 0.3958447	total: 5s	remaining: 4.84s
1016:	learn: 0.3958104	total: 5s	remaining: 4.83s
1017:	learn: 0.3957870	total: 5s	remaining: 4.8

[I 2025-12-09 17:43:15,676] A new study created in memory with name: opt_SVM
[I 2025-12-09 17:43:16,526] Trial 0 finished with value: -0.5816106961798504 and parameters: {'kernel': 'rbf', 'C': 1.5702970884055387, 'class_weight': None, 'gamma': 'scale'}. Best is trial 0 with value: -0.5816106961798504.
[I 2025-12-09 17:43:17,256] Trial 1 finished with value: -0.5987956133845991 and parameters: {'kernel': 'linear', 'C': 1.3311216080736887, 'class_weight': 'balanced'}. Best is trial 0 with value: -0.5816106961798504.
[I 2025-12-09 17:43:17,773] Trial 2 finished with value: -0.5985487621198097 and parameters: {'kernel': 'linear', 'C': 0.035113563139704075, 'class_weight': 'balanced'}. Best is trial 0 with value: -0.5816106961798504.
[I 2025-12-09 17:43:18,127] Trial 3 finished with value: -0.6080425036905538 and parameters: {'kernel': 'linear', 'C': 0.07476312062252301, 'class_weight': None}. Best is trial 0 with value: -0.5816106961798504.
[I 2025-12-09 17:43:18,634] Trial 4 finished with

CV-based statistical analysis saved to nested_cv_output/logloss/metrics/

 🔧  Applying multiple comparisons corrections...

 📊  Multiple Comparisons Correction Summary:

 📊  Multiple Comparisons Correction Summary:

AUC_DIFFERENCE_PVAL:
  Original significant results (p < 0.05): 0/15
  Bonferroni significant (α = 0.0033): 0/15
  FDR significant (α = 0.05): 0/15

MCNEMAR_PVAL:
  Original significant results (p < 0.05): 10/15
  Bonferroni significant (α = 0.0033): 8/15
  FDR significant (α = 0.05): 9/15

 🎨 Generating plots...
All plots saved to nested_cv_output/logloss/plots/

 📈  Statistical Summary of AUC Differences (Bootstrap Test):
No significant AUC differences found between models (FDR p < 0.05).


**4. Model Evaluation (Validation Set)**     
Evaluate models on the validation set to compare performance before threshold optimization.

In [10]:
import os
import logging
import glob
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import warnings
import sys
import traceback



try:
    import statsmodels.api as sm
    STATSMODELS_AVAILABLE = True
except ImportError:
    STATSMODELS_AVAILABLE = False
    warnings.warn("statsmodels not available - some statistical functions will be limited")

try:
    from statsmodels.stats.multitest import multipletests
    MULTITEST_AVAILABLE = True
except ImportError:
    MULTITEST_AVAILABLE = False
    warnings.warn("statsmodels.stats.multitest not available - multiple comparison corrections will be limited")

try:
    from statsmodels.stats.contingency_tables import mcnemar
    MCNEMAR_AVAILABLE = True
except ImportError:
    MCNEMAR_AVAILABLE = False
    warnings.warn("statsmodels.stats.contingency_tables not available - McNemar test will be limited")

try:
    from mlxtend.evaluate import mcnemar_table
    MLXTEND_AVAILABLE = True
except ImportError:
    MLXTEND_AVAILABLE = False
    warnings.warn("mlxtend not available - McNemar table functionality will be limited")

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, brier_score_loss,
    matthews_corrcoef, cohen_kappa_score, roc_curve,
    precision_recall_curve, balanced_accuracy_score, log_loss, confusion_matrix
)
from sklearn.utils import resample
from itertools import combinations
from collections import defaultdict
from scipy.special import logit, expit
from scipy.stats import norm, chi2, t
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.proportion import proportion_confint


matplotlib.rcParams.update({
    'figure.facecolor': 'white',
    'savefig.facecolor': 'white',
    'font.family': 'DejaVu Sans',
    'font.size': 6,
    'axes.titlesize': 6,
    'axes.labelsize': 6,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'legend.fontsize': 6,
    'figure.dpi': 450,
})



def get_feature_names_from_column_transformer(ct):

    names = []
    for nm, pipe, cols in getattr(ct, 'transformers_', []):
        if nm == 'remainder' and pipe == 'drop':
            continue
        if hasattr(pipe, 'named_steps') and 'onehot' in pipe.named_steps:
            cats = pipe.named_steps['onehot'].categories_
            for c, cl in zip(cols, cats):
                names += [f"{c}_{v}" for v in cl]
        else:
            names += list(cols)
    return names

def enforce_numeric(df: pd.DataFrame) -> pd.DataFrame:
    if df is None or df.empty:
        return pd.DataFrame()
    out = df.copy()
    out = out.apply(pd.to_numeric, errors="coerce")
    out.dropna(axis=0, how="all", inplace=True)
    out.dropna(axis=1, how="all", inplace=True)
    return out


def apply_multiple_comparisons_correction(df, pval_cols, alpha=0.05, method='both'):

    if not MULTITEST_AVAILABLE:
        warnings.warn("multipletests not available - returning original dataframe")
        return df

    df_corrected = df.copy()

    for col in pval_cols:
        if col not in df.columns:
            continue

        pvals = df[col].dropna().values
        valid_idx = df[col].notna()

        if len(pvals) == 0:
            continue


        if method in ['bonferroni', 'both']:
            bonf_pvals = np.minimum(pvals * len(pvals), 1.0)
            df_corrected.loc[valid_idx, f'{col}_bonferroni'] = bonf_pvals
            df_corrected.loc[valid_idx, f'{col}_bonferroni_sig'] = bonf_pvals < alpha
            df_corrected.loc[valid_idx, f'{col}_bonferroni_alpha'] = alpha / len(pvals)


        if method in ['fdr', 'both']:
            _, fdr_pvals, _, _ = multipletests(pvals, alpha=alpha, method='fdr_bh')
            df_corrected.loc[valid_idx, f'{col}_fdr'] = fdr_pvals
            df_corrected.loc[valid_idx, f'{col}_fdr_sig'] = fdr_pvals < alpha

    return df_corrected


def print_correction_summary(df_corrected, original_alpha=0.05):

    print("\n📊 Multiple Comparisons Correction Summary:")
    print("=" * 50)


    pval_cols = [col for col in df_corrected.columns
                 if 'p_value' in col.lower() or 'pval' in col.lower()
                 and not any(x in col for x in ['bonferroni', 'fdr'])]

    for col in pval_cols:
        if col in df_corrected.columns:
            original_sig = (df_corrected[col] < original_alpha).sum()
            total_tests = df_corrected[col].notna().sum()

            print(f"\n{col.upper()}:")
            print(f"  Original significant results (p < {original_alpha}): {original_sig}/{total_tests}")

            if f'{col}_bonferroni_sig' in df_corrected.columns:
                bonf_sig = df_corrected[f'{col}_bonferroni_sig'].sum()
                bonf_alpha = df_corrected[f'{col}_bonferroni_alpha'].iloc[0] if not df_corrected.empty else 'N/A'
                print(f"  Bonferroni significant (α = {bonf_alpha:.4f}): {bonf_sig}/{total_tests}")

            if f'{col}_fdr_sig' in df_corrected.columns:
                fdr_sig = df_corrected[f'{col}_fdr_sig'].sum()
                print(f"  FDR significant (α = {original_alpha}): {fdr_sig}/{total_tests}")



def cohens_d(group1, group2):
    group1, group2 = np.asarray(group1), np.asarray(group2)
    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        return np.nan
    m1, m2 = np.mean(group1), np.mean(group2)
    s1, s2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
    pooled_std = np.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))
    return np.nan if pooled_std == 0 else (m1 - m2) / pooled_std

def cliffs_delta(group1, group2):
    dominance = 0
    for x in group1:
        for y in group2:
            if x > y:
                dominance += 1
            elif x < y:
                dominance -= 1
    return dominance / (len(group1) * len(group2))


def interpret_cohens_d(d):
    d = abs(d)
    if np.isnan(d): return "Unknown"
    return ("Negligible" if d < 0.2 else
            "Small" if d < 0.5 else
            "Medium" if d < 0.8 else "Large")

def interpret_cliffs_delta(delta):
    d = abs(delta)
    if np.isnan(d): return "Unknown"
    return ("Negligible" if d < 0.147 else
            "Small" if d < 0.33 else
            "Medium" if d < 0.474 else "Large")

def bootstrap_effect_size_ci(group1, group2, effect_fn, n_bootstrap=1000, alpha=0.05):
    group1, group2 = np.asarray(group1), np.asarray(group2)
    rng = np.random.default_rng(42)
    effects = []
    for _ in range(n_bootstrap):
        idx1 = rng.choice(len(group1), len(group1), replace=True)
        idx2 = rng.choice(len(group2), len(group2), replace=True)

        eff = effect_fn(group1[idx1], group2[idx2])
        if not np.isnan(eff):
            effects.append(eff)
    if not effects:
        return np.nan, np.nan, np.nan
    effects = np.array(effects)
    return effects.mean(), np.percentile(effects, 100 * alpha / 2), np.percentile(effects, 100 * (1 - alpha / 2))



def calibration_slope_intercept(y_true, y_prob, method='logistic'):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob) | np.isnan(y_true))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan

    y_prob = np.clip(y_prob, 1e-6, 1 - 1e-6)
    X_base = logit(y_prob) if method == 'logistic' else y_prob
    X = sm.add_constant(X_base.reshape(-1, 1))

    try:
        mod = sm.GLM(y_true, X, family=sm.families.Binomial())
        res = mod.fit(disp=0)
        intercept, slope = float(res.params[0]), float(res.params[1])
        return intercept, slope
    except Exception:
        return np.nan, np.nan

def calibration_slope_intercept_ci(y_true, y_prob, n_bootstrap=1000, alpha=0.05, seed=42):


    intercept_point, slope_point = calibration_slope_intercept(y_true, y_prob)


    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    rng = np.random.default_rng(seed)
    slopes, intercepts = [], []

    for _ in range(n_bootstrap):
        try:
            indices = rng.choice(len(y_true), len(y_true), replace=True)
            if len(np.unique(y_true[indices])) < 2:
                continue

            int_b, slp_b = calibration_slope_intercept(y_true[indices], y_prob[indices])
            if not (np.isnan(slp_b) or np.isnan(int_b)):
                slopes.append(slp_b)
                intercepts.append(int_b)
        except Exception:
            continue

    if slopes and intercepts:
        slope_ci = (np.percentile(slopes, 100 * alpha/2), np.percentile(slopes, 100 * (1 - alpha/2)))
        intercept_ci = (np.percentile(intercepts, 100 * alpha/2), np.percentile(intercepts, 100 * (1 - alpha/2)))
    else:

        slope_ci = (np.nan, np.nan)
        intercept_ci = (np.nan, np.nan)

    return intercept_point, slope_point, intercept_ci, slope_ci



import numpy as np

def calculate_ece(y_true, y_prob, n_bins=10, method='uniform'):
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]
    if len(y_true) == 0: return np.nan
    y_prob = np.clip(y_prob, 0.0, 1.0)

    if method == 'quantile':
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.percentile(y_prob, quantiles * 100)
        bin_edges = np.unique(bin_edges) # Fix zero-width bins
    else:
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)

    actual_n_bins = len(bin_edges) - 1
    if actual_n_bins == 0: return 0.0

    ece = 0.0
    total_samples = len(y_true)

    for i in range(actual_n_bins):
        # Fix double counting: use [a, b) intervals
        if i == actual_n_bins - 1:
            mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:
            mask = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask])
            avg_true = np.mean(y_true[mask])
            ece += (n_in_bin / total_samples) * np.abs(avg_pred - avg_true)
    return float(ece)

def calculate_mce(y_true, y_prob, n_bins=10, method='uniform'):
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]
    if len(y_true) == 0: return np.nan
    y_prob = np.clip(y_prob, 0.0, 1.0)

    if method == 'quantile':
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.percentile(y_prob, quantiles * 100)
        bin_edges = np.unique(bin_edges)
    else:
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)

    actual_n_bins = len(bin_edges) - 1
    if actual_n_bins == 0: return 0.0

    max_error = 0.0
    for i in range(actual_n_bins):
        if i == actual_n_bins - 1:
            mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:
            mask = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask])
            avg_true = np.mean(y_true[mask])
            error = abs(avg_pred - avg_true)
            if error > max_error: max_error = error
    return float(max_error)

def hosmer_lemeshow_test_advanced(y_true, y_prob, n_bins=10, min_expected_freq=5):


    if not STATSMODELS_AVAILABLE:
        warnings.warn("statsmodels not available - returning NaN for Hosmer-Lemeshow test")
        return np.nan, np.nan

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 20 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan


    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
    try:
        df['bin'] = pd.qcut(df['y_prob'], n_bins, labels=False, duplicates='drop')
    except ValueError:
        df['bin'] = np.floor(df['y_prob'] * n_bins).astype(int)
        df.loc[df['bin'] == n_bins, 'bin'] = n_bins - 1


    summary = df.groupby('bin').agg(
        observed=('y_true', 'sum'),
        expected=('y_prob', 'sum'),
        n_total=('y_true', 'size')
    ).reset_index()


    while True:
        sparse_bins = summary[summary['expected'] < min_expected_freq]
        if sparse_bins.empty or len(summary) <= 2:
            break


        merge_idx = sparse_bins.index[0]
        if merge_idx == 0:
            summary.loc[1, ['observed', 'expected', 'n_total']] += summary.loc[0, ['observed', 'expected', 'n_total']]
            summary = summary.drop(0).reset_index(drop=True)
        else:
            summary.loc[merge_idx - 1, ['observed', 'expected', 'n_total']] += summary.loc[merge_idx, ['observed', 'expected', 'n_total']]
            summary = summary.drop(merge_idx).reset_index(drop=True)


    g = len(summary)
    summary['variance'] = summary['expected'] * (1 - summary['expected'] / summary['n_total'])
    hl_statistic = ((summary['observed'] - summary['expected'])**2 / (summary['variance'] + 1e-8)).sum()


    df_hl = g - 2
    if df_hl <= 0:
        return hl_statistic, np.nan

    p_value = 1 - chi2.cdf(hl_statistic, df_hl)

    return hl_statistic, p_value

def calibration_curve_fixed_bins(y_true, y_prob, n_bins=10):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]

    if len(y_true) == 0:
        return np.full(n_bins, np.nan), np.full(n_bins, np.nan)


    quantiles = np.linspace(0, 1, n_bins + 1)
    bin_edges = np.percentile(y_prob, quantiles * 100)
    bin_edges[0] = 0.0
    bin_edges[-1] = 1.0

    prob_true = np.full(n_bins, np.nan)
    prob_pred = np.full(n_bins, np.nan)

    for i in range(n_bins):
        mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        n_in_bin = np.sum(mask)

        if n_in_bin > 0:
            prob_true[i] = np.mean(y_true[mask])
            prob_pred[i] = np.mean(y_prob[mask])
        else:

            prob_true[i] = np.nan
            prob_pred[i] = (bin_edges[i] + bin_edges[i+1]) / 2

    return prob_true, prob_pred



def calculate_calibration_summary_robust(y_true, y_prob, n_bins=10,
                                       n_bootstrap=1000, alpha=0.05):

    summary = {}


    summary['Brier'] = brier_score_loss(y_true, y_prob)
    summary['LogLoss'] = log_loss(y_true, y_prob)


    summary['ECE_Quantile'] = calculate_ece(y_true, y_prob, n_bins, 'quantile')

    summary['MCE'] = calculate_mce(y_true, y_prob, n_bins, 'quantile')



    intercept, slope, intercept_ci, slope_ci = calibration_slope_intercept_ci(
        y_true, y_prob, n_bootstrap=n_bootstrap, alpha=alpha
    )

    summary['Cal_Intercept'] = intercept
    summary['Cal_Slope'] = slope
    summary['Cal_Slope_CI_Low'] = slope_ci[0]
    summary['Cal_Slope_CI_High'] = slope_ci[1]
    summary['Cal_Intercept_CI_Low'] = intercept_ci[0]
    summary['Cal_Intercept_CI_High'] = intercept_ci[1]


    z_statistic, z_p_value = z_test_standard(y_true, y_prob)
    summary['Spiegelhalter_Z'] = z_statistic
    summary['Spiegelhalter_P_Value'] = z_p_value


    hl_statistic, hl_p_value = hosmer_lemeshow_test_advanced(y_true, y_prob, n_bins)
    summary['HL_Statistic'] = hl_statistic
    summary['HL_P_Value'] = hl_p_value


    summary['Well_Calibrated'] = (
        not np.isnan(slope) and not np.isnan(intercept) and
        abs(slope - 1) < 0.2 and abs(intercept) < 0.15 and
        (not np.isnan(hl_p_value) and hl_p_value > 0.05) and
        (not np.isnan(z_p_value) and z_p_value > 0.05) and
        summary['ECE_Quantile'] < 0.10
    )

    return summary



def z_test_standard(y_true, y_prob):


    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10:
        return np.nan, np.nan


    numerator = np.sum(y_true - y_prob)


    denominator = np.sqrt(np.sum(y_prob * (1 - y_prob)))

    if denominator < 1e-8:
        return np.nan, np.nan


    z_stat = numerator / denominator
    p_value = 2 * (1 - norm.cdf(abs(z_stat)))

    return z_stat, p_value

def bootstrap_ci(y_true, y_pred, y_prob, metric_func, n_bootstrap=1000, ci_width=0.95, **kwargs):
    boot = []
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_prob = np.array(y_prob)

    n = len(y_true)
    if n < 10:
        return np.nan, np.nan


    pos_indices = np.where(y_true == 1)[0]
    neg_indices = np.where(y_true == 0)[0]

    rng = np.random.default_rng(42)

    for _ in range(n_bootstrap):

        pos_sample = rng.choice(pos_indices, len(pos_indices), replace=True)
        neg_sample = rng.choice(neg_indices, len(neg_indices), replace=True)
        idx = np.concatenate([pos_sample, neg_sample])


        try:
            if metric_func.__name__ in ['roc_auc_score', 'average_precision_score', 'brier_score_loss', 'log_loss', 'calculate_ece', 'calculate_mce']:
                # PASS KWARGS HERE
                val = metric_func(y_true[idx], y_prob[idx], **kwargs)
            else:
                val = metric_func(y_true[idx], y_pred[idx])

            if not np.isnan(val):
                boot.append(val)
        except Exception:
            continue

    if len(boot) < 100:
        return np.nan, np.nan

    boot = np.array(boot)
    lower = np.percentile(boot, 100 * (1 - ci_width) / 2)
    upper = np.percentile(boot, 100 * (1 + ci_width) / 2)

    return lower, upper




def load_model_artifacts(cfg: dict) -> dict:

    model_dir = Path(cfg['output_dirs']['models'])

    log = logging.getLogger("nested_cv")

    log.info(f"🔍 Loading final models from: {model_dir.resolve()}")

    artifacts = {}

    pattern = str(model_dir / "*_final_calibrated_model.pkl")
    files = sorted(glob.glob(pattern))

    if not files:
        log.warning(f"⚠️ No models found in {model_dir} matching pattern '{pattern}'.")
        return artifacts

    for fpath_str in files:
        fpath = Path(fpath_str)

        try:
            art = joblib.load(fpath)
        except Exception as e:
            log.error(f"⚠️ Could not load artifact {fpath.name}: {e}")
            continue


        required_keys = ['preprocessor', 'selector', 'calibrated_model', 'metadata']
        if not all(k in art for k in required_keys):
            log.warning(f"⚠️ Missing required keys in {fpath.name}, skipping file.")
            continue


        basename = fpath.name

        raw_name = basename.replace("_final_calibrated_model.pkl", "")

        name = raw_name.replace("_", " ")

        metadata = art.get('metadata', {})


        artifacts[name] = {
            'artifact': art,
            'preprocessor': art.get('preprocessor'),
            'selector': art.get('selector'),
            'calibrated_model': art.get('calibrated_model', None),
            'base_model': art.get('base_model', None),
            'metadata': metadata,
            'selected_features': metadata.get('selected_features', None),
            'all_features': metadata.get('all_features', None)
        }

    return artifacts



def validate_pre_split_model(X_raw, y_true, entry, threshold=0.5):

    X_val_clean = X_raw.copy().reset_index(drop=True)
    y_val_clean = y_true.copy().reset_index(drop=True)
    pre = entry['preprocessor']
    calibrated_model = entry['calibrated_model'] or entry['base_model']
    X_pre = pre.transform(X_val_clean)
    all_features = get_feature_names_from_column_transformer(pre)
    X_pre_df = pd.DataFrame(X_pre, columns=all_features)
    selected_features = entry['metadata']['selected_features']
    X_final = X_pre_df[selected_features]
    proba = calibrated_model.predict_proba(X_final)[:, 1]
    pred = (proba >= threshold).astype(int)
    return pred, proba, y_val_clean


def evaluate_single_model(entry, X_raw, y_true, threshold=0.5, bootstrap_n=1000):

    pred, proba, y_val_clean = validate_pre_split_model(X_raw, y_true, entry, threshold)

    metrics = {}
    y_true_np = y_val_clean.values


    metrics['Accuracy'] = accuracy_score(y_true_np, pred)
    metrics['Balanced_Accuracy'] = balanced_accuracy_score(y_true_np, pred)
    metrics['Precision'] = precision_score(y_true_np, pred, zero_division=0)
    metrics['Recall'] = recall_score(y_true_np, pred, zero_division=0)
    metrics['F1'] = f1_score(y_true_np, pred, zero_division=0)

    if len(np.unique(y_true_np)) > 1:
        metrics['AUC_ROC'] = roc_auc_score(y_true_np, proba)
        metrics['AUPRC'] = average_precision_score(y_true_np, proba)
    else:
        metrics['AUC_ROC'] = np.nan
        metrics['AUPRC'] = np.nan

    metrics['MCC'] = matthews_corrcoef(y_true_np, pred)
    metrics['Kappa'] = cohen_kappa_score(y_true_np, pred)
    metrics['Brier'] = brier_score_loss(y_true_np, proba)
    metrics['Log_Loss'] = log_loss(y_true_np, proba)


    cm = confusion_matrix(y_true_np, pred)
    tn, fp, fn, tp = cm.ravel()
    metrics.update({
        'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp,
        'NPV': tn / (tn + fn) if (tn + fn) > 0 else 0,
        'PPV': tp / (tp + fp) if (tp + fp) > 0 else 0,
        'FPR': fp / (fp + tn) if (fp + tn) > 0 else 0,
        'FNR': fn / (fn + tp) if (fn + tp) > 0 else 0
    })


    metrics['ECE'] = calculate_ece(y_true_np, proba, method='uniform')
    metrics['MCE'] = calculate_mce(y_true_np, proba, method='uniform')


    intercept, slope, intercept_ci, slope_ci = calibration_slope_intercept_ci(
        y_true_np, proba, n_bootstrap=bootstrap_n
    )
    metrics['calibration_intercept'] = intercept
    metrics['calibration_slope'] = slope
    metrics['calibration_intercept_CI_low'] = intercept_ci[0]
    metrics['calibration_intercept_CI_high'] = intercept_ci[1]
    metrics['calibration_slope_CI_low'] = slope_ci[0]
    metrics['calibration_slope_CI_high'] = slope_ci[1]


    z, p_z = z_test_standard(y_true_np, proba)
    metrics['z_statistic'] = z
    metrics['z_p_value'] = p_z


    hl_statistic, hl_p = hosmer_lemeshow_test_advanced(y_true_np, proba)
    metrics['hl_statistic'] = hl_statistic
    metrics['hl_p_value'] = hl_p


    ci_metrics = {}
    metric_definitions = [
        ('Accuracy', accuracy_score),
        ('Balanced_Accuracy', balanced_accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
        ('AUC_ROC', roc_auc_score),
        ('AUPRC', average_precision_score),
        ('MCC', matthews_corrcoef),
        ('Kappa', cohen_kappa_score),
        ('Brier', brier_score_loss),
        ('Log_Loss', log_loss),
        ('ECE', calculate_ece),
        ('MCE', calculate_mce),
    ]

    for name, func in metric_definitions:
        kwargs = {}
        # FORCE 'uniform' for ECE and MCE to match point estimate
        if name in ['ECE', 'MCE']:
            kwargs = {'method': 'uniform'}

        lo, hi = bootstrap_ci(y_true_np, pred, proba, func,
                              n_bootstrap=bootstrap_n, ci_width=0.95, **kwargs)

        ci_metrics[f'{name}_CI_low'] = lo
        ci_metrics[f'{name}_CI_high'] = hi


    fpr, tpr, _ = roc_curve(y_true_np, proba)
    precision, recall, _ = precision_recall_curve(y_true_np, proba)
    prob_true, prob_pred = calibration_curve_fixed_bins(y_true_np, proba, n_bins=10)

    selected_features = entry['metadata'].get('selected_features', [])

    return {
        'metrics': {**metrics, **ci_metrics},
        'roc_curve': (fpr, tpr),
        'pr_curve': (precision, recall),
        'calibration_curve': (prob_true, prob_pred),
        'avg_pred': pred,
        'avg_proba': proba,
        'y_true': y_val_clean,
        'selected_features': selected_features,
        'n_features_used': len(selected_features)
    }



def bootstrap_metric_diff(y, p1, p2, metric_fn, n_bootstraps=1000, seed=42):
    y, p1, p2 = np.asarray(y), np.asarray(p1), np.asarray(p2)
    if len(np.unique(y)) < 2: return np.nan, np.nan, np.nan
    rng = np.random.default_rng(seed)
    diffs = []
    for _ in range(n_bootstraps):
        idx = rng.choice(len(y), len(y), replace=True)
        try:
            diffs.append(metric_fn(y[idx], p1[idx]) - metric_fn(y[idx], p2[idx]))
        except Exception as e:
            continue
    if not diffs: return np.nan, np.nan, np.nan
    diffs = np.asarray(diffs)
    p_val = 2 * min((diffs > 0).mean(), (diffs < 0).mean())
    return p_val, diffs.mean(), diffs.std()

def perform_mcnemar(y_true, y_pred1, y_pred2):
    if not MLXTEND_AVAILABLE or not MCNEMAR_AVAILABLE:
        warnings.warn("mlxtend or mcnemar not available - using basic implementation")

        n01 = np.sum((y_pred1 == 0) & (y_pred2 == 1) & (y_true == y_pred2))
        n10 = np.sum((y_pred1 == 1) & (y_pred2 == 0) & (y_true == y_pred1))
        if n01 + n10 == 0:
            return 1.0
        statistic = (abs(n01 - n10) - 1) ** 2 / (n01 + n10)
        from scipy.stats import chi2
        return 1 - chi2.cdf(statistic, 1)

    try:
        from mlxtend.evaluate import mcnemar_table
        y_true, y_pred1, y_pred2 = np.asarray(y_true), np.asarray(y_pred1), np.asarray(y_pred2)
        table = mcnemar_table(y_target=y_true, y_model1=y_pred1, y_model2=y_pred2)
        return mcnemar(table, exact=False, correction=True).pvalue, table
    except Exception as e:
        warnings.warn(f"McNemar test failed: {e}")
        return np.nan, None

def mcnemar_effect_size(table):
    if table is None:
        return np.nan
    b, c = table[0,1], table[1,0]
    if c == 0: return np.inf if b > 0 else np.nan
    return b/c

def interpret_odds_ratio(odds_ratio):
    if np.isnan(odds_ratio): return "Unknown"
    if np.isinf(odds_ratio): return "Complete dominance"
    return ("Large" if odds_ratio >= 3 or odds_ratio <= 1/3 else
            "Medium" if odds_ratio >= 1.5 or odds_ratio <= 1/1.5 else "Small")



def compute_pairwise_effect_sizes(results, n_bootstrap=1000):

    model_names = list(results.keys())
    effect_data = []
    for m1, m2 in combinations(model_names, 2):
        r1, r2 = results[m1], results[m2]
        y = r1['y_true']
        p1, p2 = r1['avg_proba'], r2['avg_proba']
        pred1, pred2 = r1['avg_pred'], r2['avg_pred']


        try:
            pval_roc, diff_roc, std_roc = bootstrap_metric_diff(y, p1, p2, roc_auc_score, n_bootstraps=n_bootstrap)
        except Exception:
            pval_roc = diff_roc = std_roc = np.nan


        rng = np.random.default_rng(42)
        auprc_samples1, auprc_samples2 = [], []
        for _ in range(450):
            idx = rng.choice(len(y), len(y), replace=True)
            try:
                auprc_samples1.append(average_precision_score(y[idx], p1[idx]))
                auprc_samples2.append(average_precision_score(y[idx], p2[idx]))
            except Exception: continue
        if auprc_samples1 and auprc_samples2:
            coh_d = cohens_d(np.array(auprc_samples1), np.array(auprc_samples2))
            cl_delta = cliffs_delta(np.array(auprc_samples1), np.array(auprc_samples2))
            cd_mean, cd_low, cd_high = bootstrap_effect_size_ci(np.array(auprc_samples1), np.array(auprc_samples2), cohens_d)
        else:
            coh_d=cl_delta=cd_mean=cd_low=cd_high=np.nan


        try:
            pval_mcn, table = perform_mcnemar(y, pred1, pred2)
            oratio = mcnemar_effect_size(table)
            interp_or = interpret_odds_ratio(oratio)
        except Exception:
            pval_mcn = oratio = interp_or = np.nan

        effect_data.append({
            'Model_A': m1, 'Model_B': m2,
            'AUROC_diff_mean': diff_roc, 'AUROC_diff_std': std_roc, 'AUROC_p_value': pval_roc,
            'AUPRC_Cohens_d': coh_d,
            'AUPRC_Cohens_d_CI_low': cd_low, 'AUPRC_Cohens_d_CI_high': cd_high,
            'AUPRC_Cohens_d_interpretation': interpret_cohens_d(coh_d),
            'AUPRC_Cliffs_delta': cl_delta,
            'AUPRC_Cliffs_delta_interpretation': interpret_cliffs_delta(cl_delta),
            'McNemar_p_value': pval_mcn,
            'McNemar_Odds_ratio': oratio,
            'McNemar_effect_interpretation': interp_or,
        })
    return pd.DataFrame(effect_data)

def run_mcnemar_tests(preds_dict, y_true):

    names = list(preds_dict.keys())
    records = []
    for m1, m2 in combinations(names, 2):
        try:
            pval, table = perform_mcnemar(y_true, preds_dict[m1], preds_dict[m2])
            oratio = mcnemar_effect_size(table)
            interp = interpret_odds_ratio(oratio)
        except Exception:
            pval = oratio = interp = np.nan
        records.append({
            "Model_A": m1, "Model_B": m2,
            "McNemar_p_value": pval,
            "Odds_ratio": oratio,
            "Effect_size_interpretation": interp
        })
    return pd.DataFrame(records)



matplotlib.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 6,
    'axes.titlesize': 6,
    'axes.labelsize': 6,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'legend.fontsize': 6,
    'figure.dpi': 450,
})

def plot_roc_curves(results, outpath=None):
    plt.figure(figsize=(3.5, 3.5))
    for model, res in results.items():
        fpr = np.array(res['roc_curve'][0], dtype=float)
        tpr = np.array(res['roc_curve'][1], dtype=float)
        auc = res['metrics'].get('AUC_ROC', np.nan)
        plt.plot(fpr, tpr, linewidth=0.7, label=f"{model} (AUC = {auc:.3f})")
    plt.plot([0, 1], [0, 1], 'k--', linewidth=0.7)
    plt.title("ROC Curves")
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.legend(loc='lower right', frameon=False)
    plt.grid(True, linestyle='--', alpha=0.5, linewidth=0.5)
    plt.tight_layout()
    if outpath:
        plt.savefig(outpath)
        plt.close()
    else:
        plt.show()

def plot_pr_curves(results, outpath=None):
    plt.figure(figsize=(3.5, 3.5))
    for model, res in results.items():
        prec = np.array(res['pr_curve'][0], dtype=float)
        rec = np.array(res['pr_curve'][1], dtype=float)
        ap = res['metrics'].get('AUPRC', np.nan)
        plt.plot(rec, prec, linewidth=0.7, label=f"{model} (AUPRC = {ap:.3f})")
    plt.title("Precision-Recall Curves")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.legend(loc='lower left', frameon=False)
    plt.grid(True, linestyle='--', alpha=0.5, linewidth=0.5)
    plt.tight_layout()
    if outpath:
        plt.savefig(outpath)
        plt.close()
    else:
        plt.show()

def plot_calibration_curves(results, outpath=None):
    plt.figure(figsize=(3.5, 3.5))
    for model, res in results.items():
        prob_true = np.array(res['calibration_curve'][0], dtype=float)
        prob_pred = np.array(res['calibration_curve'][1], dtype=float)
        plt.plot(prob_pred, prob_true, 'o-', markersize=2, linewidth=0.7, label=model)
    plt.plot([0, 1], [0, 1], 'k--', label="Perfect", linewidth=0.7)
    plt.title("Calibration Curves")
    plt.xlabel("Mean Predicted Probability")
    plt.ylabel("Observed Fraction Positive")
    plt.legend(loc='upper left', frameon=False)
    plt.grid(True, linestyle='--', alpha=0.5, linewidth=0.5)
    plt.tight_layout()
    if outpath:
        plt.savefig(outpath)
        plt.close()
    else:
        plt.show()

def plot_heatmap(df: pd.DataFrame, outpath=None, title="Performance Heatmap"):
    df = enforce_numeric(df)
    if df.empty:
        return

    plt.figure(figsize=(max(6, df.shape[1] * 0.5), max(4, df.shape[0] * 0.4)))
    sns.heatmap(
        df, annot=True, fmt=".3f", cmap="viridis", linewidths=0.5, linecolor='lightgrey',
        annot_kws={'fontsize': 6, 'fontname': 'DejaVu Sans'}
    )
    plt.title(title, fontsize=6, pad=8)
    plt.tight_layout()
    if outpath:
        plt.savefig(outpath, dpi=450)
        plt.close()
    else:
        plt.show()

def plot_radar_chart(df: pd.DataFrame, outpath=None, title="Radar Chart"):
    df = enforce_numeric(df)
    if df.empty:
        return
    desired = ["roc_auc", "pr_auc", "f1_", "accuracy", "kappa", "mcc"]
    present = [c for c in desired if c in df.columns]
    if not present:
        return
    df = df[present].astype(float)
    df_norm = (df - df.min()) / (df.max() - df.min() + 1e-9)
    if df_norm.empty:
        return
    n_axes = len(df_norm.columns)
    angles = np.linspace(0, 2 * np.pi, n_axes, endpoint=False).tolist()
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(3.5, 3.5), subplot_kw=dict(polar=True))
    colors = sns.color_palette("colorblind", len(df_norm))
    for idx, (row_name, row) in enumerate(df_norm.iterrows()):
        vals = np.array(row.tolist() + [row.iloc[0]], dtype=float)
        ax.plot(angles, vals, label=row_name, color=colors[idx], linewidth=0.7)
        ax.fill(angles, vals, alpha=0.25, color=colors[idx])
    ax.set_thetagrids(np.degrees(angles[:-1]), df_norm.columns)
    plt.title(title, pad=8)
    plt.legend(loc="lower center", bbox_to_anchor=(0.5, -0.18), ncol=3, frameon=False, fontsize=6)
    plt.tight_layout()
    if outpath:
        plt.savefig(outpath, dpi=450)
        plt.close()
    else:
        plt.show()



def create_effect_size_summary(effect_df, out_dir):
    summary_lines = ["# Effect Size Analysis Summary\n"]
    if "AUPRC_Cohens_d_interpretation" in effect_df.columns:
        summary_lines.append("## AUPRC Cohen's d:")
        vc = effect_df['AUPRC_Cohens_d_interpretation'].value_counts(dropna=False)
        summary_lines.extend([f"- {str(k)}: {str(v)} comps" for k, v in vc.items()])
    if "McNemar_effect_interpretation" in effect_df.columns:
        summary_lines.append("\n## McNemar Odds Ratio Interpretation:")
        vc = effect_df['McNemar_effect_interpretation'].value_counts(dropna=False)
        summary_lines.extend([f"- {str(k)}: {str(v)} comps" for k, v in vc.items()])
    with open(Path(out_dir,"effect_size_summary.txt"), "w") as f:
        f.write("\n".join(summary_lines))

def run_evaluation(val_df, cfg: dict, out_dir="evaluation_1"):
    os.makedirs(out_dir, exist_ok=True)
    print("=== MODEL EVALUATION PIPELINE ===")
    print("🔍 Focus: Calculating MCE (Maximum Calibration Error)")

    effect_df = None
    if 'MDR status' not in val_df.columns:
        raise ValueError("Validation DataFrame must contain 'MDR status' column.")
    df = val_df.reset_index(drop=True)
    X_val = df.drop(columns=['MDR status'])
    y_val = df['MDR status']
    print(f"✅ Validation data loaded from val_df: {X_val.shape}")


    try:
        artifacts = load_model_artifacts(cfg=cfg)
    except Exception as e:
        print(f"⚠️ Config-based loading failed, trying direct path...")

        model_dir = Path("nested_cv_output") / "logloss" / "models"
        if not model_dir.exists():

            model_dir = Path("models")
        artifacts = load_model_artifacts({'output_dirs': {'models': model_dir}})

    if len(artifacts) == 0:
        raise RuntimeError("No valid model artifacts found.")
    print(f"✅ Loaded {len(artifacts)} models")


    results = {}
    preds = {}
    for model_name, entry in artifacts.items():
        print(f"\n▶ Evaluating {model_name}...")
        try:
            res = evaluate_single_model(entry, X_val, y_val, threshold=0.5, bootstrap_n=1000)
            results[model_name] = res
            preds[model_name] = res['avg_pred']


            mce_val = res['metrics'].get('MCE', np.nan)
            ece_val = res['metrics'].get('ECE', np.nan)
            print(f"✅ {model_name}: AUC={res['metrics'].get('AUC_ROC', 0):.3f}, "
                  f"MCE={mce_val:.3f}, ECE={ece_val:.3f}")

        except Exception as e:
            print(f"❌ Failed evaluating {model_name}: {e}")
            print(traceback.format_exc())
            continue

    if not results:
        raise RuntimeError("No model produced results.")


    summary_rows = []
    for m, r in results.items():
        row = {'Model': m}
        row.update(r['metrics'])
        summary_rows.append(row)
    perf_df = pd.DataFrame(summary_rows).set_index('Model')
    perf_df.to_csv(os.path.join(out_dir, "model_performance_with_ci.csv"))

    print("\n=== Performance Summary (key metrics) ===")
    key_metrics = ['AUC_ROC', 'AUPRC', 'Accuracy', 'F1', 'Recall', 'Precision', 'calibration_slope', 'calibration_intercept']
    ava = [m for m in key_metrics if m in perf_df.columns]
    print(perf_df[ava].round(3).to_string())


    print(f"\n📊 Generating visualizations...")
    try:
        plot_roc_curves(results, outpath=os.path.join(out_dir, "roc_curves.png"))
        plot_pr_curves(results, outpath=os.path.join(out_dir, "pr_curves.png"))
        plot_calibration_curves(results, outpath=os.path.join(out_dir, "calibration_curves.png"))
    except Exception as e:
        print(f"⚠️ Some curve plots failed: {e}")


    try:
        rename_map = {
            "AUC_ROC": "roc_auc",
            "AUPRC": "pr_auc",
            "F1": "f1",
            "Accuracy": "accuracy",
            "Kappa": "kappa",
            "MCC": "mcc",
            "Cal_Intercept": "calibration_intercept",
            "Cal_Slope": "calibration_slope",
        }
        visual_cols_old = [k for k in rename_map.keys() if k in perf_df.columns]
        visual_df = perf_df[visual_cols_old].rename(columns=rename_map)
        heat_cols = [c for c in ["roc_auc", "pr_auc", "f1", "accuracy", "kappa", "mcc"] if c in visual_df.columns]
        if heat_cols:
            plot_heatmap(
                visual_df[heat_cols],
                outpath=os.path.join(out_dir, "performance_heatmap.png"),
                title="Model Performance Heatmap"
            )
            plot_radar_chart(
                visual_df[heat_cols],
                outpath=os.path.join(out_dir, "metrics_radar_chart.png"),
                title="Radar Chart"
            )
        print("✅ All plots generated successfully")
    except Exception as e:
        print(f"⚠️ Visual plots failed: {e}")


    mcn_df = None
    try:
        mcn_df = run_mcnemar_tests(preds, y_val.values)
        if mcn_df is not None and not mcn_df.empty:
            print(f"\n=== Statistical Significance Tests (McNemar) ===")
            print(mcn_df.round(4).to_string(index=False))
            mcn_df.to_csv(os.path.join(out_dir, "mcnemar_results.csv"), index=False)


            print(f"\n🔧 Applying multiple comparisons corrections to McNemar tests...")
            mcn_corrected = apply_multiple_comparisons_correction(
                mcn_df,
                pval_cols=['McNemar_p_value'],
                alpha=0.05,
                method='both'
            )
            mcn_corrected.to_csv(os.path.join(out_dir, "mcnemar_results_corrected.csv"), index=False)
            print_correction_summary(mcn_corrected)

    except Exception as e:
        print(f"⚠️ Statistical tests failed: {e}")

    effect_df = None
    try:
        print(f"\n🔍 Computing pairwise effect sizes...")
        effect_df = compute_pairwise_effect_sizes(results, n_bootstrap=1000)
        if not effect_df.empty:
            effect_csv_fp = os.path.join(out_dir, "pairwise_effect_sizes.csv")
            effect_df.to_csv(effect_csv_fp, index=False)
            print(f"✅ Pairwise effect sizes saved to: {effect_csv_fp}")


            print(f"\n🔧 Applying multiple comparisons corrections to effect size tests...")
            effect_corrected = apply_multiple_comparisons_correction(
                effect_df,
                pval_cols=['AUROC_p_value', 'McNemar_p_value'],
                alpha=0.05,
                method='both'
            )
            effect_corrected.to_csv(os.path.join(out_dir, "pairwise_effect_sizes_corrected.csv"), index=False)


        best_model = perf_df['Accuracy'].idxmax() if 'Accuracy' in perf_df.columns else perf_df.index[0]
        slope = perf_df.loc[best_model, 'calibration_slope'] if 'calibration_slope' in perf_df.columns else np.nan
        intercept = perf_df.loc[best_model, 'calibration_intercept'] if 'calibration_intercept' in perf_df.columns else np.nan

        if not np.isnan(slope) and not np.isnan(intercept):
            if abs(slope - 1) < 0.1 and abs(intercept) < 0.05:
                print("   ✅ Well calibrated (slope ≈ 1, intercept ≈ 0)")
            else:
                issues = []
                if abs(slope - 1) >= 0.1:
                    issues.append(f"slope deviation ({slope:.3f})")
                if abs(intercept) >= 0.05:
                    issues.append(f"intercept deviation ({intercept:.3f})")
                print(f"   ⚠️  Calibration issues: {', '.join(issues)}")


            create_effect_size_summary(effect_corrected, out_dir)
            print(f"✅ Corrected effect sizes and summary saved")

        else:
            print("⚠️ No effect sizes computed (insufficient models or data)")
    except Exception as e:
        print(f"⚠️ Effect size analysis failed: {e}")
        effect_df = None

    print(f"\n🎉 Evaluation complete! Results saved to: {os.path.abspath(out_dir)}")
    print(f"   📊 {len(results)} models evaluated successfully")
    if 'Accuracy' in perf_df.columns:
        print(f"   📈 Best model: {perf_df['Accuracy'].idxmax()} ({perf_df['Accuracy'].max():.1%})")
    print(f"\n📁 Generated files:")
    print(f"   - model_performance_with_ci.csv: Comprehensive metrics with confidence intervals")
    print(f"   - pairwise_effect_sizes.csv: Raw effect size comparisons")
    print(f"   - pairwise_effect_sizes_corrected.csv: Effect sizes with Bonferroni/FDR corrections")
    print(f"   - mcnemar_results.csv: Raw McNemar test results")
    print(f"   - mcnemar_results_corrected.csv: McNemar with Bonferroni/FDR corrections")
    print(f"   - effect_size_summary.txt: Human-readable effect size interpretation")
    print(f"   - Various plots and visualization files")

    return results, perf_df, effect_df


if __name__ == "__main__":
    try:
        from your_training_module import Config, setup_logger
        cfg_obj = Config.from_env()
        CONFIG = cfg_obj.to_dict()
        log = setup_logger(cfg_obj)
        print("✅ Loaded configuration from training module")
    except ImportError:
        print("⚠️ Could not import training config, using defaults")
        CONFIG = {
            'output_dirs': {
                'models': Path("nested_cv_output/logloss/models")
             },
             'random_state': 42
        }

        logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
        log = logging.getLogger("validation")


        validation_csv_path = "validation_set.csv"
        if not Path(validation_csv_path).exists():
            log.critical(f"Validation data file '{validation_csv_path}' not found.")
            sys.exit(1)

        val_df = pd.read_csv(validation_csv_path)
        print(f"✅ Loaded validation data: {val_df.shape}")


        print("\nStarting Model Evaluation...")

        results, perf_df, effect_df = run_evaluation(
            val_df,
            cfg=CONFIG,
            out_dir="evaluation"
        )

        log.info("Evaluation complete. Results processed.")

    except Exception as e:
        log.critical(f"A critical error occurred during evaluation:\n{traceback.format_exc()}")
        sys.exit(1)

⚠️ Could not import training config, using defaults
✅ Loaded validation data: (382, 6)

Starting Model Evaluation...
=== MODEL EVALUATION PIPELINE ===
🔍 Focus: Calculating MCE (Maximum Calibration Error)
✅ Validation data loaded from val_df: (382, 5)
✅ Loaded 6 models

▶ Evaluating CatBoost...
✅ CatBoost: AUC=0.850, MCE=0.096, ECE=0.032

▶ Evaluating LightGBM...
✅ LightGBM: AUC=0.814, MCE=0.088, ECE=0.034

▶ Evaluating LogisticRegression...
✅ LogisticRegression: AUC=0.751, MCE=0.101, ECE=0.039

▶ Evaluating RandomForest...
✅ RandomForest: AUC=0.805, MCE=0.106, ECE=0.047

▶ Evaluating SVM...
✅ SVM: AUC=0.768, MCE=0.105, ECE=0.057

▶ Evaluating XGBoost...
✅ XGBoost: AUC=0.838, MCE=0.165, ECE=0.077

=== Performance Summary (key metrics) ===
                    AUC_ROC  AUPRC  Accuracy     F1  Recall  Precision  calibration_slope  calibration_intercept
Model                                                                                                           
CatBoost              0.85

**5. Threshold Optimization**  
Determine the optimal probability thresholds using metrics such as Youden’s J, F1-score, and balanced accuracy.

In [11]:

import os
import csv
import argparse
import logging
import sys
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import joblib
from pathlib import Path
from joblib import Parallel, delayed
from sklearn.metrics import (
    f1_score, accuracy_score, balanced_accuracy_score, confusion_matrix,
    roc_auc_score, average_precision_score
)
from sklearn.utils import resample
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable
from types import MappingProxyType
from matplotlib.ticker import MultipleLocator, AutoMinorLocator
from typing import Dict, Any

plt.rcParams['font.family'] = 'DejaVu Sans'


log_filename = f'eval_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)


warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)



def setup_logger(config):

    return logging.getLogger("Placeholder")

class NestedCVPipeline:


    def __init__(self, X: pd.DataFrame, y: pd.Series, cfg: Dict[str, Any]):
        self.X = X
        self.y = y
        self.cfg = cfg

        pass

class HybridFeatureSelector:

    def __init__(self, cfg: Dict[str, Any]):

        self.n_features = cfg['feature_selection']['n_features']
        self.shap_weight = cfg['feature_selection']['shap_weight']
        self.tree_weight = cfg['feature_selection']['tree_weight']
        self.max_samples_shap = cfg['feature_selection']['max_samples_shap']
        self.random_state = cfg['random_state']
        self.selected_features_ = None

    def fit(self, X, y=None):

        return self

    def transform(self, X):

        if self.selected_features_ is not None and isinstance(X, pd.DataFrame):
            return X[self.selected_features_]


        return X

def calculate_metrics(y_true: np.ndarray,
                      y_pred_proba: np.ndarray,
                      thresholds: np.ndarray) -> Dict[str, list]:

    metrics = {
        'thresholds': list(thresholds), 'f1_scores': [], 'accuracy': [],
        'sensitivity': [], 'specificity': [], 'youden_j': [],
        'balanced_accuracies': [], 'TN': [], 'FP': [], 'FN': [], 'TP': [],
        'NPV': [], 'PPV': [], 'FPR': [], 'FNR': []
    }

    for t in thresholds:
        y_pred = (y_pred_proba >= t).astype(int)
        try:
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
            npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
            ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0.0
        except ValueError:
            sensitivity, specificity, npv, ppv, fpr, fnr = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
            tn = fp = fn = tp = 0

        metrics['f1_scores'].append(f1_score(y_true, y_pred, zero_division=0))
        metrics['accuracy'].append(accuracy_score(y_true, y_pred))
        metrics['sensitivity'].append(sensitivity)
        metrics['specificity'].append(specificity)
        metrics['youden_j'].append(sensitivity + specificity - 1)
        metrics['balanced_accuracies'].append(balanced_accuracy_score(y_true, y_pred))
        metrics['TN'].append(tn); metrics['FP'].append(fp)
        metrics['FN'].append(fn); metrics['TP'].append(tp)
        metrics['NPV'].append(npv); metrics['PPV'].append(ppv)
        metrics['FPR'].append(fpr); metrics['FNR'].append(fnr)

    return metrics



def sensitivity_score(y_true_bs, y_pred_bs):
    try:
        tn, fp, fn, tp = confusion_matrix(y_true_bs, y_pred_bs, labels=[0, 1]).ravel()
        return tp / (tp + fn) if (tp + fn) > 0 else 0.0
    except ValueError: return 0.0

def specificity_score(y_true_bs, y_pred_bs):
    try:
        tn, fp, fn, tp = confusion_matrix(y_true_bs, y_pred_bs, labels=[0, 1]).ravel()
        return tn / (tn + fp) if (tn + fp) > 0 else 0.0
    except ValueError: return 0.0

F1_FUNC = lambda yt, yp: f1_score(yt, yp, zero_division=0)
METRIC_FUNCTIONS = {
    'f1': F1_FUNC,
    'accuracy': accuracy_score,
    'sensitivity': sensitivity_score,
    'specificity': specificity_score
}

def _bootstrap_single_metric(metric_func: Callable, y_true: np.ndarray, y_pred: np.ndarray, seed: int) -> float:

    try:
        indices = resample(np.arange(len(y_true)), replace=True, n_samples=len(y_true), random_state=seed)
        if len(np.unique(y_true[indices])) < 2: return 0.0
        return metric_func(y_true[indices], y_pred[indices])
    except Exception: return 0.0


def _bootstrap_single_score_metric(metric_func: Callable, y_true: np.ndarray, y_score: np.ndarray, seed: int) -> float:

    try:
        indices = resample(np.arange(len(y_true)), replace=True, n_samples=len(y_true), random_state=seed)
        if len(np.unique(y_true[indices])) < 2: return 0.0
        return metric_func(y_true[indices], y_score[indices])
    except Exception: return 0.0

def bootstrap_metric_ci(metric_func: Callable, y_true: np.ndarray, y_score: np.ndarray, threshold: float,
                        n_bootstrap: int = 1000, n_jobs: int = -1, seed: int = 42) -> Tuple[float, float]:

    y_pred = (y_score >= threshold).astype(int)
    rng = np.random.RandomState(seed)
    seeds = rng.randint(np.iinfo(np.int32).max, size=n_bootstrap)

    boot_metrics = Parallel(n_jobs=n_jobs)(
        delayed(_bootstrap_single_metric)(metric_func, y_true, y_pred, seed=s) for s in seeds
    )
    return (np.percentile(boot_metrics, 2.5), np.percentile(boot_metrics, 97.5))


def bootstrap_score_metric_ci(metric_func: Callable, y_true: np.ndarray, y_score: np.ndarray,
                              n_bootstrap: int = 1000, n_jobs: int = -1, seed: int = 42) -> Tuple[float, float]:

    rng = np.random.RandomState(seed)
    seeds = rng.randint(np.iinfo(np.int32).max, size=n_bootstrap)
    boot_metrics = Parallel(n_jobs=n_jobs)(
        delayed(_bootstrap_single_score_metric)(metric_func, y_true, y_score, seed=s) for s in seeds
    )
    valid_metrics = [m for m in boot_metrics if not np.isnan(m)]
    if not valid_metrics: return (0.0, 0.0)
    return (np.percentile(valid_metrics, 2.5), np.percentile(valid_metrics, 97.5))

def bootstrap_all_cis(y_true: np.ndarray, y_pred_proba: np.ndarray, threshold: float,
                      n_bootstrap: int = 1000, n_jobs: int = -1, seed: int = 42) -> Dict[str, Tuple[float, float]]:

    f1_func = lambda yt, yp: f1_score(yt, yp, zero_division=0)


    metric_functions = {
        'f1': f1_func,
        'accuracy': accuracy_score,
        'sensitivity': sensitivity_score,
        'specificity': specificity_score
    }

    ci_results = {}

    for name, func in metric_functions.items():
        try:
            ci_results[name] = bootstrap_metric_ci(
                func, y_true, y_pred_proba, threshold, n_bootstrap, n_jobs, seed
            )
        except Exception as e:
            logging.warning(f"Failed to calculate CI for {name}: {e}")
            ci_results[name] = (0.0, 1.0)

    return ci_results



def get_feature_names_from_column_transformer(column_transformer) -> List[str]:

    names = []
    for name, pipe, cols in column_transformer.transformers_:
        if name == 'remainder' and pipe == 'drop': continue
        if name == 'cat' and hasattr(pipe, 'named_steps') and 'onehot' in pipe.named_steps:
            try:
                cats = pipe.named_steps['onehot'].categories_
                for c, cat_list in zip(cols, cats):
                    names.extend([f"{c}_{v}" for v in cat_list])
            except Exception: names.extend(cols)
        else: names.extend(cols)
    return names

def get_model_predictions(model_artifact: Dict, X_val: pd.DataFrame, model_name: str = "") -> Tuple[Optional[np.ndarray], int, List[str]]:

    try:
        logging.info(f"Inspecting artifact '{model_name}'. Found keys: {list(model_artifact.keys())}")
        model = model_artifact.get('calibrated_model')
        preprocessor = model_artifact.get('preprocessor')
        metadata = model_artifact.get('metadata', {})

        if not all([model, preprocessor]):
            missing = [k for k,v in {'calibrated':model, 'preprocessor':preprocessor}.items() if not v]
            raise ValueError(f"Missing required components in '{model_name}': {missing}")

        selected_features = metadata.get('selected_features', [])
        all_features = metadata.get('all_features', [])

        try:
            X_val_preprocessed = preprocessor.transform(X_val)
        except Exception as e:
            logging.error(f"Preprocessing failed for '{model_name}': {e}")
            return None, 0, []


        if not all_features:
            try:
                all_features = get_feature_names_from_column_transformer(preprocessor)
            except Exception:
                all_features = [f'feature_{i}' for i in range(X_val_preprocessed.shape[1])]



        if hasattr(X_val_preprocessed, 'toarray'):
            X_val_preprocessed = X_val_preprocessed.toarray()


        current_cols_count = X_val_preprocessed.shape[1]

        if current_cols_count != len(all_features):
             logging.warning(f"Feature count mismatch for '{model_name}'. Expected {len(all_features)}, got {current_cols_count}. Attempting re-indexing.")

        try:

            X_val_df_full = pd.DataFrame(X_val_preprocessed, columns=all_features, index=X_val.index)
        except ValueError:

            placeholder_cols = [f'_col_{i}' for i in range(current_cols_count)]
            X_val_df_temp = pd.DataFrame(X_val_preprocessed, columns=placeholder_cols, index=X_val.index)

            X_val_df_full = X_val_df_temp.reindex(columns=all_features, fill_value=0.0)


        if selected_features:

            X_val_final = X_val_df_full[selected_features].copy()


            missing_features = set(selected_features) - set(X_val_final.columns)
            if missing_features:
                 logging.warning(f"Model '{model_name}' missing {len(missing_features)} features in final set.")

            num_features_used = len(selected_features)
            logging.info(f"Selected {num_features_used} features for '{model_name}'")

        else:
            logging.warning(f"No selected features stored for '{model_name}', using all features.")
            X_val_final = X_val_df_full
            num_features_used = X_val_final.shape[1]


        X_val_final = X_val_final.apply(pd.to_numeric, errors='coerce').fillna(0)

        if X_val_final.shape[1] == 0:
            logging.error(f"No features available for prediction in '{model_name}'")
            return None, 0, []


        try:
            if "lightgbm" in str(type(model)).lower():
                y_pred_proba = model.predict_proba(X_val_final, predict_disable_shape_check=True)[:, 1]
            else: y_pred_proba = model.predict_proba(X_val_final)[:, 1]
            logging.info(f"✅ Predictions generated for '{model_name}' using {num_features_used} features.")
            return y_pred_proba, num_features_used, list(X_val_final.columns)
        except Exception as e:
            try:
                logging.warning("Retrying prediction with numpy array")
                y_pred_proba = model.predict_proba(X_val_final.values)[:, 1]
                logging.info(f"✅ Predictions generated for '{model_name}' using {num_features_used} features (array input).")
                return y_pred_proba, num_features_used, list(X_val_final.columns)
            except Exception as e2:
                logging.error(f"❌ Prediction failed for '{model_name}': {e2}")
                return None, 0, []
    except Exception as e:
        logging.error(f"❌ Failed to get predictions for '{model_name}': {str(e)}", exc_info=True)
        return None, 0, []


def create_subgroup_masks(df_processed: pd.DataFrame) -> Dict[str, np.ndarray]:

    masks = {}
    if 'Age' in df_processed.columns:
        age_median = df_processed['Age'].median()
        masks['Age < Median'] = (df_processed['Age'] < age_median).values
        masks['Age >= Median'] = (df_processed['Age'] >= age_median).values
    categorical_features = [
        'Institution Type_Lab', 'Institution Type_Hospital',
        'Healthcare Sector_Governmental', 'Healthcare Sector_Private',
        'Bacteria type_E. coli', 'Bacteria type_Klebsiella Spp', 'Bacteria type_Pseudomonas Spp',
        'Gender_F', 'Gender_M'
    ]
    for feat in [f for f in categorical_features if f in df_processed.columns]:
        masks[f'{feat} = 1'] = (df_processed[feat] == 1).values
        masks[f'{feat} = 0'] = (df_processed[feat] == 0).values
    return masks


def evaluate_subgroup(y_true: np.ndarray, y_pred_proba: np.ndarray, mask: np.ndarray,
                      thresholds: np.ndarray, n_bootstrap: int, n_jobs: int, seed: int) -> Optional[Dict]:

    if not mask.any() or mask.sum() < 10:
        return None
    y_true_g, y_pred_proba_g = y_true[mask], y_pred_proba[mask]
    if len(np.unique(y_true_g)) < 2:
        return None

    metrics = calculate_metrics(y_true_g, y_pred_proba_g, thresholds)


    opt_idx = np.argmax(metrics['youden_j'])
    opt_thr = float(thresholds[opt_idx])

    results = {
        'n_samples': int(mask.sum()),
        'threshold_youden': opt_thr,
        'youden_j': metrics['youden_j'][opt_idx],
        'f1': metrics['f1_scores'][opt_idx],
        'accuracy': metrics['accuracy'][opt_idx],
        'sensitivity': metrics['sensitivity'][opt_idx],
        'specificity': metrics['specificity'][opt_idx],
        'balanced_accuracy': metrics['balanced_accuracies'][opt_idx],
    }

    try:
        ci = bootstrap_all_cis(y_true_g, y_pred_proba_g, opt_thr, n_bootstrap, n_jobs, seed)
        results.update({
            'youden_j_ci': f"[{ci.get('youden_j', (0,0))[0]:.3f}, {ci.get('youden_j', (0,0))[1]:.3f}]",
            'f1_ci': f"[{ci['f1'][0]:.3f}, {ci['f1'][1]:.3f}]",
            'accuracy_ci': f"[{ci['accuracy'][0]:.3f}, {ci['accuracy'][1]:.3f}]",
            'sensitivity_ci': f"[{ci['sensitivity'][0]:.3f}, {ci['sensitivity'][1]:.3f}]",
            'specificity_ci': f"[{ci['specificity'][0]:.3f}, {ci['specificity'][1]:.3f}]",
        })
    except Exception as e:
        logging.warning(f"Bootstrap CI calculation failed: {e}")
        results.update({
            'f1_ci': "[0.000, 1.000]",
            'accuracy_ci': "[0.000, 1.000]",
            'sensitivity_ci': "[0.000, 1.000]",
            'specificity_ci': "[0.000, 1.000]",
        })

    return results


def run_subgroup_analysis(y_pred_proba: np.ndarray, y_true: np.ndarray, df_processed: pd.DataFrame,
                          thresholds: np.ndarray, n_bootstrap: int, n_jobs: int, seed: int) -> Dict[str, Dict]:

    masks = create_subgroup_masks(df_processed)
    results = {}
    for group_name, mask in masks.items():
        subgroup_result = evaluate_subgroup(y_true, y_pred_proba, mask, thresholds, n_bootstrap, n_jobs, seed)
        if subgroup_result:
            results[group_name] = subgroup_result
    return results



def _compute_ci_for_threshold(y_true, y_pred_proba, bootstrap_indices, metric_key, threshold):
    boot_vals = []
    for idx in bootstrap_indices:
        try:
            y_true_b, y_proba_b = y_true[idx], y_pred_proba[idx]
            y_pred_b = (y_proba_b >= threshold).astype(int)
            if metric_key == 'f1_scores': value = f1_score(y_true_b, y_pred_b, zero_division=0)
            elif metric_key == 'accuracy': value = accuracy_score(y_true_b, y_pred_b)
            elif metric_key == 'sensitivity':
                tn, fp, fn, tp = confusion_matrix(y_true_b, y_pred_b, labels=[0,1]).ravel()
                value = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            elif metric_key == 'specificity':
                tn, fp, fn, tp = confusion_matrix(y_true_b, y_pred_b, labels=[0,1]).ravel()
                value = tn / (tn + fp) if (tn + fp) > 0 else 0.0
            elif metric_key == 'youden_j':
                tn, fp, fn, tp = confusion_matrix(y_true_b, y_pred_b, labels=[0,1]).ravel()
                sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
                spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
                value = sens + spec - 1
            elif metric_key == 'balanced_accuracies': value = balanced_accuracy_score(y_true_b, y_pred_b)
            else: value = 0.0
            boot_vals.append(value)
        except Exception: boot_vals.append(0.0)
    if boot_vals: return np.percentile(boot_vals, 2.5), np.percentile(boot_vals, 97.5)
    else: return 0.0, 1.0

def plot_threshold_metrics(metrics: dict, optimal_thresholds: dict, metrics_at_optimal: dict, model_name: str,
                           output_path: str, y_true: np.ndarray, y_pred_proba: np.ndarray,
                           n_bootstrap_plot: int = 1000, seed: int = 42) -> None:


    matplotlib.rcParams.update({'font.family': 'DejaVu Sans', 'font.size': 7, 'axes.titlesize': 7, 'axes.labelsize': 7, 'xtick.labelsize': 6, 'ytick.labelsize': 6, 'legend.fontsize': 7, 'lines.linewidth': 1.1, 'axes.linewidth': 0.7, 'figure.dpi': 600})
    color_palette = sns.color_palette("colorblind", n_colors=6)
    metric_map = {'f1_scores': ('F1 score', color_palette[0]), 'accuracy': ('Accuracy', color_palette[1]), 'sensitivity': ('Sensitivity', color_palette[2]), 'specificity': ('Specificity', color_palette[3]), 'youden_j': ("Youden's J", color_palette[4]), 'balanced_accuracies': ('Balanced Acc.', color_palette[5])}
    rng = np.random.RandomState(seed)
    bootstrap_indices = [rng.choice(len(y_true), size=len(y_true), replace=True) for _ in range(n_bootstrap_plot)]
    thresholds = metrics['thresholds']
    ci_bands = {}
    for metric_key, (label, color) in metric_map.items():
        results = Parallel(n_jobs=-1)(delayed(_compute_ci_for_threshold)(y_true, y_pred_proba, bootstrap_indices, metric_key, t) for t in thresholds)
        lower, upper = [r[0] for r in results], [r[1] for r in results]
        ci_bands[metric_key] = (lower, upper)
    fig, ax = plt.subplots(figsize=(6.5, 3.4))
    for key, (label, color) in metric_map.items():
        ax.plot(thresholds, metrics[key], label=label, color=color, linewidth=1.2)
        ax.fill_between(thresholds, ci_bands[key][0], ci_bands[key][1], color=color, alpha=0.15, linewidth=0)
    line_styles = {'F1': ('--', 'black'), 'YoudenJ': (':', color_palette[4]), 'BalancedAcc': ('-.', color_palette[5])}
    for name, (linestyle, col) in line_styles.items():
        thr = optimal_thresholds.get(name)
        if thr is not None:
            display_label = {"F1": "F1", "YoudenJ": "Youden's J", "BalancedAcc": "Balanced Acc"}[name]
            ax.axvline(thr, ls=linestyle, color=col, label=f"{display_label} thr: {thr:.2f}", linewidth=1.1)
    ax.set_xlabel("Threshold", labelpad=4); ax.set_ylabel("Metric value", labelpad=4)
    ax.set_ylim(-0.03, 1.03); ax.set_xlim(thresholds[0], thresholds[-1])
    ax.xaxis.set_major_locator(MultipleLocator(0.2)); ax.xaxis.set_minor_locator(MultipleLocator(0.05))
    ax.yaxis.set_minor_locator(AutoMinorLocator(2))
    ax.grid(axis='y', linestyle="--", color="gray", alpha=0.14, linewidth=0.65)
    ax.tick_params(direction='out', length=4.5, width=0.75)
    for spine in ['right', 'top']: ax.spines[spine].set_visible(False)
    ax.legend(loc='center left', bbox_to_anchor=(1.01, 0.51), frameon=False, borderaxespad=0)
    fig.suptitle(f"Threshold-wise performance curves ({model_name})", y=1.01, fontsize=8.8, fontweight='regular')
    plt.tight_layout(rect=(0, 0, 1, 0.965))
    output_path = Path(output_path)
    if output_path.suffix.lower() != '.png': output_path = output_path.with_suffix('.png')
    fig.savefig(str(output_path), dpi=600, bbox_inches='tight', facecolor='white')
    fig.savefig(str(output_path.with_suffix('.pdf')), bbox_inches='tight', facecolor='white')
    plt.close(fig)
    print(f"threshold metrics plot saved to: {output_path}")



def print_summary_table(title: str, results_dict: dict, is_global: bool = False):

    print("\n" + "=" * 80)
    print(f"🔍 {title}")
    print("=" * 80)
    if is_global:
        header = f"{'Metric':<20} {'Value':<10} {'95% CI':<20} {'Optimal Threshold'}"
        print(header)
        print("-" * len(header))
        g = results_dict

        print(f"{'Youden\'s J':<20} {g['youden_j']:<10.3f} {'-':<20} {g['threshold_youdenj']:.2f} (Youden)")
        print(f"{'Sensitivity':<20} {g['sensitivity']:<10.3f} {g['sensitivity_ci']}")
        print(f"{'Specificity':<20} {g['specificity']:<10.3f} {g['specificity_ci']}")
        print(f"{'F1 Score':<20} {g['f1']:<10.3f} {g['f1_ci']}")
        print(f"{'Accuracy':<20} {g['accuracy']:<10.3f} {g['accuracy_ci']}")
        print(f"{'Balanced Accuracy':<20} {g['balanced_accuracy']:<10.3f}")
    else:

        header = f"{'Subgroup':<35} {'N':<6} {'Youden':<8} {'Sens':<8} {'Spec':<8} {'Threshold':<10}"
        print(header)
        print("-" * len(header))
        for group, metrics in results_dict.items():
            print(f"{group:<35} {metrics['n_samples']:<6} {metrics['youden_j']:<8.3f} "
                  f"{metrics['sensitivity']:<8.3f} {metrics['specificity']:<8.3f} {metrics['threshold_youden']:<10.2f}")
    print("=" * 80 + "\n")


def save_results_to_csv(data: List[Dict], output_path: Path):

    if not data:
        logging.warning(f"No data to save to {output_path}.")
        return
    try:
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=data[0].keys())
            writer.writeheader()
            writer.writerows(data)
        logging.info(f"💾 Saved results to {output_path}")
    except Exception as e:
        logging.error(f"Failed to save CSV to {output_path}: {e}")



def main(args):

    models_dir = Path(args.models_dir)
    output_dir = Path(args.output_dir)
    plots_dir = output_dir / 'plots'; plots_dir.mkdir(parents=True, exist_ok=True)
    reports_dir = output_dir / 'reports'; reports_dir.mkdir(parents=True, exist_ok=True)
    try:
        validation_data = pd.read_csv(args.data_path)
        if args.target_col not in validation_data.columns: raise ValueError(f"Target column '{args.target_col}' not found.")
        X_val = validation_data.drop(columns=[args.target_col])
        y_val = validation_data[args.target_col].values
        logging.info(f"Loaded validation data: {X_val.shape[0]} samples from {args.data_path}")
    except Exception as e:
        logging.error(f"Fatal: Failed to load data. {e}", exc_info=True)
        return
    global_results_list = []
    thresholds = np.arange(0.0, 1.01, 0.01)
    model_files = list(models_dir.glob('*_final_calibrated_model.pkl'))
    if not model_files:
        logging.error(f"No final model files (*_final_calibrated_model.pkl) found in '{models_dir}'.")
        return
    for model_file in model_files:
        model_name = model_file.stem.replace('_final_calibrated_model', '')
        logging.info(f"\n{'='*40}\nEVALUATING MODEL: {model_name}\n{'='*40}")
        try:
            artifact = joblib.load(model_file)
            y_pred_proba, n_features, sel_features = get_model_predictions(artifact, X_val, model_name)
            if y_pred_proba is None: continue


            global_metrics = calculate_metrics(y_val, y_pred_proba, thresholds)


            opt_f1_idx = np.argmax(global_metrics['f1_scores'])
            opt_thr_f1 = float(thresholds[opt_f1_idx])

            opt_youdenj_idx = np.argmax(global_metrics['youden_j'])
            opt_thr_youdenj = float(thresholds[opt_youdenj_idx])

            opt_balacc_idx = np.argmax(global_metrics['balanced_accuracies'])
            opt_thr_balacc = float(thresholds[opt_balacc_idx])


            global_ci = bootstrap_all_cis(y_val, y_pred_proba, opt_thr_youdenj, args.n_bootstrap, args.n_jobs, args.seed)

            metrics_at_f1 = {'f1': global_metrics['f1_scores'][opt_f1_idx], 'accuracy': global_metrics['accuracy'][opt_f1_idx], 'sensitivity': global_metrics['sensitivity'][opt_f1_idx], 'specificity': global_metrics['specificity'][opt_f1_idx], 'youden_j': global_metrics['youden_j'][opt_f1_idx], 'balanced_accuracy': global_metrics['balanced_accuracies'][opt_f1_idx]}
            metrics_at_youdenj = {'f1': global_metrics['f1_scores'][opt_youdenj_idx], 'accuracy': global_metrics['accuracy'][opt_youdenj_idx], 'sensitivity': global_metrics['sensitivity'][opt_youdenj_idx], 'specificity': global_metrics['specificity'][opt_youdenj_idx], 'youden_j': global_metrics['youden_j'][opt_youdenj_idx], 'balanced_accuracy': global_metrics['balanced_accuracies'][opt_youdenj_idx]}
            metrics_at_balacc = {'f1': global_metrics['f1_scores'][opt_balacc_idx], 'accuracy': global_metrics['accuracy'][opt_balacc_idx], 'sensitivity': global_metrics['sensitivity'][opt_balacc_idx], 'specificity': global_metrics['specificity'][opt_balacc_idx], 'youden_j': global_metrics['youden_j'][opt_balacc_idx], 'balanced_accuracy': global_metrics['balanced_accuracies'][opt_balacc_idx]}

            optimal_thresholds = MappingProxyType({'F1': opt_thr_f1, 'YoudenJ': opt_thr_youdenj, 'BalancedAcc': opt_thr_balacc})
            metrics_at_optimal = {'F1': metrics_at_f1, 'YoudenJ': metrics_at_youdenj, 'BalancedAcc': metrics_at_balacc}


            result_row = {
                'model': model_name,
                'n_features': n_features,
                'threshold_f1': opt_thr_f1,
                'threshold_youdenj': opt_thr_youdenj,
                'threshold_balancedacc': opt_thr_balacc,

                'youden_j': metrics_at_youdenj['youden_j'],
                'f1': metrics_at_youdenj['f1'],
                'f1_ci': f"[{global_ci['f1'][0]:.3f}, {global_ci['f1'][1]:.3f}]",
                'accuracy': metrics_at_youdenj['accuracy'],
                'accuracy_ci': f"[{global_ci['accuracy'][0]:.3f}, {global_ci['accuracy'][1]:.3f}]",
                'balanced_accuracy': metrics_at_youdenj['balanced_accuracy'],
                'sensitivity': metrics_at_youdenj['sensitivity'],
                'sensitivity_ci': f"[{global_ci['sensitivity'][0]:.3f}, {global_ci['sensitivity'][1]:.3f}]",
                'specificity': metrics_at_youdenj['specificity'],
                'specificity_ci': f"[{global_ci['specificity'][0]:.3f}, {global_ci['specificity'][1]:.3f}]",
                'selected_features_preview': str(sel_features[:3]) + '...' if len(sel_features) > 3 else str(sel_features)
            }
            global_results_list.append(result_row)
            print_summary_table(f"GLOBAL RESULTS: {model_name}", result_row, is_global=True)

            preprocessor = artifact.get('preprocessor')
            if preprocessor:
                X_val_processed = pd.DataFrame(preprocessor.transform(X_val), columns=get_feature_names_from_column_transformer(preprocessor))
                subgroup_results = run_subgroup_analysis(y_pred_proba, y_val, X_val_processed, thresholds, args.n_bootstrap, args.n_jobs, args.seed)
                if subgroup_results:
                    print_summary_table(f"SUBGROUP RESULTS: {model_name}", subgroup_results)
                    subgroup_data = []
                    for k, v in subgroup_results.items():
                        row = {
                            'model': model_name,
                            'group': k,

                            'opt_youden_threshold': v['threshold_youden'],
                            **v
                        }
                        subgroup_data.append(row)
                    save_results_to_csv(subgroup_data, reports_dir / f"{model_name}_subgroups_optimal_thresholds_summary.csv")
            else: logging.warning(f"Could not find a preprocessor for '{model_name}'. Skipping subgroup analysis.")

            if args.plot_results:
                plot_path = plots_dir / f"{model_name}_threshold_metrics_plot.png"
                plot_threshold_metrics(global_metrics, dict(optimal_thresholds), metrics_at_optimal, model_name, plot_path, y_val, y_pred_proba, n_bootstrap_plot=args.n_bootstrap_plot, seed=args.seed)
        except Exception as e:
            logging.error(f"Unexpected error while processing model '{model_name}': {e}", exc_info=True)
            continue
    if global_results_list:
        save_results_to_csv(global_results_list, reports_dir / "all_models_global_results.csv")
    logging.info(f"\n=== EVALUATION COMPLETE ===\nProcessed {len(global_results_list)} models successfully.")
    logging.info(f"All outputs saved in '{output_dir}'. Log file: '{log_filename}'")




if __name__ == "__main__":


    NCV_DEFAULT_MODELS_PATH = str(Path("nested_cv_output") / "logloss" / "models")

    parser = argparse.ArgumentParser(
        description="Enhanced Model Evaluation Pipeline with Comprehensive Subgroup Analysis.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--data_path', type=str, default='validation_set.csv', help='Path to the validation data CSV file.')

    parser.add_argument('--models_dir', type=str, default=NCV_DEFAULT_MODELS_PATH,
                        help='Directory containing trained model .pkl files (e.g., nested_cv_output/logloss/models).')

    parser.add_argument('--output_dir', type=str, default='Threshold_results', help='Directory to save all outputs.')
    parser.add_argument('--target_col', type=str, default='MDR status', help='Name of the target/outcome column in the data file.')
    parser.add_argument('--n_bootstrap', type=int, default=1000, help='Number of bootstrap iterations for confidence intervals.')
    parser.add_argument('--n_bootstrap_plot', type=int, default=1000, help='Number of bootstrap iterations for plot CI bands.')
    parser.add_argument('--n_jobs', type=int, default=-1, help='Number of CPU cores for parallel processing (-1 means all).')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility of bootstrap sampling.')
    parser.add_argument('--no-plots', action='store_false', dest='plot_results', help='Add this flag to disable plot generation.')

    if 'ipykernel' in sys.modules and 'google.colab' in sys.modules:
        logging.info("Running in Colab environment - using default arguments.")
        args = parser.parse_args([])
    else:
        args = parser.parse_args()

    main(args)


🔍 GLOBAL RESULTS: LogisticRegression
Metric               Value      95% CI               Optimal Threshold
----------------------------------------------------------------------
Youden's J           0.403      -                    0.50 (Youden)
Sensitivity          0.709      [0.646, 0.772]
Specificity          0.694      [0.628, 0.759]
F1 Score             0.712      [0.658, 0.761]
Accuracy             0.702      [0.654, 0.749]
Balanced Accuracy    0.701     


🔍 SUBGROUP RESULTS: LogisticRegression
Subgroup                            N      Youden   Sens     Spec     Threshold 
--------------------------------------------------------------------------------
Age < Median                        190    0.359    0.707    0.652    0.43      
Age >= Median                       192    0.391    0.847    0.544    0.54      
Institution Type_Lab = 1            280    0.345    0.801    0.544    0.50      
Institution Type_Lab = 0            102    0.329    0.909    0.420    0.19      
Instit

**6. Model Re-evaluation (Post-Threshold Optimization)**          
Reassess model performance on the validation set after applying optimized thresholds.

In [12]:




!pip install -q statsmodels mlxtend joblib seaborn matplotlib pandas numpy scikit-learn scipy

import os
import re
from pathlib import Path
from itertools import combinations
from typing import Dict, Tuple, List, Optional

import joblib
import logging
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from pathlib import Path

from sklearn.calibration import calibration_curve
from sklearn.metrics import (
    accuracy_score, average_precision_score, balanced_accuracy_score,
    brier_score_loss, cohen_kappa_score, confusion_matrix, f1_score,
    matthews_corrcoef, precision_score, recall_score, roc_auc_score,
    roc_curve, precision_recall_curve, log_loss
)
from sklearn.utils import resample
from scipy.special import logit
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.contingency_tables import mcnemar
from mlxtend.evaluate import mcnemar_table
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.multitest import multipletests
from dataclasses import dataclass
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Dict
from scipy.special import expit as sigmoid_func
from scipy.special import expit
from statsmodels.stats.proportion import proportion_confint
from scipy.special import expit, logit


try:
    import statsmodels.api as sm
    STATSMODELS_AVAILABLE = True
except ImportError:
    STATSMODELS_AVAILABLE = False
    warnings.warn("statsmodels not available - some statistical functions will be limited")

try:
    from statsmodels.stats.multitest import multipletests
    MULTITEST_AVAILABLE = True
except ImportError:
    MULTITEST_AVAILABLE = False
    warnings.warn("statsmodels.stats.multitest not available - multiple comparison corrections will be limited")

try:
    from statsmodels.stats.contingency_tables import mcnemar
    MCNEMAR_AVAILABLE = True
except ImportError:
    MCNEMAR_AVAILABLE = False
    warnings.warn("statsmodels.stats.contingency_tables not available - McNemar test will be limited")

try:
    from mlxtend.evaluate import mcnemar_table
    MLXTEND_AVAILABLE = True
except ImportError:
    MLXTEND_AVAILABLE = False
    warnings.warn("mlxtend not available - McNemar table functionality will be limited")

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, brier_score_loss,
    matthews_corrcoef, cohen_kappa_score, roc_curve,
    precision_recall_curve, balanced_accuracy_score, log_loss, confusion_matrix
)


warnings.filterwarnings("ignore", category=FutureWarning)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

print("✅ Environment setup complete!")



for folder in ("Evaluation2", "model_comparison"):
    Path(folder).mkdir(parents=True, exist_ok=True)

@dataclass
class MetricResult:
    value: float
    cilow: float
    cihigh: float
    name: str = ""
    description: str = ""

class HybridFeatureSelector:


    def __init__(self, selected_features=None):
        self.selected_features_ = selected_features or []
        self.selected_features = selected_features or []

    def transform(self, X):

        if hasattr(X, 'columns'):
            return X[self.selected_features_] if self.selected_features_ else X
        else:
            return X

    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)


def calculate_all_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray) -> Dict[str, float]:

    metrics = {}
    n_bins = 10


    metrics['AUC_ROC'] = roc_auc_score(y_true, y_prob)
    metrics['AUPRC'] = average_precision_score(y_true, y_prob)
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Balanced_Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['Recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['Specificity'] = specificity_score(y_true, y_pred, zero_division=0)
    metrics['F1'] = f1_score(y_true, y_pred, zero_division=0)
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)
    metrics['Kappa'] = cohen_kappa_score(y_true, y_pred)


    metrics['LogLoss'] = log_loss(y_true, y_prob)
    metrics['Brier'] = brier_score_loss(y_true, y_prob)


    metrics['ECE'] = calculate_ece(y_true, y_prob, n_bins=n_bins, method='quantile')
    metrics['MCE'] = calculate_mce(y_true, y_prob, n_bins=n_bins, method='quantile')


    intercept, slope = calibration_slope_intercept(y_true, y_prob)
    metrics['Cal_Intercept'] = intercept
    metrics['Cal_Slope'] = slope



    z_statistic, z_p_value = np.nan, np.nan
    hl_statistic, hl_p_value = np.nan, np.nan

    try:

        z_statistic, z_p_value = z_test_standard(y_true, y_prob)
    except Exception as e:
        logging.warning(f"Z-test failed in metric calculation: {e}. Defaulting to NaN.")

    try:

        hl_statistic, hl_p_value = hosmer_lemeshow_test_advanced(y_true, y_prob, n_bins=n_bins)
    except Exception as e:
        logging.warning(f"HL-test failed in metric calculation: {e}. Defaulting to NaN.")

    metrics['Spiegelhalter_Z'] = z_statistic
    metrics['Spiegelhalter_P_Value'] = z_p_value

    metrics['HL_Statistic'] = hl_statistic
    metrics['HL_P_Value'] = hl_p_value

    extended_metrics = classification_extended_metrics(y_true, y_pred)
    metrics.update(extended_metrics)

    return metrics

def classification_extended_metrics(y_true, y_pred):

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    return {
        'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp,
        'NPV': npv, 'PPV': ppv, 'FPR': fpr, 'FNR': fnr
    }

def bootstrap_metrics_ci_for_table(y_true, y_pred, y_prob, n_bootstrap=1000) -> Dict[str, float]:

    ci_results = {}


    z_stat, z_p = np.nan, np.nan
    hl_stat, hl_p = np.nan, np.nan


    point_intercept, point_slope = np.nan, np.nan
    intercept_ci_low, intercept_ci_high = np.nan, np.nan
    slope_ci_low, slope_ci_high = np.nan, np.nan


    def _add_ci(metric_name, metric_fn, use_y_pred=False, **kwargs):


        y_score = y_pred if use_y_pred else y_prob


        _, cilow, cihigh = bootstrap_metric_ci(
            y_true=y_true,
            y_score=y_score,
            metric_fn=metric_fn,
            n_bootstrap=n_bootstrap,
            **kwargs
        )
        ci_results[f'{metric_name}_CI_Low'] = cilow
        ci_results[f'{metric_name}_CI_High'] = cihigh




    _add_ci('AUC_ROC', roc_auc_score)
    _add_ci('AUPRC', average_precision_score)


    _add_ci('F1', f1_score, use_y_pred=True, zero_division=0)
    _add_ci('Accuracy', accuracy_score, use_y_pred=True)
    _add_ci('Precision', precision_score, use_y_pred=True, zero_division=0)
    _add_ci('Recall', recall_score, use_y_pred=True, zero_division=0)
    _add_ci('Specificity', specificity_score, use_y_pred=True, zero_division=0)
    _add_ci('MCC', matthews_corrcoef, use_y_pred=True)
    _add_ci('Kappa', cohen_kappa_score, use_y_pred=True)
    _add_ci('Balanced_Accuracy', balanced_accuracy_score, use_y_pred=True)


    _add_ci('Brier', brier_score_loss)
    _add_ci('LogLoss', log_loss)


    _add_ci('ECE', calculate_ece, n_bins=10, method='quantile')
    _add_ci('MCE', calculate_mce, n_bins=10, method='quantile')


    try:

        point_intercept, point_slope, i_ci, s_ci = calibration_slope_intercept_ci(y_true, y_prob, n_bootstrap=n_bootstrap)


        intercept_ci_low, intercept_ci_high = i_ci
        slope_ci_low, slope_ci_high = s_ci
        point_intercept, point_slope = point_intercept, point_slope


        z_stat, z_p = z_test_standard(y_true, y_prob)
        hl_stat, hl_p = hosmer_lemeshow_test_advanced(y_true, y_prob, n_bins=10)

    except Exception as e:

        logging.warning(f"Failed to calculate specialized CIs/Stats: {e}")



    ci_results['Cal_Intercept_CI_Low'] = intercept_ci_low
    ci_results['Cal_Intercept_CI_High'] = intercept_ci_high
    ci_results['Cal_Slope_CI_Low'] = slope_ci_low
    ci_results['Cal_Slope_CI_High'] = slope_ci_high


    ci_results['Spiegelhalter_P_Value'] = z_p
    ci_results['HL_P_Value'] = hl_p

    return ci_results



def enforce_numeric(df: pd.DataFrame) -> pd.DataFrame:

    if df.empty:
        return df
    df = df.apply(pd.to_numeric, errors="coerce")
    df.dropna(axis=0, how="all", inplace=True)
    df.dropna(axis=1, how="all", inplace=True)
    return df

def get_feature_names_from_column_transformer(ct) -> list:

    feature_names = []
    if not hasattr(ct, 'transformers_'):
        return []
    for name, transformer, cols in ct.transformers_:
        if name == "remainder" and transformer == "drop":
            continue
        if hasattr(transformer, "steps"):
            transformer = transformer.steps[-1][1]
        if hasattr(transformer, "get_feature_names_out"):
            try:
                names = transformer.get_feature_names_out(cols)
                feature_names.extend(names)
                continue
            except Exception:
                pass
        if hasattr(transformer, "categories_"):
            cats = transformer.categories_
            for i, col in enumerate(cols):
                feature_names.extend([f"{col}_{cat}" for cat in cats[i]])
        else:
            feature_names.extend(cols if isinstance(cols, list) else [cols])
    return feature_names

def normalize_model_name(name: str) -> str:

    name = re.sub(r'(_FINAL)?(_final)?_?calibrated_?model\.?pkl?', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\bv\d+\b', '', name)
    name = re.sub(r'[-_]', ' ', name)

    model_map = {
        "catboost": "CatBoost",
        "lightgbm": "LightGBM",
        "logisticregression": "LogisticRegression",
        "logistic regression": "LogisticRegression",
        "randomforest": "RandomForest",
        "random forest": "RandomForest",
        "svm": "SVM",
        "support vector machine": "SVM",
        "xgboost": "XGBoost"
    }

    name_lower = name.lower().strip()
    for key, normalized in model_map.items():
        if key in name_lower:
            return normalized
    return name.strip().title()

def get_model_predictions_comprehensive(artifact: dict, x_val: pd.DataFrame) -> Tuple[Optional[np.ndarray], int, List[str]]:

    try:
        logging.info(f"Inspecting artifact: Found keys: {list(artifact.keys())}")
        model = artifact["calibrated_model"]
        preprocessor = artifact["preprocessor"]
        selector = artifact.get("selector")
        metadata = artifact.get("metadata", {})
        predefined = metadata.get("selected_features")

        if not all([model, preprocessor]):
            missing = [k for k, v in {'calibrated': model, 'preprocessor': preprocessor}.items() if not v]
            raise ValueError(f"Missing required components: {missing}")


        x_proc = preprocessor.transform(x_val)


        if hasattr(x_proc, 'toarray'):
            x_proc = x_proc.toarray()

        feat_names = get_feature_names_from_column_transformer(preprocessor)


        x_df_full = pd.DataFrame(x_proc, index=x_val.index)


        if x_df_full.shape[1] == len(feat_names):
             x_df_full.columns = feat_names
        else:

             x_df_full.columns = [f'__temp_f_{i}' for i in range(x_df_full.shape[1])]


        x_df_aligned = x_df_full.reindex(columns=feat_names, fill_value=0.0)


        if selector is not None and hasattr(selector, "transform"):
            x_sel = selector.transform(x_df_aligned)
            sel_names = list(getattr(selector, 'selected_features_', []))
        elif predefined:

            x_sel = x_df_aligned[predefined].copy()
            sel_names = predefined


            missing = set(predefined) - set(x_sel.columns)
            if missing:
                logging.error(f"Missing critical features: {list(missing)[:3]}...")
                return None, 0, []
        else:
            logging.warning("No feature selection applied, using all aligned features.")
            x_sel = x_df_aligned
            sel_names = feat_names


        x_sel = enforce_numeric(x_sel).fillna(0)
        num_features_used = x_sel.shape[1]

        if num_features_used == 0:
            logging.error("No features available for prediction.")
            return None, 0, []

        try:

            if "lightgbm" in str(type(model)).lower():
                y_pred_proba = model.predict_proba(x_sel, predict_disable_shape_check=True)[:, 1]
            else:
                y_pred_proba = model.predict_proba(x_sel)[:, 1]

            logging.info(f"✅ Predictions generated using {num_features_used} features.")
            return y_pred_proba, num_features_used, sel_names

        except Exception as e:

            try:
                logging.warning("Prediction failed with DataFrame input. Retrying with NumPy array.")
                y_pred_proba = model.predict_proba(x_sel.values)[:, 1]
                return y_pred_proba, num_features_used, sel_names
            except Exception as e2:
                logging.error(f"❌ Final prediction failed: {e2}")
                return None, 0, []

    except Exception as exc:
        logging.error(f"❌ Failed to process model artifact: {str(exc)}", exc_info=True)
        return None, 0, []


def load_validation_data(path="validation_set.csv"):

    if not os.path.exists(path):
        logging.error("Validation data not found: %s", path)
        return None, None
    try:
        df = pd.read_csv(path)
        return df.drop(columns="MDR status"), df["MDR status"].values
    except Exception as e:
        logging.error("Error loading data: %s", e)
        return None, None

def load_calibrated_models(model_dir="trained_models") -> Dict[str, dict]:

    models: Dict[str, dict] = {}
    p = Path(model_dir)
    if not p.exists():
        logging.error("Model directory not found: %s", model_dir)
        return models

    files = sorted(list(p.glob("*_FINAL_final_calibrated_model.pkl")) +
                   list(p.glob("*_final_calibrated_model.pkl")))

    if not files:
        logging.error("No model files found in %s", model_dir)
        return models

    for f in files:
        try:
            name = normalize_model_name(f.stem)
            art = joblib.load(f)

            if "calibrated" in art and "calibrated_model" not in art:
                art["calibrated_model"] = art["calibrated"]
            if "selector" in art and "feature_selector" not in art:
                art["feature_selector"] = art["selector"]
            if "metadata" in art and "selected_features" in art["metadata"]:
                art["selected_features"] = art["metadata"]["selected_features"]

            required_keys = {"calibrated_model", "preprocessor"}
            if not required_keys.issubset(art.keys()):
                missing = required_keys - set(art.keys())
                logging.warning("Skipping %s - missing keys: %s", f.name, missing)
                continue

            models[name] = {"artifact": art}
            logging.info("Loaded model: %s", name)
        except Exception as exc:
            logging.error("Failed to load %s: %s", f.name, exc)
    return models

def load_optimal_thresholds(path="Threshold_results/reports/all_models_global_results.csv") -> Dict[str, float]:

    paths = [
        path,
        "Threshold_results/all_models_global_results.csv",
        "all_models_global_results.csv",
    ]
    for fp in paths:
        if not os.path.exists(fp):
            continue
        try:
            df = pd.read_csv(fp)
            model_col = "Model" if "Model" in df.columns else "model"
            if model_col not in df.columns:
                continue

            df["Normalized_Model"] = df[model_col].apply(normalize_model_name)

            threshold_col = None
            for col_name in ["threshold_youdenj", "youden_threshold", "Threshold", "threshold"]:
                if col_name in df.columns:
                    threshold_col = col_name
                    break

            if threshold_col:
                print(f"DEBUG: Loading thresholds from column '{threshold_col}'")
                return df.set_index("Normalized_Model")[threshold_col].to_dict()
            else:
                logging.warning("No threshold column found in %s", fp)
                continue

        except Exception as e:
            logging.warning("Error loading thresholds from %s: %s", fp, e)

    logging.warning("Using default threshold 0.5")
    return {}



def bootstrap_metric_ci(y_true, y_score, metric_fn, n_bootstrap=1000, alpha=0.05, **kws):

    rng = np.random.default_rng(42)
    vals = []
    for _ in range(n_bootstrap):
        idx = rng.choice(len(y_true), len(y_true), replace=True)
        if len(np.unique(y_true[idx])) < 2:
            continue
        try:
            vals.append(metric_fn(y_true[idx], y_score[idx], **kws))
        except Exception:
            continue
    vals = np.asarray(vals)
    return (
        np.nanmean(vals),
        np.nanpercentile(vals, 100 * alpha / 2),
        np.nanpercentile(vals, 100 * (1 - alpha / 2)),
    )

def calculate_ece(y_true, y_prob, n_bins=10, method='uniform'):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)


    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]

    if len(y_true) == 0:
        return np.nan

    if method == 'quantile':

        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.percentile(y_prob, quantiles * 100)

    else:

        bin_edges = np.linspace(0, 1, n_bins + 1)

    ece = 0.0
    total_samples = len(y_true)

    for i in range(n_bins):

        if i == n_bins - 1:

            mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:

            mask = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask])
            avg_true = np.mean(y_true[mask])
            ece += (n_in_bin / total_samples) * np.abs(avg_pred - avg_true)

    return ece

def calculate_mce(y_true, y_prob, n_bins=10, method='uniform'):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)


    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]

    if len(y_true) == 0:
        return np.nan


    if method == 'quantile':
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.percentile(y_prob, quantiles * 100)
        bin_edges[0] = 0.0
        bin_edges[-1] = 1.0
    else:

        bin_edges = np.linspace(0., 1. + 1e-8, n_bins + 1)

    max_error = 0.0
    total_samples = len(y_true)

    for i in range(n_bins):

        if i == n_bins - 1:
            mask_bin = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:
            mask_bin = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask_bin)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask_bin])
            avg_true = np.mean(y_true[mask_bin])
            error = abs(avg_pred - avg_true)
            if error > max_error:
                max_error = error

    return max_error if max_error > 0 else np.nan


def calibration_slope_intercept(y_true, y_prob, method='logistic'):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob) | np.isnan(y_true))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan

    y_prob = np.clip(y_prob, 1e-6, 1 - 1e-6)
    X_base = logit(y_prob) if method == 'logistic' else y_prob
    X = sm.add_constant(X_base.reshape(-1, 1))

    try:
        mod = sm.GLM(y_true, X, family=sm.families.Binomial())
        res = mod.fit(disp=0)
        intercept, slope = float(res.params[0]), float(res.params[1])
        return intercept, slope
    except Exception:
        return np.nan, np.nan

def calibration_slope_intercept_ci(y_true, y_prob, n_bootstrap=1000, alpha=0.05, seed=42):


    intercept_point, slope_point = calibration_slope_intercept(y_true, y_prob)


    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    rng = np.random.default_rng(seed)
    slopes, intercepts = [], []

    for _ in range(n_bootstrap):
        try:
            indices = rng.choice(len(y_true), len(y_true), replace=True)
            if len(np.unique(y_true[indices])) < 2:
                continue

            int_b, slp_b = calibration_slope_intercept(y_true[indices], y_prob[indices])
            if not (np.isnan(slp_b) or np.isnan(int_b)):
                slopes.append(slp_b)
                intercepts.append(int_b)
        except Exception:
            continue

    if slopes and intercepts:
        slope_ci = (np.percentile(slopes, 100 * alpha/2), np.percentile(slopes, 100 * (1 - alpha/2)))
        intercept_ci = (np.percentile(intercepts, 100 * alpha/2), np.percentile(intercepts, 100 * (1 - alpha/2)))
    else:

        slope_ci = (np.nan, np.nan)
        intercept_ci = (np.nan, np.nan)

    return intercept_point, slope_point, intercept_ci, slope_ci

def z_test_standard(y_true, y_prob):


    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10:
        return np.nan, np.nan


    numerator = np.sum(y_true - y_prob)


    denominator = np.sqrt(np.sum(y_prob * (1 - y_prob)))

    if denominator < 1e-8:
        return np.nan, np.nan


    z_stat = numerator / denominator
    p_value = 2 * (1 - norm.cdf(abs(z_stat)))

    return z_stat, p_value

def hosmer_lemeshow_test_advanced(y_true, y_prob, n_bins=10, min_expected_freq=5):


    if not STATSMODELS_AVAILABLE:
        warnings.warn("statsmodels not available - returning NaN for Hosmer-Lemeshow test")
        return np.nan, np.nan

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 20 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan


    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
    try:
        df['bin'] = pd.qcut(df['y_prob'], n_bins, labels=False, duplicates='drop')
    except ValueError:
        df['bin'] = np.floor(df['y_prob'] * n_bins).astype(int)
        df.loc[df['bin'] == n_bins, 'bin'] = n_bins - 1


    summary = df.groupby('bin').agg(
        observed=('y_true', 'sum'),
        expected=('y_prob', 'sum'),
        n_total=('y_true', 'size')
    ).reset_index()


    while True:
        sparse_bins = summary[summary['expected'] < min_expected_freq]
        if sparse_bins.empty or len(summary) <= 2:
            break


        merge_idx = sparse_bins.index[0]
        if merge_idx == 0:
            summary.loc[1, ['observed', 'expected', 'n_total']] += summary.loc[0, ['observed', 'expected', 'n_total']]
            summary = summary.drop(0).reset_index(drop=True)
        else:
            summary.loc[merge_idx - 1, ['observed', 'expected', 'n_total']] += summary.loc[merge_idx, ['observed', 'expected', 'n_total']]
            summary = summary.drop(merge_idx).reset_index(drop=True)


    g = len(summary)
    summary['variance'] = summary['expected'] * (1 - summary['expected'] / summary['n_total'])
    hl_statistic = ((summary['observed'] - summary['expected'])**2 / (summary['variance'] + 1e-8)).sum()


    df_hl = g - 2
    if df_hl <= 0:
        return hl_statistic, np.nan

    p_value = 1 - chi2.cdf(hl_statistic, df_hl)

    return hl_statistic, p_value

def perform_mcnemar(y_true, y_pred1, y_pred2):

    try:
        table = mcnemar_table(y_target=y_true, y_model1=y_pred1, y_model2=y_pred2)
        return mcnemar(table).pvalue, table
    except Exception:
        return np.nan, None


def specificity_score(y_true, y_pred, zero_division=0):

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        return 0.0
    tn, fp, fn, tp = cm.ravel()
    denom = tn + fp
    if denom == 0:
        return zero_division
    return tn / denom


def cohens_d(group1, group2):

    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        return np.nan
    m1, m2 = np.mean(group1), np.mean(group2)
    s1, s2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
    pooled_std = np.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))
    return np.nan if pooled_std == 0 else (m1 - m2) / pooled_std

def cliffs_delta(group1, group2):

    if len(group1) == 0 or len(group2) == 0:
        return np.nan
    dominance = sum(int(x > y) - int(x < y) for x in group1 for y in group2)
    return dominance / (len(group1) * len(group2))

def interpret_cohens_d(d):
    d = abs(d)
    if np.isnan(d):
        return "Unknown"
    return ("Negligible" if d < 0.2 else
            "Small" if d < 0.5 else
            "Medium" if d < 0.8 else "Large")

def interpret_cliffs_delta(delta):
    d = abs(delta)
    if np.isnan(d):
        return "Unknown"
    return ("Negligible" if d < 0.147 else
            "Small" if d < 0.33 else
            "Medium" if d < 0.474 else "Large")

def mcnemar_effect_size(table):

    if table is None:
        return np.nan
    b, c = table[0, 1], table[1, 0]
    if c == 0:
        return np.inf if b > 0 else np.nan
    return b / c

def interpret_odds_ratio(odds_ratio):
    if np.isnan(odds_ratio):
        return "Unknown"
    if np.isinf(odds_ratio):
        return "Complete dominance"
    return ("Large" if odds_ratio >= 3 or odds_ratio <= 1/3 else
            "Medium" if odds_ratio >= 1.5 or odds_ratio <= 1/1.5 else "Small")

def bootstrap_effect_size_ci(group1, group2, effect_fn, n_bootstrap=1000, alpha=0.05):
    rng = np.random.default_rng(42)
    effects = []
    for _ in range(n_bootstrap):
        idx1 = rng.choice(len(group1), len(group1), replace=True)
        idx2 = rng.choice(len(group2), len(group2), replace=True)
        eff = effect_fn(np.array(group1)[idx1], np.array(group2)[idx2])
        if not np.isnan(eff):
            effects.append(eff)
    if not effects:
        return np.nan, np.nan, np.nan
    effects = np.array(effects)
    return effects.mean(), np.percentile(effects, 100*alpha/2), np.percentile(effects, 100*(1-alpha/2))


def apply_multiple_comparisons_correction(df, pval_cols, alpha=0.05, method='both'):

    df_corrected = df.copy()
    for col in pval_cols:
        if col not in df.columns:
            continue
        pvals = df[col].dropna().values
        valid_idx = df[col].notna()
        if len(pvals) == 0:
            continue
        if method in ['bonferroni', 'both']:
            bonf_pvals = np.minimum(pvals * len(pvals), 1.0)
            df_corrected.loc[valid_idx, f'{col}_bonferroni'] = bonf_pvals
            df_corrected.loc[valid_idx, f'{col}_bonferroni_sig'] = bonf_pvals < alpha
            df_corrected.loc[valid_idx, f'{col}_bonferroni_alpha'] = alpha / len(pvals)
        if method in ['fdr', 'both']:
            _, fdr_pvals, _, _ = multipletests(pvals, alpha=alpha, method='fdr_bh')
            df_corrected.loc[valid_idx, f'{col}_fdr'] = fdr_pvals
            df_corrected.loc[valid_idx, f'{col}_fdr_sig'] = fdr_pvals < alpha
    return df_corrected

def print_correction_summary(df_corrected, original_alpha=0.05):

    print("\n📊 Multiple Comparisons Correction Summary:")
    print("=" * 60)
    pval_cols = [col for col in df_corrected.columns
                 if 'p_value' in col.lower() or 'pval' in col.lower()
                 and not any(x in col for x in ['bonferroni', 'fdr'])]
    for col in pval_cols:
        if col in df_corrected.columns:
            original_sig = (df_corrected[col] < original_alpha).sum()
            total_tests = df_corrected[col].notna().sum()
            print(f"\n{col.upper()}:")
            print(f"  Original significant results (p < {original_alpha}): {original_sig}/{total_tests}")
            if f'{col}_bonferroni_sig' in df_corrected.columns:
                bonf_sig = df_corrected[f'{col}_bonferroni_sig'].sum()
                bonf_alpha = df_corrected[f'{col}_bonferroni_alpha'].iloc[0] if not df_corrected.empty else 'N/A'
                print(f"  Bonferroni significant (α = {bonf_alpha:.4f}): {bonf_sig}/{total_tests}")
            if f'{col}_fdr_sig' in df_corrected.columns:
                fdr_sig = df_corrected[f'{col}_fdr_sig'].sum()
                print(f"  FDR significant (α = {original_alpha}): {fdr_sig}/{total_tests}")

def plot_significance_comparison(corrected_df, pval_cols, out_dir="Evaluation2"):

    fig, axes = plt.subplots(1, len(pval_cols), figsize=(6*len(pval_cols), 5))
    if len(pval_cols) == 1:
        axes = [axes]
    for i, col in enumerate(pval_cols):
        if col not in corrected_df.columns:
            continue
        original_sig = (corrected_df[col] < 0.05).sum()
        bonf_sig = corrected_df.get(f'{col}_bonferroni_sig', pd.Series([])).sum()
        fdr_sig = corrected_df.get(f'{col}_fdr_sig', pd.Series([])).sum()
        methods = ['Original', 'Bonferroni', 'FDR']
        counts = [original_sig, bonf_sig, fdr_sig]
        colors = ['lightblue', 'orange', 'lightgreen']
        bars = axes[i].bar(methods, counts, color=colors, edgecolor='black', linewidth=0.7)
        axes[i].set_title(f'Significant Results: {col}', fontsize=14)
        axes[i].set_ylabel('Number of Significant Tests', fontsize=12)
        for bar, count in zip(bars, counts):
            height = bar.get_height()
            axes[i].text(bar.get_x() + bar.get_width()/2., height + 0.1,
                         f'{int(count)}', ha='center', va='bottom', fontsize=11)
    plt.tight_layout()
    plt.savefig(f"{out_dir}/significance_comparison.png", dpi=450, bbox_inches='tight')
    plt.close()

def pairwise_auprc_comparison_with_effect_size(y_true, model_probs, out=None):

    keys = list(model_probs.keys())
    results = []
    for m1, m2 in combinations(keys, 2):
        p1, p2 = model_probs[m1], model_probs[m2]
        try:
            auprc1 = average_precision_score(y_true, p1)
            auprc2 = average_precision_score(y_true, p2)
            rng, n_boot, diffs, ap1s, ap2s = np.random.default_rng(42), 1000, [], [], []
            for _ in range(n_boot):
                idx = rng.choice(len(y_true), len(y_true), replace=True)
                try:
                    ap1 = average_precision_score(y_true[idx], p1[idx])
                    ap2 = average_precision_score(y_true[idx], p2[idx])
                    diffs.append(ap1 - ap2)
                    ap1s.append(ap1)
                    ap2s.append(ap2)
                except Exception:
                    continue
            if not diffs: continue
            pval = 2 * min((np.array(diffs) > 0).mean(), (np.array(diffs) < 0).mean())
            coh_d = cohens_d(np.array(ap1s), np.array(ap2s))
            cl_delta = cliffs_delta(np.array(ap1s), np.array(ap2s))
            cd_mean, cd_low, cd_high = bootstrap_effect_size_ci(np.array(ap1s), np.array(ap2s), cohens_d)
            results.append({
                "Model_A": m1, "Model_B": m2, "AUPRC_A": auprc1, "AUPRC_B": auprc2,
                "AUPRC_diff_mean": np.mean(diffs), "AUPRC_diff_std": np.std(diffs),
                "AUPRC_p_value": pval, "Cohens_d": coh_d, "Cohens_d_CI_low": cd_low,
                "Cohens_d_CI_high": cd_high, "Cohens_d_interpretation": interpret_cohens_d(coh_d),
                "Cliffs_delta": cl_delta, "Cliffs_delta_interpretation": interpret_cliffs_delta(cl_delta),
            })
        except Exception as e:
            logging.warning(f"Error comparing {m1} vs {m2}: {e}")
            continue
    df = pd.DataFrame(results)
    if not df.empty and 'AUPRC_p_value' in df.columns:
        df_corrected = apply_multiple_comparisons_correction(df, ['AUPRC_p_value'], alpha=0.05, method='both')
        if out:
            df.to_csv(out, index=False)
            corrected_out = str(out).replace('.csv', '_corrected.csv')
            df_corrected.to_csv(corrected_out, index=False)
            logging.info(f"AUPRC comparison results saved to {out}")
        return df_corrected
    return df

def pairwise_mcnemar_with_effect_size(models, x_val, y_true, thresholds, out=None):

    keys = list(models.keys())
    results, model_predictions = [], {}
    for name in keys:
        try:
            y_prob, _, _ = get_model_predictions_comprehensive(models[name]['artifact'], x_val)
            if y_prob is not None:
                thr = thresholds.get(name, 0.5)
                pred = (y_prob >= thr).astype(int)
                model_predictions[name] = pred
        except Exception as e:
            logging.warning(f"Failed to get predictions for {name}: {e}")
            continue
    for m1, m2 in combinations(keys, 2):
        if m1 not in model_predictions or m2 not in model_predictions:
            continue
        pred1, pred2 = model_predictions[m1], model_predictions[m2]
        try:
            p_val, table = perform_mcnemar(y_true, pred1, pred2)
            odds_ratio = mcnemar_effect_size(table)
            interp = interpret_odds_ratio(odds_ratio)
        except Exception as e:
            logging.warning(f"McNemar test failed for {m1} vs {m2}: {e}")
            p_val, odds_ratio, interp = np.nan, np.nan, "Unknown"
        results.append({
            "Model_A": m1, "Model_B": m2, "McNemar_p_value": p_val,
            "Odds_ratio": odds_ratio, "Effect_size_interpretation": interp
        })
    df = pd.DataFrame(results)
    if not df.empty and 'McNemar_p_value' in df.columns:
        df_corrected = apply_multiple_comparisons_correction(df, ['McNemar_p_value'], alpha=0.05, method='both')
        if out:
            df.to_csv(out, index=False)
            corrected_out = str(out).replace('.csv', '_corrected.csv')
            df_corrected.to_csv(corrected_out, index=False)
            logging.info(f"McNemar comparison results saved to {out}")
        return df_corrected
    return df

def create_effect_size_summary(auprc_df, mcnemar_df, out_dir):

    summary_lines = ["# Effect Size Analysis Summary\n"]
    if auprc_df is not None and not auprc_df.empty and "Cohens_d_interpretation" in auprc_df:
        summary_lines.append("## AUPRC Cohen's d:")
        counts = auprc_df['Cohens_d_interpretation'].value_counts().to_dict()
        summary_lines.extend([f"- {k}: {v} comparisons" for k, v in counts.items()])
    if mcnemar_df is not None and not mcnemar_df.empty and "Effect_size_interpretation" in mcnemar_df:
        summary_lines.append("\n## McNemar Odds Ratio Interpretation:")
        counts = mcnemar_df['Effect_size_interpretation'].value_counts().to_dict()
        summary_lines.extend([f"- {k}: {v} comparisons" for k, v in counts.items()])
    Path(out_dir).mkdir(exist_ok=True, parents=True)
    with open(Path(out_dir, "effect_size_summary.txt"), "w") as f:
        f.write("\n".join(summary_lines))
    logging.info(f"Effect size summary saved to {out_dir}/effect_size_summary.txt")



matplotlib.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 6,
    'axes.titlesize': 6,
    'axes.labelsize': 6,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'legend.fontsize': 6,
    'figure.dpi': 450,
})

def plot_heatmap(df: pd.DataFrame, title="Model Performance Heatmap"):

    df = enforce_numeric(df)
    if df.empty:
        logging.warning("No data for heatmap.")
        return
    plt.figure(figsize=(3.5, 3.5))
    sns.heatmap(df, annot=True, fmt=".3f", cmap="viridis", linewidths=0.5, cbar_kws={'shrink':0.7})
    plt.title(title, fontsize=7)
    plt.xlabel('Metrics', fontsize=7)
    plt.ylabel('Models', fontsize=7)
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig("Evaluation2/performance_heatmap.png", dpi=450, bbox_inches='tight')
    plt.close()

def plot_radar_chart(df: pd.DataFrame, title="Radar chart"):
    matplotlib.rcParams['font.family'] = 'DejaVu Sans'
    plt.rcParams['pdf.fonttype'] = 42
    df = enforce_numeric(df)
    if df.empty:
        logging.warning("No data for radar chart.")
        return
    df_norm = (df - df.min()) / (df.max() - df.min() + 1e-9)
    n_metrics = len(df.columns)
    angles = np.linspace(0, 2 * np.pi, n_metrics, endpoint=False).tolist()
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(3.5, 3.5), subplot_kw=dict(polar=True), dpi=450)
    colors = plt.cm.tab10(np.linspace(0, 1, len(df_norm)))
    for idx, (row_name, row) in enumerate(df_norm.iterrows()):
        vals = row.tolist() + [row.iloc[0]]
        ax.plot(angles, vals, label=str(row_name), color=colors[idx], lw=1)
        ax.fill(angles, vals, alpha=0.13, color=colors[idx])
    ax.set_thetagrids(np.degrees(angles[:-1]), df.columns, fontsize=5, fontfamily='DejaVu Sans')
    ax.set_ylim(0, 1)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
    ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], fontsize=5, fontfamily='DejaVu Sans')
    ax.grid(True, alpha=0.27, linewidth=0.45)
    ax.spines['polar'].set_color('black')
    ax.spines['polar'].set_linewidth(0.6)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fontsize=6, frameon=False, borderaxespad=0, ncol=3)
    plt.title(title, fontsize=6, pad=13, fontfamily='DejaVu Sans')
    plt.tight_layout(pad=0.3)
    plt.savefig("Evaluation2/metrics_radar_chart.png", dpi=450, bbox_inches='tight')
    plt.close()

def plot_roc_curves(details: Dict[str, dict]):

    plt.figure(figsize=(3.5, 3.5))
    colors = list(sns.color_palette("colorblind"))

    for idx, (m, res) in enumerate(details.items()):
        if "roc_curve" not in res:
            continue


        roc_data = res["roc_curve"]
        if len(roc_data) == 3:
            fpr, tpr, _ = roc_data
        elif len(roc_data) == 2:
            fpr, tpr = roc_data
        else:
            print(f"⚠️ Unexpected ROC curve format for {m}, skipping")
            continue


        auc = res.get("Metrics", {}).get("AUC_ROC", np.nan)
        plt.plot(fpr, tpr, color=colors[idx % len(colors)], label=f"{m} (AUC = {auc:.3f})", lw=1)

    plt.plot([0, 1], [0, 1], "k--", alpha=0.5, lw=1)
    plt.xlabel("False Positive Rate", fontsize=6)
    plt.ylabel("True Positive Rate", fontsize=6)
    plt.title("ROC Curves", fontsize=7)
    plt.legend(fontsize=5, loc='lower right')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig("Evaluation2/roc_curves.png", dpi=450, bbox_inches='tight')
    plt.close()

def plot_pr_curves(details: Dict[str, dict]):

    plt.figure(figsize=(3.5, 3.5))
    colors = list(sns.color_palette("colorblind"))

    for idx, (m, res) in enumerate(details.items()):
        if "pr_curve" not in res:
            continue


        pr_data = res["pr_curve"]
        if len(pr_data) == 3:
            precision, recall, _ = pr_data
        elif len(pr_data) == 2:
            precision, recall = pr_data
        else:
            print(f"⚠️ Unexpected PR curve format for {m}, skipping")
            continue


        auprc = res.get("Metrics", {}).get("AUPRC", np.nan)
        plt.plot(recall, precision, color=colors[idx % len(colors)], label=f"{m} (AUPRC = {auprc:.3f})", lw=1)

    plt.xlabel("Recall", fontsize=6)
    plt.ylabel("Precision", fontsize=6)
    plt.title("Precision-Recall Curves", fontsize=7)
    plt.legend(fontsize=5, loc='lower left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig("Evaluation2/pr_curves.png", dpi=450, bbox_inches='tight')
    plt.close()

def plot_calibration_curves(results, n_bins=10, save_path="Evaluation2/calibration_plots.png", show_bin_counts=True):

    colors = plt.get_cmap('tab10').colors
    model_names = list(results.keys())
    n_models = len(model_names)
    if n_models == 0: return
    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 6), sharey=True, squeeze=False)
    axes = axes.flatten()

    for i, model in enumerate(model_names):
        res = results[model]
        y_true, y_prob, metrics = np.asarray(res['y_true']), np.asarray(res['y_prob']), res.get('metrics', {})
        try:
            df = pd.DataFrame({'prob': y_prob, 'target': y_true})
            df['bin'], bin_edges = pd.qcut(df['prob'], n_bins, labels=False, retbins=True, duplicates='drop')
        except Exception as e:
            print(f"Quantile binning failed for {model}: {e}, using uniform bins.")
            bin_edges = np.linspace(0, 1, n_bins + 1)
            df['bin'] = np.digitize(df['prob'], bin_edges) - 1
        means, observed, ci_lowers, ci_uppers, bin_counts = [], [], [], [], []
        for b in range(len(bin_edges)-1):
            in_bin = (df['bin'] == b)
            if not np.any(in_bin):
                means.append((bin_edges[b] + bin_edges[b+1]) / 2)
                observed.append(np.nan); ci_lowers.append(np.nan); ci_uppers.append(np.nan); bin_counts.append(0)
                continue
            prob_mean, obs_mean = df.loc[in_bin, 'prob'].mean(), df.loc[in_bin, 'target'].mean()
            n_bin, pos_bin = in_bin.sum(), int(df.loc[in_bin, 'target'].sum())
            ci_lo, ci_hi = proportion_confint(pos_bin, n_bin, method='wilson')
            means.append(prob_mean); observed.append(obs_mean); ci_lowers.append(ci_lo)
            ci_uppers.append(ci_hi); bin_counts.append(n_bin)

        ax = axes[i]
        color = colors[i % len(colors)]
        ax.plot(means, observed, marker='o', color=color, label="Calibration", lw=3, markersize=8)
        ax.fill_between(means, ci_lowers, ci_uppers, color=color, alpha=0.18, label="95% Wilson CI")
        ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Perfect')

        thresh = res.get('youden_threshold', 0.5)
        ax.axvline(x=thresh, color='gray', ls='--', lw=2, label=f"Thresh={thresh:.3f}")

        metric_text = (f"ECE: {metrics.get('ECE', np.nan):.3f}\nMCE: {metrics.get('MCE', np.nan):.3f}\n"
                       f"Slope: {metrics.get('Slope', np.nan):.2f}\nIntercept: {metrics.get('Intercept', np.nan):.2f}\n"
                       f"Brier: {metrics.get('Brier', np.nan):.3f}")
        ax.text(0.62, 0.19, metric_text, color=color, fontsize=11,
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8), transform=ax.transAxes)

        if show_bin_counts:
            for mx, oy, count in zip(means, observed, bin_counts):
                if not np.isnan(oy):
                    ax.annotate(f"n={int(count)}", (mx, oy), textcoords="offset points",
                                xytext=(0,-16), ha='center', fontsize=10, color='dimgray')
        ax.set_xlabel('Mean Predicted Probability', fontsize=14)
        if i == 0: ax.set_ylabel('Fraction of Positives', fontsize=14)
        ax.set_title(f'Calibration: {model}', fontsize=16)
        ax.legend(loc='upper left', fontsize=10); ax.set_xlim([0, 1]); ax.set_ylim([0, 1])
        ax.grid(True, alpha=0.3); ax.set_aspect('equal', 'box'); ax.tick_params(labelsize=12)
    plt.tight_layout()
    plt.savefig(save_path, dpi=450, bbox_inches='tight')
    plt.close()

def plot_overlaid_calibration_curves(results, n_bins=10, save_path="Evaluation2/calibration_overlay.png", show_thresholds=True):

    plt.rcParams.update({
        "font.family": "DejaVu Sans", "font.size": 7, "axes.titlesize": 7,
        "axes.labelsize": 7, "xtick.labelsize": 6, "ytick.labelsize": 6,
        "legend.fontsize": 6, "lines.linewidth": 1, "figure.dpi": 450, "axes.linewidth": 0.7,
    })
    colors = plt.cm.tab10(np.linspace(0, 1, len(results)))
    model_names = list(results.keys())
    plt.figure(figsize=(3.5, 3.5), dpi=450)
    threshold_lines, threshold_labels = [], []

    for i, model in enumerate(model_names):
        res = results[model]
        y_true, y_prob = np.asarray(res['y_true']), np.asarray(res['y_prob'])
        threshold = res.get('youden_threshold', 0.5)
        df = pd.DataFrame({'prob': y_prob, 'target': y_true})
        try:
            df['bin'], bin_edges = pd.qcut(df['prob'], n_bins, labels=False, retbins=True, duplicates='drop')
        except Exception:
            bin_edges = np.linspace(0, 1, n_bins + 1)
            df['bin'] = np.digitize(df['prob'], bin_edges) - 1
        means, observed = [], []
        for b in range(len(bin_edges) - 1):
            in_bin = (df['bin'] == b)
            if np.any(in_bin):
                means.append(df.loc[in_bin, 'prob'].mean())
                observed.append(df.loc[in_bin, 'target'].mean())
            else:
                means.append((bin_edges[b] + bin_edges[b+1]) / 2)
                observed.append(np.nan)
        means, observed = np.array(means), np.array(observed)
        valid_idx = ~np.isnan(observed)
        if np.any(valid_idx):
            plt.plot(means[valid_idx], observed[valid_idx], marker='o', color=colors[i % len(colors)],
                     label=model, lw=1, markersize=2, alpha=0.92)
            if show_thresholds:
                thresh_line = plt.axvline(x=threshold, color=colors[i % len(colors)],
                                          linestyle=':', alpha=0.7, lw=1)
                threshold_lines.append(thresh_line)
                threshold_labels.append(f"{model} thr: {threshold:.2f}")
    plt.plot([0, 1], [0, 1], "k--", lw=1, label="Perfect calibration")
    plt.xlabel('Mean predicted probability'); plt.ylabel('Fraction of positives')
    plt.title('Calibration curves (all models)'); plt.xlim([0, 1]); plt.ylim([0, 1])
    if show_thresholds and threshold_lines:
        main_legend = plt.legend(fontsize=6, loc='upper left', frameon=False, borderaxespad=0.1)
        threshold_legend = plt.legend(threshold_lines, threshold_labels, fontsize=5, loc='lower right',
                                      frameon=False, title="Thresholds", title_fontsize=6)
        plt.gca().add_artist(main_legend)
        plt.gca().add_artist(threshold_legend)
    else:
        plt.legend(fontsize=6, loc='upper left', frameon=False, borderaxespad=0.1)
    plt.grid(True, alpha=0.16, lw=0.4, linestyle='--'); plt.tick_params(labelsize=6, width=0.7, length=3)
    plt.tight_layout(pad=0.35); plt.savefig(save_path, dpi=450, bbox_inches='tight'); plt.close()


def plot_threshold_histograms(details: Dict[str, dict], thresholds: Dict[str, float]):

    print(f"🔎 Starting plotting function. Number of models received: {len(details)}")

    n_models = len(details)
    if n_models == 0:
        print("❌ Error: The 'details' dictionary is empty. No plots will be generated.")
        return

    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)
    axes = axes.flatten()

    for idx, (model, res) in enumerate(details.items()):
        if "y_prob" not in res:
            print(f"⚠️ Warning: '{model}' has no 'y_prob' key. Skipping.")
            continue

        y_prob = res["y_prob"]
        print(f"  - Plotting for '{model}':")


        y_prob = pd.to_numeric(np.asarray(y_prob).flatten(), errors='coerce')
        y_prob = y_prob[~np.isnan(y_prob)]


        if len(y_prob) == 0:
            print(f"    ❌ Error: 'y_prob' for '{model}' is empty after cleaning. Skipping.")
            continue

        print(f"    - Plotting with {len(y_prob)} valid probability values.")

        if len(np.unique(y_prob)) < 2:
            print(f"    ❌ Error: Data for '{model}' has only one unique value after cleaning. Cannot plot KDE. Skipping.")
            continue

        thr = thresholds.get(model, 0.5)
        ax = axes[idx]


        sns.histplot(x=y_prob, bins=20, kde=False, ax=ax, stat="density",
                     label="Probability Distribution", color="skyblue", alpha=0.6)
        sns.kdeplot(x=y_prob, ax=ax, fill=True,
                    label="KDE with 95% CI", color="navy", lw=2)
        ax.axvline(thr, color='r', linestyle='--', lw=2, label=f'Optimal Thr: {thr:.3f}')

        ax.set_title(f"Probability Distribution: {model}", fontsize=14)
        ax.set_xlabel("Predicted Probability", fontsize=12)
        ax.set_ylabel("Density", fontsize=12)
        ax.legend(fontsize=11)
        ax.tick_params(labelsize=10)

    plt.tight_layout()
    save_path = "Evaluation2/threshold_histograms_with_ci.png"
    plt.savefig(save_path, dpi=450, bbox_inches='tight')
    plt.close()
    print(f"✅ Figure saved to: {save_path}")


def evaluate_models_and_compile_results(models, X_test, y_test, thresholds):

    all_eval_results = {}

    for name, model_info in models.items():
        try:

            y_prob, num_feat, sel_names = get_model_predictions_comprehensive(model_info['artifact'], X_test)
            if y_prob is None:
                continue

            y_true = y_test
            thr = thresholds.get(name, 0.5)
            y_pred = (y_prob >= thr).astype(int)


            ci = {}
            metrics = {}


            metrics = calculate_all_metrics(y_true, y_pred, y_prob)


            ci = bootstrap_metrics_ci_for_table(y_true, y_pred, y_prob, n_bootstrap=1000)
            metrics.update(ci)


            all_eval_results[name] = {
                "Metrics": metrics,
                "y_prob": y_prob,
                "y_true": y_true,
                "y_pred": y_pred,
                "num_features": num_feat,
                "youden_threshold": thr,

                "roc_curve": roc_curve(y_true, y_prob)[:2],
                "pr_curve": precision_recall_curve(y_true, y_prob)[:2],
                "calibration_data": calibration_curve(y_true, y_prob, n_bins=10)
            }
            print(f"✅ Successfully processed {name}")

        except Exception as e:
            logging.error(f"Failed to process and compile metrics for {name}: {e}")
            continue


    if not all_eval_results:
        print("❌ No models were successfully processed")
        return {}, pd.DataFrame(), pd.DataFrame()

    full_metrics_list = []
    for name, data in all_eval_results.items():
        row = {'Model': name, 'N_Features': data['num_features']}
        row.update(data['Metrics'])
        full_metrics_list.append(row)


    full_metrics_df = pd.DataFrame(full_metrics_list)


    full_metrics_path = Path("Evaluation2/full_metrics_with_ci.csv")
    full_metrics_df.to_csv(full_metrics_path, index=False)
    print(f"💾 Full metrics saved → {full_metrics_path}")


    summary_metrics = ['AUPRC', 'AUC_ROC', 'F1', 'Accuracy', 'Balanced_Accuracy', 'MCC', 'Kappa', 'ECE', 'MCE']
    available_metrics = [m for m in summary_metrics if m in full_metrics_df.columns]

    summary_df = full_metrics_df[['Model', 'N_Features'] + available_metrics].set_index('Model').round(4)


    summary_path = Path("Evaluation2/summary_metrics_with_features.csv")
    summary_df.to_csv(summary_path)
    print(f"💾 Summary metrics saved → {summary_path}")

    return all_eval_results, summary_df, full_metrics_df


def evaluate_models(models, x_val, y_true, thresholds, out_dir="Evaluation2"):

    Path(out_dir).mkdir(exist_ok=True, parents=True)
    metrics_summary, full_metrics, model_probs, details = {}, [], {}, {}

    for name, obj in models.items():
        logging.info("Evaluating %s", name)
        y_prob, n_feat, sel = get_model_predictions_comprehensive(obj["artifact"], x_val)
        if y_prob is None:
            logging.error("Skipping %s due to prediction failure", name)
            continue


        youden_threshold = thresholds.get(name, 0.5)
        y_pred = (y_prob >= youden_threshold).astype(int)
        model_probs[name] = y_prob


        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = (cm.ravel() if cm.shape == (2,2) else (0,0,0,0))
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0


        auprc, auprc_lo, auprc_hi = bootstrap_metric_ci(y_true, y_prob, average_precision_score)
        auc, auc_lo, auc_hi = bootstrap_metric_ci(y_true, y_prob, roc_auc_score)
        f1, f1_lo, f1_hi = bootstrap_metric_ci(y_true, y_pred, f1_score)
        spec, spec_lo, spec_hi = bootstrap_metric_ci(y_true, y_pred, specificity_score)
        sens, sens_lo, sens_hi = bootstrap_metric_ci(y_true, y_pred, recall_score)
        prec, prec_lo, prec_hi = bootstrap_metric_ci(y_true, y_pred, precision_score)
        brier, b_lo, b_hi = bootstrap_metric_ci(y_true, y_prob, brier_score_loss)
        kappa, k_lo, k_hi = bootstrap_metric_ci(y_true, y_pred, cohen_kappa_score)
        acc, acc_lo, acc_hi = bootstrap_metric_ci(y_true, y_pred, accuracy_score)
        mcc, mcc_lo, mcc_hi = bootstrap_metric_ci(y_true, y_pred, matthews_corrcoef)
        bal_acc, ba_lo, ba_hi = bootstrap_metric_ci(y_true, y_pred, balanced_accuracy_score)
        logloss, ll_lo, ll_hi = bootstrap_metric_ci(y_true, y_prob, log_loss)
        ece, ece_lo, ece_hi = bootstrap_metric_ci(y_true, y_prob, calculate_ece, n_bins=10)
        mce, mce_lo, mce_hi = bootstrap_metric_ci(y_true, y_prob, calculate_mce, n_bins=10)


        slope_result, intercept_result = calibration_slope_intercept_ci(y_true, y_prob)


        slope, slope_lo, slope_hi = slope_result.value, slope_result.cilow, slope_result.cihigh
        intercept, intercept_lo, intercept_hi = intercept_result.value, intercept_result.cilow, intercept_result.cihigh

        z_stat, z_p = z_test_standard(y_true, y_prob)
        hl_stat, hl_p = hosmer_lemeshow_test_advanced(y_true, y_prob, n_bins=10)


        roc_fpr, roc_tpr, _ = roc_curve(y_true, y_prob)
        pr_curve_vals = precision_recall_curve(y_true, y_prob)[:2]
        cal_mean, cal_frac = calibration_curve(y_true, y_prob, n_bins=10, strategy="uniform")


        model_metrics = {
            "Model": name, "n_features_used": n_feat, "Youden_Threshold": youden_threshold,

            "TN": tn, "FP": fp, "FN": fn, "TP": tp, "NPV": npv, "PPV": ppv, "FPR": fpr, "FNR": fnr,


            "AUPRC": auprc, "AUPRC_CI_low": auprc_lo, "AUPRC_CI_high": auprc_hi,
            "AUC_ROC": auc, "AUC_ROC_CI_low": auc_lo, "AUC_ROC_CI_high": auc_hi,
            "F1": f1, "F1_CI_low": f1_lo, "F1_CI_high": f1_hi,
            "Specificity": spec, "Specificity_CI_low": spec_lo, "Specificity_CI_high": spec_hi,
            "Sensitivity": sens, "Sensitivity_CI_low": sens_lo, "Sensitivity_CI_high": sens_hi,
            "Precision": prec, "Precision_CI_low": prec_lo, "Precision_CI_high": prec_hi,
            "Brier": brier, "Brier_CI_low": b_lo, "Brier_CI_high": b_hi,
            "Kappa": kappa, "Kappa_CI_low": k_lo, "Kappa_CI_high": k_hi,
            "Accuracy": acc, "Accuracy_CI_low": acc_lo, "Accuracy_CI_high": acc_hi,
            "MCC": mcc, "MCC_CI_low": mcc_lo, "MCC_CI_high": mcc_hi,
            "Balanced_Accuracy": bal_acc, "Balanced_Accuracy_CI_low": ba_lo, "Balanced_Accuracy_CI_high": ba_hi,
            "Log_Loss": logloss, "Log_Loss_CI_low": ll_lo, "Log_Loss_CI_high": ll_hi,
            "ECE": ece, "ECE_CI_low": ece_lo, "ECE_CI_high": ece_hi,
            "MCE": mce, "MCE_CI_low": mce_lo, "MCE_CI_high": mce_hi,


            "calibration_intercept": intercept,
            "calibration_intercept_CI_low": intercept_lo,
            "calibration_intercept_CI_high": intercept_hi,

            "calibration_slope": slope,
            "calibration_slope_CI_low": slope_lo,
            "calibration_slope_CI_high": slope_hi,

            "z_statistic": sp_z, "z_p_value": sp_p,
            "hl_statistic": hl_stat, "hl_p_value": hl_p,
        }
        full_metrics.append(model_metrics)


        metrics_summary[name] = {
            "N_Features_Used": n_feat, "Youden_Threshold": youden_threshold,
            "AUPRC": auprc, "AUC_ROC": auc, "F1": f1, "Balanced_Accuracy": bal_acc,
            "Kappa": kappa, "Accuracy": acc, "MCC": mcc, "ECE": ece, "MCE": mce,

            "calibration_intercept": intercept,
            "calibration_slope": slope,
        }


    full_metrics_df = pd.DataFrame(full_metrics)
    full_metrics_path = Path(out_dir, "full_metrics_with_ci.csv")
    full_metrics_df.to_csv(full_metrics_path, index=False)
    logging.info("Full metrics saved → %s", full_metrics_path)

    summary_df = pd.DataFrame(metrics_summary).T
    summary_path = Path(out_dir, "summary_metrics_with_features.csv")
    summary_df.to_csv(summary_path)
    logging.info("Summary metrics saved → %s", summary_path)



    if model_probs:
        print("\n🔧 Performing statistical comparisons with multiple testing corrections...")

        effect_size_results = pairwise_auprc_comparison_with_effect_size(
            y_true, model_probs, out=Path(out_dir, "auprc_pairwise_with_effect_size.csv")
        )
        mcnemar_effect_results = pairwise_mcnemar_with_effect_size(
            models, x_val, y_true, thresholds, out=Path(out_dir, "mcnemar_with_effect_size.csv")
        )

        if not effect_size_results.empty:
            print_correction_summary(effect_size_results)
            plot_significance_comparison(effect_size_results, ['AUPRC_p_value'], out_dir)

        if not mcnemar_effect_results.empty:
            print_correction_summary(mcnemar_effect_results)
            plot_significance_comparison(mcnemar_effect_results, ['McNemar_p_value'], out_dir)

        create_effect_size_summary(effect_size_results, mcnemar_effect_results, out_dir)


    return details, summary_df, full_metrics_df



def main():
    print("🚀 Starting Enhanced Model Evaluation with Feature Analysis")


    X_val, y_val = load_validation_data()
    if X_val is None or y_val is None:
        print("❌ Validation data loading failed - aborting.")
        return

    model_search_path = Path("trained_models")
    if Path("nested_cv_output").exists():
        model_search_path = Path("nested_cv_output") / "logloss" / "models"

    models_dict = load_calibrated_models(model_dir=str(model_search_path))
    if not models_dict:
        print("❌ No models loaded - aborting.")
        return

    print("\nLoaded models for evaluation:")
    for model_name in models_dict.keys():
        print(f"  - {model_name}")


    try:
        opt_thresh = load_optimal_thresholds()
        print("\nApplying thresholds to models:")
        final_thresholds = {}
        for name in models_dict.keys():

            if name in opt_thresh:
                final_thresholds[name] = opt_thresh[name]
                print(f"  ✓ {name}: {opt_thresh[name]:.4f} (direct match)")
                continue

            matched = False
            for threshold_name, thresh_value in opt_thresh.items():
                if name.lower() == threshold_name.lower():
                    final_thresholds[name] = thresh_value
                    print(f"  ≈ {name} matched to '{threshold_name}': {thresh_value:.4f}")
                    matched = True
                    break
            if not matched:
                final_thresholds[name] = 0.5
                print(f"⚠️  No threshold found for '{name}', using default 0.5")

    except Exception as exc:
        logging.warning("Threshold loading failed: %s - using 0.5 for all models", exc)
        final_thresholds = {name: 0.5 for name in models_dict.keys()}


    print("\n🔬 Calculating comprehensive metrics for each model...")
    details, summary_df, full_metrics_df = evaluate_models_and_compile_results(
        models_dict, X_val, y_val, final_thresholds
    )

    if not details:
        print("❌ No successful evaluations - aborting.")
        return


    print("\n📊 Performing pairwise statistical comparisons (McNemar and AUPRC)...")


    model_probs = {m: d.get('y_prob') for m, d in details.items() if 'y_prob' in d}


    auprc_df_corrected = pairwise_auprc_comparison_with_effect_size(
        y_val, model_probs, out="Evaluation2/auprc_pairwise_with_effect_size.csv"
    )
    mcnemar_df_corrected = pairwise_mcnemar_with_effect_size(
        models_dict, X_val, y_val, final_thresholds, out="Evaluation2/mcnemar_with_effect_size.csv"
    )


    comparison_dfs = [df for df in [auprc_df_corrected, mcnemar_df_corrected] if df is not None and not df.empty]
    if comparison_dfs:
        pval_cols = ['AUPRC_p_value', 'McNemar_p_value']

        print_correction_summary(comparison_dfs[0], original_alpha=0.05)
        plot_significance_comparison(comparison_dfs[0].rename(columns={'AUPRC_p_value': 'AUPRC', 'McNemar_p_value': 'McNemar'}),
                                     pval_cols=[c for c in pval_cols if c in comparison_dfs[0].columns])


        create_effect_size_summary(auprc_df_corrected, mcnemar_df_corrected, out_dir="Evaluation2")




    print("\n🔍 Verifying thresholds stored in details:")
    for model_name, model_data in details.items():
        stored_threshold = model_data.get('youden_threshold', 'NOT FOUND')
        print(f"  {model_name}: {stored_threshold:.4f}" if isinstance(stored_threshold, float) else f"  {model_name}: {stored_threshold}")

    print("🎨 Generating comprehensive visualizations...")


    core_metrics = ["AUPRC", "AUC_ROC", "F1", "Balanced_Accuracy",
                    "Kappa", "Accuracy", "MCC", "ECE", "MCE"]
    available_metrics = [m for m in summary_df.columns if m in core_metrics]

    if available_metrics:
        plot_heatmap(summary_df[available_metrics], "Model Performance Heatmap")
        radar_metrics = ["AUPRC", "AUC_ROC", "F1", "Accuracy", "Kappa", "MCC"]
        radar_cols = [m for m in radar_metrics if m in summary_df.columns]
        if radar_cols and len(radar_cols) >= 3:
            plot_radar_chart(summary_df[radar_cols], "Radar Chart - Model Performance")

    plot_roc_curves(details)
    plot_pr_curves(details)
    plot_calibration_curves(details)
    plot_overlaid_calibration_curves(details, show_thresholds=True)
    plot_threshold_histograms(details, final_thresholds)

    print("\n✅ Evaluation complete - Results saved in 'Evaluation2/' directory.")

if __name__ == "__main__":
    main()

✅ Environment setup complete!
🚀 Starting Enhanced Model Evaluation with Feature Analysis

Loaded models for evaluation:
  - CatBoost
  - LightGBM
  - LogisticRegression
  - RandomForest
  - SVM
  - XGBoost
DEBUG: Loading thresholds from column 'threshold_youdenj'

Applying thresholds to models:
  ✓ CatBoost: 0.5100 (direct match)
  ✓ LightGBM: 0.5700 (direct match)
  ✓ LogisticRegression: 0.5000 (direct match)
  ✓ RandomForest: 0.3800 (direct match)
  ✓ SVM: 0.3900 (direct match)
  ✓ XGBoost: 0.6100 (direct match)

🔬 Calculating comprehensive metrics for each model...
✅ Successfully processed CatBoost
✅ Successfully processed LightGBM
✅ Successfully processed LogisticRegression
✅ Successfully processed RandomForest
✅ Successfully processed SVM
✅ Successfully processed XGBoost
💾 Full metrics saved → Evaluation2/full_metrics_with_ci.csv
💾 Summary metrics saved → Evaluation2/summary_metrics_with_features.csv

📊 Performing pairwise statistical comparisons (McNemar and AUPRC)...

📊 Multiple

**7. Best Model Selection**  
Select the best-performing model using composite metrics (e.g., AUC, Brier score, calibration).

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path


input_path = Path("Evaluation2/full_metrics_with_ci.csv")
if not input_path.exists():
    print(f"❌ Error: {input_path} not found.")
else:
    metrics_df = pd.read_csv(input_path)

    if 'Unnamed: 0' in metrics_df.columns:
        metrics_df = metrics_df.rename(columns={'Unnamed: 0': 'Model'})


    column_aliases = {
        'Spiegelhalter_p': 'Spiegel_p',
        'Spiegelhalter_P_Value': 'Spiegel_p',
        'HL_p_value': 'HL_p',
        'HL_P_Value': 'HL_p',
        'Recall': 'Sensitivity',
        'AUC': 'AUC_ROC',
        'AUROC': 'AUC_ROC',
        'Brier_Score': 'Brier'
    }

    new_columns = []
    for col in metrics_df.columns:
        renamed = col


        sorted_aliases = sorted(column_aliases.keys(), key=len, reverse=True)

        for old_name in sorted_aliases:
            new_name = column_aliases[old_name]


            if col == new_name or col.startswith(new_name + "_") or col.startswith(new_name + " "):
                renamed = col
                break

            if col == old_name:
                renamed = new_name
                break


            if col.startswith(old_name + "_") or col.startswith(old_name + " "):
                 renamed = col.replace(old_name, new_name, 1)
                 break

        new_columns.append(renamed)

    metrics_df.columns = new_columns


    print("Columns after renaming:", metrics_df.columns.tolist())

    weights = {
        'AUPRC': 0.20,
        'AUC_ROC': 0.20,
        'F1': 0.15,
        'Sensitivity': 0.10,
        'Balanced_Accuracy': 0.10,
        'MCC': 0.05,
        'Brier': 0.05,
        'ECE': 0.05,
        'Spiegel_p': 0.05,
        'HL_p': 0.05
    }


    valid_weights = {k: v for k, v in weights.items() if k in metrics_df.columns}
    metric_names = list(valid_weights.keys())

    print(f"✅ Valid metrics found: {metric_names}")
    missing = set(weights.keys()) - set(valid_weights.keys())
    if missing:
        print(f"⚠️ Warning: The following metrics were not found: {missing}")


    normalized = metrics_df[metric_names].copy()


    for metric in ['ECE', 'Brier', 'LogLoss', 'MCE']:
        if metric in normalized.columns:
            normalized[metric] = 1 - MinMaxScaler().fit_transform(normalized[[metric]])


    for metric in ['AUPRC', 'AUC_ROC', 'F1', 'Accuracy', 'Sensitivity',
                   'Balanced_Accuracy', 'MCC', 'Spiegel_p', 'HL_p']:
        if metric in normalized.columns:
            normalized[metric] = MinMaxScaler().fit_transform(normalized[[metric]])


    total_weight = sum(valid_weights.values())
    final_weights = {k: v / total_weight for k, v in valid_weights.items()}

    metrics_df['CompositeScore'] = 0
    for metric, weight in final_weights.items():
        metrics_df['CompositeScore'] += normalized[metric] * weight


    output_columns = ['Model', 'CompositeScore']


    ci_suffix_pairs = [
        ('_CI_Low', '_CI_High'),
        ('_CI_low', '_CI_high'),
        ('_low', '_high'),
        ('_CI_Lower', '_CI_Upper'),
        ('_lower', '_upper')
    ]

    for metric in metric_names:
        output_columns.append(metric)


        found_ci = False
        for low_suff, high_suff in ci_suffix_pairs:
            low_col = f"{metric}{low_suff}"
            high_col = f"{metric}{high_suff}"

            if low_col in metrics_df.columns and high_col in metrics_df.columns:
                output_columns.extend([low_col, high_col])
                found_ci = True
                break



    best_models = metrics_df[output_columns].sort_values('CompositeScore', ascending=False)


    display_df = best_models.copy()

    for metric in metric_names:

        cols = [c for c in best_models.columns if c.startswith(metric)]

        ci_cols = [c for c in cols if c != metric]
        if len(ci_cols) == 2:
            c1, c2 = ci_cols

            if 'high' in c1.lower() or 'upper' in c1.lower():
                high_col, low_col = c1, c2
            else:
                low_col, high_col = c1, c2

            display_df[f"{metric} (95% CI)"] = display_df.apply(
                lambda x: f"{x[metric]:.3f} ({x[low_col]:.3f}-{x[high_col]:.3f})",
                axis=1
            )

            display_df = display_df.drop(columns=[low_col, high_col, metric])


    best_models.to_csv("Evaluation2/best_models_ranked_with_CI.csv", index=False)
    display_df.to_csv("Evaluation2/best_models_ranked_display.csv", index=False)

    print("\nTop Models (Composite Score):")
    cols_to_show = ['Model', 'CompositeScore'] + [c for c in display_df.columns if c not in ['Model', 'CompositeScore']]
    print(display_df[cols_to_show].head().to_string(index=False))

Columns after renaming: ['Model', 'N_Features', 'AUC_ROC', 'AUPRC', 'Accuracy', 'Balanced_Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1', 'MCC', 'Kappa', 'LogLoss', 'Brier', 'ECE', 'MCE', 'Cal_Intercept', 'Cal_Slope', 'Spiegelhalter_Z', 'Spiegel_p', 'HL_Statistic', 'HL_p', 'TN', 'FP', 'FN', 'TP', 'NPV', 'PPV', 'FPR', 'FNR', 'AUC_ROC_CI_Low', 'AUC_ROC_CI_High', 'AUPRC_CI_Low', 'AUPRC_CI_High', 'F1_CI_Low', 'F1_CI_High', 'Accuracy_CI_Low', 'Accuracy_CI_High', 'Precision_CI_Low', 'Precision_CI_High', 'Sensitivity_CI_Low', 'Sensitivity_CI_High', 'Specificity_CI_Low', 'Specificity_CI_High', 'MCC_CI_Low', 'MCC_CI_High', 'Kappa_CI_Low', 'Kappa_CI_High', 'Balanced_Accuracy_CI_Low', 'Balanced_Accuracy_CI_High', 'Brier_CI_Low', 'Brier_CI_High', 'LogLoss_CI_Low', 'LogLoss_CI_High', 'ECE_CI_Low', 'ECE_CI_High', 'MCE_CI_Low', 'MCE_CI_High', 'Cal_Intercept_CI_Low', 'Cal_Intercept_CI_High', 'Cal_Slope_CI_Low', 'Cal_Slope_CI_High']
✅ Valid metrics found: ['AUPRC', 'AUC_ROC', 'F1', 'Sensiti

**8. Visual Model Comparison (Composite Score, ROC, PR, Calibration)**              
Generate visual comparisons across models including ROC, PR, calibration curves, and composite scores.

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path


plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 8,
    'axes.titlesize': 9,
    'axes.labelsize': 8,
    'xtick.labelsize': 7,
    'ytick.labelsize': 7,
    'legend.fontsize': 7,
    'figure.dpi': 600,
    'axes.linewidth': 0.8,
    'lines.linewidth': 1.2,
})


input_path = Path("Evaluation2/best_models_ranked_with_CI.csv")
if not input_path.exists():

    input_path = Path("Evaluation2/full_metrics_with_ci.csv")

df = pd.read_csv(input_path)


if 'Unnamed: 0' in df.columns:
    df = df.rename(columns={'Unnamed: 0': 'Model'})

if 'CompositeScore' not in df.columns:
    if 'AUPRC' in df.columns:
        df = df.sort_values('AUPRC', ascending=False)
        df['CompositeScore'] = df['AUPRC']
    else:
        df['CompositeScore'] = 0


fig = plt.figure(figsize=(12, 8))
gs = fig.add_gridspec(2, 2, height_ratios=[1, 1.4], wspace=0.25, hspace=0.35)


ax1 = fig.add_subplot(gs[0, 0])
sns.barplot(x='CompositeScore', y='Model', data=df, palette='viridis', edgecolor='black', linewidth=0.7, ax=ax1)
ax1.set_title('A) Overall Model Ranking (Composite Score)', fontsize=10, weight='bold', loc='left')
ax1.set_xlabel('Composite Score (0-1)', fontsize=8)
ax1.set_ylabel('')
ax1.grid(axis='x', linestyle='--', alpha=0.3)


for i, v in enumerate(df['CompositeScore']):
    ax1.text(v + 0.01, i, f"{v:.3f}", va='center', fontsize=7)


ax2 = fig.add_subplot(gs[0, 1])


metric_aliases = {
    'AUPRC': ['AUPRC'],
    'Sensitivity': ['Sensitivity', 'Recall'],
    'F1': ['F1', 'F1_Score'],
    'Brier': ['Brier', 'Brier_Score']
}

colors = sns.color_palette("deep", len(metric_aliases))
y_positions = np.arange(len(df))
offset_step = 0.15
start_offset = -((len(metric_aliases)-1) * offset_step) / 2

for idx, (display_name, aliases) in enumerate(metric_aliases.items()):

    col_name = next((a for a in aliases if a in df.columns), None)

    if col_name:

        low_col = next((c for c in df.columns if c.startswith(col_name) and ('low' in c or 'Low' in c)), None)
        high_col = next((c for c in df.columns if c.startswith(col_name) and ('high' in c or 'High' in c)), None)

        y_err = None
        if low_col and high_col:
            y_err = [df[col_name] - df[low_col], df[high_col] - df[col_name]]

        ax2.errorbar(
            x=df[col_name],
            y=y_positions + start_offset + (idx * offset_step),
            xerr=y_err,
            fmt='o',
            color=colors[idx],
            capsize=3,
            label=display_name,
            alpha=0.9
        )

ax2.set_yticks(y_positions)
ax2.set_yticklabels(df['Model'])
ax2.set_title('B) Key Performance Metrics (95% CI)', fontsize=10, weight='bold', loc='left')
ax2.set_xlabel('Metric Value', fontsize=8)
ax2.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), frameon=False, ncol=4, fontsize=8)
ax2.grid(axis='x', color='gray', alpha=0.2, linestyle='--')
ax2.invert_yaxis()


ax3 = fig.add_subplot(gs[1, :])


heatmap_metrics_map = {
    'AUPRC': 'AUPRC',
    'AUC_ROC': 'AUC',
    'F1': 'F1',
    'ECE': 'ECE',
    'Cal_Slope': 'Slope',
    'Cal_Intercept': 'Intercept',
    'Spiegelhalter_P_Value': 'Spiegel P',
    'Brier': 'Brier'
}


available_metrics = {}
for k, label in heatmap_metrics_map.items():

    if k in df.columns:
        available_metrics[k] = label

    elif k == 'Cal_Slope' and 'Slope' in df.columns: available_metrics['Slope'] = label
    elif k == 'Cal_Intercept' and 'Intercept' in df.columns: available_metrics['Intercept'] = label
    elif k == 'Spiegelhalter_P_Value' and 'Spiegel_p' in df.columns: available_metrics['Spiegel_p'] = label
    elif k == 'Brier' and 'Brier_Score' in df.columns: available_metrics['Brier_Score'] = label

heatmap_data = df.set_index('Model')[list(available_metrics.keys())]
heatmap_labels = heatmap_data.rename(columns=available_metrics)


heatmap_norm = heatmap_data.copy()
scaler = MinMaxScaler()

for col in heatmap_norm.columns:

    if any(x in col for x in ['ECE', 'Brier', 'LogLoss', 'MCE']):
        heatmap_norm[col] = 1 - scaler.fit_transform(heatmap_norm[[col]])

    elif 'Slope' in col:

        dist = abs(heatmap_norm[col] - 1)
        heatmap_norm[col] = 1 - scaler.fit_transform(dist.values.reshape(-1,1))

    elif 'Intercept' in col:
        dist = abs(heatmap_norm[col])
        heatmap_norm[col] = 1 - scaler.fit_transform(dist.values.reshape(-1,1))

    else:
        heatmap_norm[col] = scaler.fit_transform(heatmap_norm[[col]])

sns.heatmap(
    heatmap_norm,
    annot=heatmap_labels.round(3),
    fmt='',
    cmap="RdYlBu",
    cbar_kws={'label': 'Relative Performance (Normalized)'},
    linewidths=1,
    linecolor='white',
    ax=ax3,
    annot_kws={"size": 8}
)

ax3.set_title('C) Detailed Performance Heatmap (Blue = Better)', fontsize=10, weight='bold', loc='left')
ax3.set_ylabel('')
ax3.set_xlabel('')
ax3.set_xticklabels(list(available_metrics.values()), rotation=0)
ax3.tick_params(axis='both', length=0)


plt.tight_layout(pad=1.5)
save_png = "Evaluation2/model_comparison_visualization.png"
save_pdf = "Evaluation2/model_comparison_visualization.pdf"
plt.savefig(save_png, dpi=600, bbox_inches='tight')
plt.savefig(save_pdf, bbox_inches='tight')
plt.close(fig)

print(f"✅ Visualization saved to: {save_png}")

✅ Visualization saved to: Evaluation2/model_comparison_visualization.png


**9. Model Persistence (Save Best Model**)           
Save the final best-performing model for reproducibility and future use.


In [15]:
import os
import warnings
import logging
import traceback
import joblib
import json
import csv
import argparse
import sys
from pathlib import Path
from itertools import combinations
from typing import Dict, List, Tuple, Optional, Callable
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib


from sklearn.metrics import (
    accuracy_score, average_precision_score, balanced_accuracy_score,
    brier_score_loss, cohen_kappa_score, confusion_matrix, f1_score,
    matthews_corrcoef, precision_score, recall_score, roc_auc_score,
    roc_curve, precision_recall_curve, log_loss
)
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from scipy.special import logit
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests
from mlxtend.evaluate import mcnemar_table


logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


NCV_BASE_OUT_DIR = "nested_cv_output"
NCV_METRIC_DIR = "logloss"
NCV_MODEL_DIR = "models"



def validate_file_path(file_path: str | Path, description: str) -> Path:

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"❌ {description} not found at: {path.absolute()}")
    return path

def extract_model_components(artifact: dict, model_name: str) -> Tuple[Dict, Dict]:

    components = {}


    components["calibrated_model"] = artifact.get("calibrated_model") or artifact.get("calibrated")
    if components["calibrated_model"] is None:
        logging.warning(f"No calibrated model found for {model_name}")


    components["base_model"] = artifact.get("base_model") or artifact.get("model")
    if components["base_model"] is None:
        logging.warning(f"No base model found for {model_name}")

    components["preprocessor"] = artifact.get("preprocessor")
    components["feature_selector"] = artifact.get("selector") or artifact.get("feature_selector")

    metadata = artifact.get("metadata", {})
    components["selected_features"] = metadata.get("selected_features")

    return components, metadata

def robust_load_artifact_builder(model_artifacts_path: Path):

    def robust_load_artifact(model_name: str) -> dict:
        base_filename = f"{model_name.replace(' ', '_')}_final_calibrated_model.pkl"
        artifact_path = model_artifacts_path / base_filename

        if not artifact_path.exists():
             raise FileNotFoundError(f"Model artifact not found for {model_name} at {artifact_path}")

        return joblib.load(artifact_path)
    return robust_load_artifact

def normalize_model_name_for_matching(name):

    name = str(name).strip().lower()

    name = name.replace(' ', '').replace('-', '').replace('_', '')
    if 'catboost' in name: return 'catboost'
    if 'lightgbm' in name: return 'lightgbm'
    if 'logistic' in name: return 'logisticregression'
    if 'randomforest' in name: return 'randomforest'
    if 'svm' in name: return 'svm'
    if 'xgboost' in name: return 'xgboost'
    return name

def get_model_threshold(thresholds_df, model_name, threshold_col, default=0.5):

    normalized_target = normalize_model_name_for_matching(model_name)


    print(f"🔍 DEBUG: Looking for model '{model_name}' (normalized: '{normalized_target}')")
    print(f"🔍 DEBUG: Available columns in thresholds file: {thresholds_df.columns.tolist()}")

    model_col = None
    for possible_col in ['Model', 'model', 'Algorithm', 'algorithm', 'name']:
        if possible_col in thresholds_df.columns:
            model_col = possible_col
            break

    if model_col is None:
        first_col = thresholds_df.columns[0]
        print(f"⚠️ No standard model column found. Using first column: '{first_col}'")
        model_col = first_col

    print(f"🔍 DEBUG: Using model column: '{model_col}'")
    print(f"🔍 DEBUG: Available models in thresholds: {thresholds_df[model_col].tolist()}")


    for idx, row in thresholds_df.iterrows():
        current_model_name = str(row[model_col])
        if normalize_model_name_for_matching(current_model_name) == normalized_target:
            threshold_value = row[threshold_col]
            print(f"✅ Found threshold for {model_name}: {threshold_value}")
            return float(threshold_value)

    print(f"⚠️ No threshold found for {model_name}, using default {default}")
    return default


def save_best_and_lr_models_with_youdenj_and_group_thresholds(ncv_base_dir: str = NCV_BASE_OUT_DIR, ncv_metric_dir: str = NCV_METRIC_DIR):

    base_output_path = Path(ncv_base_dir) / ncv_metric_dir
    model_artifacts_path = base_output_path / NCV_MODEL_DIR
    evaluation_path = Path("Evaluation2")
    output_dir = Path("deployment_artifacts"); output_dir.mkdir(exist_ok=True, parents=True)

    robust_load_artifact = robust_load_artifact_builder(model_artifacts_path)

    try:
        print("🚀 Starting model bundling process...")

        metrics_path = validate_file_path(evaluation_path / "full_metrics_with_ci.csv", "Model metrics file")
        thresholds_path = validate_file_path("Threshold_results/reports/all_models_global_results.csv", "Threshold results file")

        metrics_df = pd.read_csv(metrics_path)

        if 'Unnamed: 0' in metrics_df.columns:
            metrics_df = metrics_df.rename(columns={'Unnamed: 0': 'Model'}).set_index("Model")
        elif 'Model' in metrics_df.columns:
            metrics_df = metrics_df.set_index("Model")
        else:
            first_col = metrics_df.columns[0]
            metrics_df = metrics_df.set_index(first_col)

        thresholds_df = pd.read_csv(thresholds_path)


        if 'CompositeScore' in metrics_df.columns: primary_metric = 'CompositeScore'
        elif 'AUPRC' in metrics_df.columns: primary_metric = 'AUPRC'
        else: primary_metric = metrics_df.columns[0]

        best_model_name = metrics_df[primary_metric].idxmax()
        lr_model_name = 'LogisticRegression'
        print(f"📊 Best model identified: {best_model_name} ({primary_metric}: {metrics_df.loc[best_model_name, primary_metric]:.4f})")


        threshold_col = None

        possible_cols = ['threshold_youdenj', 'youden_threshold', 'threshold_optimal', 'Threshold', 'threshold']
        for col in possible_cols:
            if col in thresholds_df.columns:
                threshold_col = col
                break

        if not threshold_col:
            print("⚠️ No threshold column found, using default 0.5 for all models")
            best_model_threshold = 0.5
            lr_threshold = 0.5
        else:
            print(f"🔍 Using threshold column: '{threshold_col}'")
            best_model_threshold = get_model_threshold(thresholds_df, best_model_name, threshold_col)
            lr_threshold = get_model_threshold(thresholds_df, lr_model_name, threshold_col)

        saved_paths = []


        print(f"\n🔄 Processing best model: {best_model_name}")
        best_model_artifact = robust_load_artifact(best_model_name)
        best_components, best_metadata = extract_model_components(best_model_artifact, best_model_name)

        best_bundle = {
            "model": best_components["calibrated_model"],
            "calibrated_model": best_components["calibrated_model"],
            "base_model": best_components["base_model"],
            "preprocessor": best_components["preprocessor"],
            "feature_selector": best_components["feature_selector"],
            "selected_features": best_components["selected_features"],
            "metadata": {
                "model_type": "best_model",
                "model_name": best_model_name,
                "created_at": datetime.now().isoformat(),
                "version": "1.0.0",
                "primary_metric": primary_metric,
                "primary_metric_value": float(metrics_df.loc[best_model_name, primary_metric]),
                "selected_features": best_components["selected_features"],
                "n_features": len(best_components["selected_features"]) if best_components["selected_features"] else None,
                "has_calibrated_model": best_components["calibrated_model"] is not None,
                "has_base_model": best_components["base_model"] is not None,
                "has_feature_selector": best_components["feature_selector"] is not None
            },
            "youden_j_threshold": float(best_model_threshold)
        }


        group_thresh_file = Path("Threshold_results/reports") / f"{best_model_name}_subgroups_optimal_thresholds_summary.csv"
        if group_thresh_file.exists():
            print(f"✅ Found group-specific thresholds file: {group_thresh_file}")
            group_df = pd.read_csv(group_thresh_file)


            sg_thresh_col = None
            for col in ['threshold_youdenj', 'opt_youden_threshold', 'threshold_optimal', 'threshold', 'opt_f1_threshold']:
                if col in group_df.columns:
                    sg_thresh_col = col
                    break

            if sg_thresh_col:
                print(f"   Using subgroup threshold column: '{sg_thresh_col}'")
                group_thresholds = {
                    row['group']: {"threshold": float(row[sg_thresh_col])}
                    for _, row in group_df.iterrows()
                }
                best_bundle["group_specific_thresholds"] = group_thresholds
                print(f"   Added {len(group_thresholds)} group-specific thresholds")
            else:
                 print(f"⚠️ Could not find a valid threshold column in {group_thresh_file.name}. Available: {group_df.columns.tolist()}")
        else:
            print(f"⚠️ No group-specific thresholds found for {best_model_name}")

        best_out_path = output_dir / f"{best_model_name}_with_thresholds_v1.0.0.pkl"
        joblib.dump(best_bundle, best_out_path)
        saved_paths.append(best_out_path)
        print(f"✅ Saved best model bundle: {best_out_path}")


        print(f"\n🔄 Processing logistic regression model: {lr_model_name}")
        lr_artifact = robust_load_artifact(lr_model_name)
        lr_components, lr_metadata = extract_model_components(lr_artifact, lr_model_name)

        lr_bundle = {
            "model": lr_components["calibrated_model"],
            "calibrated_model": lr_components["calibrated_model"],
            "base_model": lr_components["base_model"],
            "preprocessor": lr_components["preprocessor"],
            "feature_selector": lr_components["feature_selector"],
            "selected_features": lr_components["selected_features"],
            "metadata": {
                "model_type": "logistic_regression",
                "model_name": lr_model_name,
                "created_at": datetime.now().isoformat(),
                "version": "1.0.0",
                "primary_metric": primary_metric,
                "primary_metric_value": float(metrics_df.loc[lr_model_name, primary_metric]),
                "selected_features": lr_components["selected_features"],
                "n_features": len(lr_components["selected_features"]) if lr_components["selected_features"] else None,
                "has_calibrated_model": lr_components["calibrated_model"] is not None,
                "has_base_model": lr_components["base_model"] is not None,
                "has_feature_selector": lr_components["feature_selector"] is not None
            },
            "youden_j_threshold": float(lr_threshold)
        }

        lr_out_path = output_dir / f"{lr_model_name}_with_thresholds_v1.0.0.pkl"
        joblib.dump(lr_bundle, lr_out_path)
        saved_paths.append(lr_out_path)
        print(f"✅ Saved LR model bundle: {lr_out_path}")


        print(f"\n💾 Saving metadata files...")
        for bundle, name, model_name in zip(
            [best_bundle, lr_bundle],
            ["best_model", "logistic_regression"],
            [best_model_name, lr_model_name]
        ):
            meta_path = output_dir / f"{name}_metadata.json"
            metadata_json = {
                **bundle["metadata"],
                "youden_j_threshold": bundle["youden_j_threshold"],
                "model_components": {
                    "has_calibrated_model": bundle["calibrated_model"] is not None,
                    "has_base_model": bundle["base_model"] is not None,
                    "has_preprocessor": bundle["preprocessor"] is not None,
                    "has_feature_selector": bundle["feature_selector"] is not None
                }
            }
            if "group_specific_thresholds" in bundle:
                metadata_json["group_specific_thresholds"] = bundle["group_specific_thresholds"]

            with open(meta_path, 'w') as f:
                json.dump(metadata_json, f, indent=2)
            print(f"✅ Saved metadata: {meta_path}")

        print(f"\n🎉 MODEL BUNDLING COMPLETE!")
        print(f"📁 Artifacts saved to: {output_dir.absolute()}")
        return saved_paths

    except Exception as e:
        print(f"\n❌❌❌ CRITICAL ERROR: Model bundling failed ❌❌❌")
        print(f"Error: {str(e)}")
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    save_best_and_lr_models_with_youdenj_and_group_thresholds(
        ncv_base_dir="nested_cv_output",
        ncv_metric_dir="logloss"
    )



🚀 Starting model bundling process...
📊 Best model identified: CatBoost (AUPRC: 0.8661)
🔍 Using threshold column: 'threshold_youdenj'
🔍 DEBUG: Looking for model 'CatBoost' (normalized: 'catboost')
🔍 DEBUG: Available columns in thresholds file: ['model', 'n_features', 'threshold_f1', 'threshold_youdenj', 'threshold_balancedacc', 'youden_j', 'f1', 'f1_ci', 'accuracy', 'accuracy_ci', 'balanced_accuracy', 'sensitivity', 'sensitivity_ci', 'specificity', 'specificity_ci', 'selected_features_preview']
🔍 DEBUG: Using model column: 'model'
🔍 DEBUG: Available models in thresholds: ['LogisticRegression', 'SVM', 'XGBoost', 'LightGBM', 'CatBoost', 'RandomForest']
✅ Found threshold for CatBoost: 0.51
🔍 DEBUG: Looking for model 'LogisticRegression' (normalized: 'logisticregression')
🔍 DEBUG: Available columns in thresholds file: ['model', 'n_features', 'threshold_f1', 'threshold_youdenj', 'threshold_balancedacc', 'youden_j', 'f1', 'f1_ci', 'accuracy', 'accuracy_ci', 'balanced_accuracy', 'sensitivity',

**10. Final Evaluation (Separated Test Set)**  
Assess the generalizability of the selected model using the independent test set.

In [16]:
import os
import sys
import logging
import warnings
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from pathlib import Path
from scipy.stats import norm, chi2
from scipy.special import logit
from functools import wraps
from typing import Dict, List, Tuple, Optional, Any, Union

from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    precision_score, recall_score, confusion_matrix, roc_curve,
    precision_recall_curve, brier_score_loss, balanced_accuracy_score,
    matthews_corrcoef, cohen_kappa_score, log_loss, accuracy_score
)
from sklearn.utils import resample
from joblib import Parallel, delayed
from tqdm.auto import tqdm


sns.set(style="whitegrid")
plt.rcParams['font.family'] = 'DejaVu Sans'
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)



def validate_input_data(y_true, y_prob):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)


    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true = y_true[mask]
    y_prob = y_prob[mask]

    return y_true, y_prob

def validation_required(func):
    @wraps(func)
    def wrapper(y_true, y_prob, *args, **kwargs):
        y_true, y_prob = validate_input_data(y_true, y_prob)
        if len(y_true) < 2 or len(np.unique(y_true)) < 2:
            return np.nan
        return func(y_true, y_prob, *args, **kwargs)
    return wrapper

@validation_required
def calculate_calibration_slope_intercept(y_true, y_prob):
    try:
        epsilon = 1e-7
        p = np.clip(y_prob, epsilon, 1 - epsilon)
        logit_p = logit(p)
        X = sm.add_constant(logit_p)
        model = sm.Logit(y_true, X).fit(disp=False)
        return model.params[1], model.params[0]
    except Exception:
        return np.nan, np.nan



def calculate_spiegelhalter_z(y_true, y_prob):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10:
        return np.nan, np.nan

    numerator = np.sum(y_true - y_prob)

    denominator = np.sqrt(np.sum(y_prob * (1 - y_prob)))

    if denominator < 1e-8:
        return np.nan, np.nan

    z_stat = numerator / denominator

    p_value = 2 * (1 - norm.cdf(abs(z_stat)))

    return float(z_stat), float(p_value)

def hosmer_lemeshow_test(y_true, y_prob, n_bins=10, min_expected_freq=5):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 20 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan

    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})


    try:
        df['bin'] = pd.qcut(df['y_prob'], n_bins, labels=False, duplicates='drop')
    except ValueError:

        df['bin'] = np.floor(df['y_prob'] * n_bins).astype(int)
        df.loc[df['bin'] == n_bins, 'bin'] = n_bins - 1


    summary = df.groupby('bin').agg(
        observed=('y_true', 'sum'),
        expected=('y_prob', 'sum'),
        n_total=('y_true', 'size')
    ).reset_index()


    while True:

        sparse_bins = summary[summary['expected'] < min_expected_freq]

        if sparse_bins.empty or len(summary) <= 2:
            break

        merge_idx = sparse_bins.index[0]

        if merge_idx == 0:

            target_idx = 1
            source_idx = 0
        else:

            target_idx = merge_idx - 1
            source_idx = merge_idx


        summary.loc[target_idx, ['observed', 'expected', 'n_total']] += \
            summary.loc[source_idx, ['observed', 'expected', 'n_total']]

        summary = summary.drop(source_idx).reset_index(drop=True)


    g = len(summary)
    if g <= 2:
        return np.nan, np.nan


    numerator = (summary['observed'] - summary['expected'])**2
    denominator = summary['expected'] * (1 - summary['expected'] / summary['n_total'])


    denominator[denominator < 1e-8] = 1e-8

    hl_statistic = (numerator / denominator).sum()

    df_hl = g - 2
    p_value = 1 - chi2.cdf(hl_statistic, df_hl)

    return float(hl_statistic), float(p_value)

def fixed_bin_calibration(y_true, y_prob, bin_edges):

    inds = np.digitize(y_prob, bin_edges, right=False) - 1
    inds = np.clip(inds, 0, len(bin_edges) - 2)

    n_bins = len(bin_edges) - 1
    prob_true = np.full(n_bins, np.nan)
    prob_pred = np.full(n_bins, np.nan)

    for b in range(n_bins):
        mask = inds == b
        if np.any(mask):
            prob_true[b] = np.mean(y_true[mask])
            prob_pred[b] = np.mean(y_prob[mask])
        else:
            prob_pred[b] = (bin_edges[b] + bin_edges[b+1]) / 2.0

    return prob_true, prob_pred

@validation_required
def calculate_ece(y_true, y_prob, n_bins=10):
    bin_edges = np.linspace(0., 1. + 1e-8, n_bins + 1)
    prob_true, prob_pred = fixed_bin_calibration(y_true, y_prob, bin_edges)


    mask = ~np.isnan(prob_true)
    if not np.any(mask): return np.nan


    inds = np.digitize(y_prob, bin_edges, right=False) - 1
    inds = np.clip(inds, 0, n_bins - 1)
    counts = np.bincount(inds, minlength=n_bins)

    weights = counts[mask] / len(y_true)
    ece = np.sum(weights * np.abs(prob_pred[mask] - prob_true[mask]))
    return float(ece)

@validation_required
def calculate_mce(y_true, y_prob, n_bins=10):
    bin_edges = np.linspace(0., 1. + 1e-8, n_bins + 1)
    prob_true, prob_pred = fixed_bin_calibration(y_true, y_prob, bin_edges)
    mask = ~np.isnan(prob_true)
    if not np.any(mask): return np.nan
    return float(np.max(np.abs(prob_pred[mask] - prob_true[mask])))



def calculate_all_metrics(y_true, y_pred, y_proba, calibration_bins=10):

    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel() if cm.shape == (2,2) else (0,0,0,0)

    ece = calculate_ece(y_true, y_proba, n_bins=calibration_bins)
    mce = calculate_mce(y_true, y_proba, n_bins=calibration_bins)
    slope, intercept = calculate_calibration_slope_intercept(y_true, y_proba)
    sz, sp = calculate_spiegelhalter_z(y_true, y_proba)
    hl_stat, hl_p = hosmer_lemeshow_test(y_true, y_proba, n_bins=calibration_bins)

    return {
        'Test_Samples': len(y_true),
        'Positive_Rate': np.mean(y_true),
        'AUROC': roc_auc_score(y_true, y_proba),
        'AUPRC': average_precision_score(y_true, y_proba),
        'Accuracy': accuracy_score(y_true, y_pred),
        'Balanced_Accuracy': balanced_accuracy_score(y_true, y_pred),
        'Sensitivity': recall_score(y_true, y_pred, zero_division=0),
        'Specificity': tn / (tn + fp) if (tn + fp) > 0 else 0.0,
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'F1': f1_score(y_true, y_pred, zero_division=0),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'Kappa': cohen_kappa_score(y_true, y_pred),
        'Brier_Score': brier_score_loss(y_true, y_proba),
        'Log_Loss': log_loss(y_true, y_proba),
        'ECE': ece,
        'MCE': mce,
        'Calibration_Slope': slope,
        'Calibration_Intercept': intercept,
        'Spiegelhalter_Z': sz,
        'Spiegelhalter_p': sp,
        'HL_statistic': hl_stat,
        'HL_p_value': hl_p,
        'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp,
        'NPV': tn / (tn + fn) if (tn + fn) > 0 else 0.0,
        'PPV': tp / (tp + fp) if (tp + fp) > 0 else 0.0
    }

def bootstrap_metrics(y_true, y_pred, y_proba, n_bootstrap=1000):

    boot_stats = []
    rng = np.random.default_rng(42)

    for _ in tqdm(range(n_bootstrap), desc="Bootstrapping metrics", leave=False):
        idx = rng.choice(len(y_true), size=len(y_true), replace=True)

        if len(np.unique(y_true[idx])) < 2: continue

        try:
            m = calculate_all_metrics(y_true[idx], y_pred[idx], y_proba[idx])
            boot_stats.append(m)
        except Exception:
            continue

    df_boot = pd.DataFrame(boot_stats)
    ci_results = {}
    for col in df_boot.columns:
        if pd.api.types.is_numeric_dtype(df_boot[col]):
            ci_results[f"{col}_CI_low"] = df_boot[col].quantile(0.025)
            ci_results[f"{col}_CI_high"] = df_boot[col].quantile(0.975)
    return ci_results


def compute_bootstrap_ci(y_true, y_proba, curve_kind, grid, n_bootstraps=1000, bin_edges=None):

    rng = np.random.default_rng(42)
    boot_curves = []

    def get_curve(yt, yp):
        if curve_kind == 'roc':
            x, y, _ = roc_curve(yt, yp)
            return x, y
        elif curve_kind == 'pr':
            p, r, _ = precision_recall_curve(yt, yp)
            return r, p
        elif curve_kind == 'cal':
            pt, pp = fixed_bin_calibration(yt, yp, bin_edges)
            mask = ~np.isnan(pt)
            return pp[mask], pt[mask]

    for _ in range(n_bootstraps):
        idx = rng.integers(0, len(y_true), len(y_true))
        if len(np.unique(y_true[idx])) < 2: continue

        try:
            x, y = get_curve(y_true[idx], y_proba[idx])

            if len(x) > 1:

                order = np.argsort(x)
                x, y = x[order], y[order]
                _, uidx = np.unique(x, return_index=True)
                y_interp = np.interp(grid, x[uidx], y[uidx], left=y[0], right=y[-1])
                boot_curves.append(y_interp)
        except Exception: continue

    boot_curves = np.array(boot_curves)
    if len(boot_curves) == 0: return np.zeros_like(grid), np.zeros_like(grid), np.zeros_like(grid)

    return np.mean(boot_curves, axis=0), np.percentile(boot_curves, 2.5, axis=0), np.percentile(boot_curves, 97.5, axis=0)

def plot_comprehensive_evaluation(y_true, y_proba, model_name="CatBoost", output_dir="."):

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=600)


    grid_roc = np.linspace(0, 1, 200)
    mean_tpr, lo_tpr, hi_tpr = compute_bootstrap_ci(y_true, y_proba, 'roc', grid_roc)
    auc = roc_auc_score(y_true, y_proba)

    ax1.plot(grid_roc, mean_tpr, color='tab:blue', lw=2, label=f'AUC = {auc:.3f}')
    ax1.fill_between(grid_roc, lo_tpr, hi_tpr, color='tab:blue', alpha=0.2)
    ax1.plot([0,1], [0,1], 'k--', lw=1)
    ax1.set_title('ROC Curve')
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.legend(loc='lower right')


    grid_pr = np.linspace(0, 1, 200)
    mean_prec, lo_prec, hi_prec = compute_bootstrap_ci(y_true, y_proba, 'pr', grid_pr)
    ap = average_precision_score(y_true, y_proba)
    prev = np.mean(y_true)

    ax2.plot(grid_pr, mean_prec, color='tab:green', lw=2, label=f'AP = {ap:.3f}')
    ax2.fill_between(grid_pr, lo_prec, hi_prec, color='tab:green', alpha=0.2)
    ax2.axhline(prev, color='k', ls='--', label=f'Baseline ({prev:.2f})')
    ax2.set_title('Precision-Recall')
    ax2.set_xlabel('Recall')
    ax2.set_ylabel('Precision')
    ax2.legend(loc='lower left')


    n_bins = 10
    bin_edges = np.linspace(0, 1, n_bins + 1)
    grid_cal = np.linspace(0, 1, 50)
    mean_cal, lo_cal, hi_cal = compute_bootstrap_ci(y_true, y_proba, 'cal', grid_cal, bin_edges=bin_edges)


    pt, pp = fixed_bin_calibration(y_true, y_proba, bin_edges)
    mask = ~np.isnan(pt)
    ece = calculate_ece(y_true, y_proba)

    ax3.plot(grid_cal, mean_cal, color='tab:orange', lw=1.5)
    ax3.fill_between(grid_cal, lo_cal, hi_cal, color='tab:orange', alpha=0.2)
    ax3.plot(pp[mask], pt[mask], 's', color='tab:orange', label=f'ECE = {ece:.3f}')
    ax3.plot([0,1], [0,1], 'k--', lw=1)
    ax3.set_title('Calibration')
    ax3.set_xlabel('Predicted Probability')
    ax3.set_ylabel('Observed Fraction')
    ax3.legend(loc='upper left')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name}_evaluation_plots.png"))
    plt.close()



def load_CatBoost_model(model_path):
    if not os.path.exists(model_path): raise FileNotFoundError(f"Missing: {model_path}")
    print(f"🔍 Loading: {model_path}")

    artifact = joblib.load(model_path)


    thresholds = {}
    metadata = artifact.get('metadata', {})

    for k in ['youden_j_threshold', 'threshold_youdenj', 'threshold']:
        if k in artifact: thresholds['Youden_J'] = float(artifact[k])
        if k in metadata: thresholds['Youden_J'] = float(metadata[k])


    if not thresholds: thresholds['Default_0.5'] = 0.5

    return artifact, thresholds, metadata

def evaluate_CatBoost_test(
        test_csv="test_set.csv",
        model_path="deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl",
        output_dir="test_evaluation_results",
        n_bootstrap=1000
    ):

    os.makedirs(output_dir, exist_ok=True)
    print(f"🚀 Starting Evaluation for {Path(model_path).stem}")


    df_test = pd.read_csv(test_csv)
    target_col = 'MDR status' if 'MDR status' in df_test.columns else 'outcome'
    X_test = df_test.drop(columns=[target_col])
    y_test = df_test[target_col].values

    artifact, thresholds, meta = load_CatBoost_model(model_path)
    model = artifact['calibrated_model'] if 'calibrated_model' in artifact else artifact['model']
    preprocessor = artifact['preprocessor']
    sel_features = artifact.get('selected_features', meta.get('selected_features'))


    print("🔮 Generating predictions...")
    X_proc = preprocessor.transform(X_test)


    if sel_features:

        def get_names(ct):
            names = []
            for nm, pipe, cols in ct.transformers_:
                if nm == 'remainder': continue
                if hasattr(pipe, 'get_feature_names_out'):
                    names.extend(pipe.get_feature_names_out(cols))
                elif hasattr(pipe, 'categories_'):
                    for c, cats in zip(cols, pipe.categories_):
                        names.extend([f"{c}_{x}" for x in cats])
                else: names.extend(cols)
            return names

        all_feats = get_names(preprocessor)
        if hasattr(X_proc, 'toarray'): X_proc = X_proc.toarray()


        df_proc = pd.DataFrame(X_proc, columns=all_feats)

        valid_feats = [f for f in sel_features if f in df_proc.columns]
        X_final = df_proc[valid_feats]
    else:
        X_final = X_proc

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_final)[:, 1]
    else:
        y_proba = model.decision_function(X_final)
        y_proba = 1 / (1 + np.exp(-y_proba))


    results_list = []

    for thr_name, thr_val in thresholds.items():
        print(f"\n📊 Evaluating @ {thr_name} = {thr_val:.4f}")
        y_pred = (y_proba >= thr_val).astype(int)


        metrics = calculate_all_metrics(y_test, y_pred, y_proba)

        ci = bootstrap_metrics(y_test, y_pred, y_proba, n_bootstrap)

        combined = {'Threshold_Name': thr_name, 'Threshold_Value': thr_val, **metrics, **ci}
        results_list.append(combined)

        print(f"   AUROC: {metrics['AUROC']:.3f} | F1: {metrics['F1']:.3f} | ECE: {metrics['ECE']:.3f}")


    res_df = pd.DataFrame(results_list)
    res_df.to_csv(os.path.join(output_dir, "CatBoost_Robust_Metrics.csv"), index=False)

    print("\n🎨 Generating Plots...")
    plot_comprehensive_evaluation(y_test, y_proba, "CatBoost", output_dir)

    print("\n✅ Evaluation Complete.")
    return res_df

if __name__ == "__main__":

    if os.path.exists("test_set.csv") and os.path.exists("deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl"):
        evaluate_CatBoost_test()
    else:
        print("❌ Files not found. Please upload 'test_set.csv' and model artifact.")

🚀 Starting Evaluation for CatBoost_with_thresholds_v1.0.0
🔍 Loading: deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl
🔮 Generating predictions...

📊 Evaluating @ Youden_J = 0.5100


Bootstrapping metrics:   0%|          | 0/1000 [00:00<?, ?it/s]

   AUROC: 0.832 | F1: 0.746 | ECE: 0.064

🎨 Generating Plots...

✅ Evaluation Complete.


**11. Comparative Analysis (Best Model vs. Logistic Regression)**  
Compare the chosen model against logistic regression as a baseline classifier.

In [17]:


import os
import sys
import logging
import warnings
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Union, Any
from functools import wraps

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from scipy.stats import norm, chi2
from scipy.special import logit
from sklearn.calibration import calibration_curve
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    precision_score, recall_score, confusion_matrix, roc_curve,
    precision_recall_curve, brier_score_loss, balanced_accuracy_score,
    matthews_corrcoef, cohen_kappa_score, log_loss, accuracy_score
)
from sklearn.utils import resample
from mlxtend.evaluate import mcnemar_table
from statsmodels.stats.contingency_tables import mcnemar
from joblib import Parallel, delayed
from tqdm.auto import tqdm


warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)


CONFIG = {
    'paths': {
        'test_data': "test_set.csv",
        'output_dir': "model_comparison",
        'logs_dir': "model_comparison/logs",
        'model_catboost': "deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl",
        'model_lr': "deployment_artifacts/LogisticRegression_with_thresholds_v1.0.0.pkl"
    },
    'bootstrap': {
        'n_bootstraps': 1000,
        'n_jobs': -1,
        'random_seed': 42
    },
    'calibration': {
        'n_bins': 10,
        'method': 'uniform'
    },
    'plotting': {
        'dpi': 600,
        'figsize': (10.5, 3.5),
        'font_family': 'DejaVu Sans'
    },
    'logging': {
        'level': 'INFO',
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    }
}


plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("tab10")


logger = logging.getLogger(__name__)



def setup_logging() -> logging.Logger:


    log_dir = Path(CONFIG['paths']['logs_dir'])
    log_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"model_comparison_{timestamp}.log"

    logging.basicConfig(
        level=getattr(logging, CONFIG['logging']['level']),
        format=CONFIG['logging']['format'],
        handlers=[
            logging.FileHandler(log_file, mode='w', encoding='utf-8'),
            logging.StreamHandler(sys.stdout)
        ]
    )

    logger.info("=" * 60)
    logger.info("Model Comparison Tool")
    logger.info(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info(f"Python: {sys.version}")
    logger.info(f"Log file: {log_file}")
    logger.info("=" * 60)

    return logger

def validate_input_data(y_true: np.ndarray, y_prob: np.ndarray,
                       context: str = "") -> Dict[str, Any]:

    results = {
        'is_valid': True,
        'warnings': [],
        'errors': [],
        'y_true_clean': y_true.copy(),
        'y_prob_clean': y_prob.copy(),
        'n_original': len(y_true)
    }


    if not isinstance(y_true, np.ndarray):
        results['errors'].append("y_true must be numpy array")
        results['is_valid'] = False

    if not isinstance(y_prob, np.ndarray):
        results['errors'].append("y_prob must be numpy array")
        results['is_valid'] = False

    if not results['is_valid']:
        return results


    if y_true.shape != y_prob.shape:
        results['errors'].append(
            f"Shape mismatch: y_true {y_true.shape}, y_prob {y_prob.shape}"
        )
        results['is_valid'] = False
        return results

    nan_mask = np.isnan(y_prob) | np.isinf(y_prob)
    nan_count = np.sum(nan_mask)

    if nan_count > 0:
        results['warnings'].append(f"Removed {nan_count} NaN/Inf values")
        finite_mask = ~nan_mask
        results['y_true_clean'] = y_true[finite_mask]
        results['y_prob_clean'] = y_prob[finite_mask]

    if len(results['y_prob_clean']) > 0:
        out_of_bounds = np.sum(
            (results['y_prob_clean'] < 0) | (results['y_prob_clean'] > 1)
        )
        if out_of_bounds > 0:
            results['warnings'].append(
                f"Clipped {out_of_bounds} values outside [0, 1]"
            )
            results['y_prob_clean'] = np.clip(results['y_prob_clean'], 0, 1)


    if len(results['y_true_clean']) > 0:
        unique_classes = np.unique(results['y_true_clean'])
        if len(unique_classes) < 2:
            results['errors'].append("Only one class present")
            results['is_valid'] = False


        if len(results['y_true_clean']) < 20:
            results['warnings'].append(
                f"Small sample size: {len(results['y_true_clean'])}"
            )


    if results['warnings'] and context:
        for warning in results['warnings']:
            logger.warning(f"{context}: {warning}")

    return results

def validation_required(func):

    @wraps(func)
    def wrapper(y_true, y_prob, *args, **kwargs):
        context = f"{func.__name__}"
        validation = validate_input_data(y_true, y_prob, context)

        if not validation['is_valid']:
            error_msg = f"Input validation failed: {validation['errors']}"
            logger.error(error_msg)
            raise ValueError(error_msg)

        if len(validation['y_true_clean']) < 10:
            logger.warning(f"Insufficient data after cleaning: "
                          f"{len(validation['y_true_clean'])} samples")
            return np.nan

        return func(validation['y_true_clean'], validation['y_prob_clean'],
                   *args, **kwargs)
    return wrapper


@validation_required
def calculate_ece(y_true: np.ndarray, y_prob: np.ndarray,
                 n_bins: int = 10, method: str = 'uniform') -> float:

    if method == 'quantile':
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.percentile(y_prob, quantiles * 100)
        bin_edges[0] = 0.0
        bin_edges[-1] = 1.0
    else:
        bin_edges = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1)

    ece = 0.0
    total_samples = len(y_true)

    for i in range(n_bins):
        if i == n_bins - 1:
            mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:
            mask = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask])
            avg_true = np.mean(y_true[mask])
            ece += (n_in_bin / total_samples) * np.abs(avg_pred - avg_true)

    return ece

@validation_required
def calculate_mce(y_true: np.ndarray, y_prob: np.ndarray,
                 n_bins: int = 10) -> float:

    bin_edges = np.linspace(0.0, 1.0 + 1e-8, n_bins + 1)
    max_error = 0.0

    for i in range(n_bins):
        if i == n_bins - 1:
            mask = (y_prob >= bin_edges[i]) & (y_prob <= bin_edges[i+1])
        else:
            mask = (y_prob >= bin_edges[i]) & (y_prob < bin_edges[i+1])

        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            avg_pred = np.mean(y_prob[mask])
            avg_true = np.mean(y_true[mask])
            error = abs(avg_pred - avg_true)
            max_error = max(max_error, error)

    return max_error if max_error > 0 else np.nan

@validation_required
def calculate_calibration_slope_intercept(y_true: np.ndarray,
                                         y_prob: np.ndarray) -> Tuple[float, float]:

    try:
        epsilon = 1e-7
        p = np.clip(y_prob, epsilon, 1 - epsilon)
        logit_p = logit(p)
        X = sm.add_constant(logit_p)

        if len(np.unique(y_true)) < 2:
            return np.nan, np.nan

        model = sm.Logit(y_true, X).fit(disp=False, maxiter=100)
        return float(model.params[1]), float(model.params[0])
    except Exception as e:
        logger.debug(f"Calibration slope calculation failed: {e}")
        return np.nan, np.nan



def calculate_spiegelhalter_z(y_true, y_prob):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 10:
        return np.nan, np.nan


    numerator = np.sum(y_true - y_prob)

    denominator = np.sqrt(np.sum(y_prob * (1 - y_prob)))

    if denominator < 1e-8:
        return np.nan, np.nan

    z_stat = numerator / denominator

    p_value = 2 * (1 - norm.cdf(abs(z_stat)))

    return float(z_stat), float(p_value)

def hosmer_lemeshow_test(y_true, y_prob, n_bins=10, min_expected_freq=5):

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    mask = ~(np.isnan(y_prob) | np.isinf(y_prob))
    y_true, y_prob = y_true[mask], y_prob[mask]

    if len(y_true) < 20 or len(np.unique(y_true)) < 2:
        return np.nan, np.nan

    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})


    try:
        df['bin'] = pd.qcut(df['y_prob'], n_bins, labels=False, duplicates='drop')
    except ValueError:

        df['bin'] = np.floor(df['y_prob'] * n_bins).astype(int)
        df.loc[df['bin'] == n_bins, 'bin'] = n_bins - 1

    summary = df.groupby('bin').agg(
        observed=('y_true', 'sum'),
        expected=('y_prob', 'sum'),
        n_total=('y_true', 'size')
    ).reset_index()


    while True:

        sparse_bins = summary[summary['expected'] < min_expected_freq]

        if sparse_bins.empty or len(summary) <= 2:
            break

        merge_idx = sparse_bins.index[0]

        if merge_idx == 0:

            target_idx = 1
            source_idx = 0
        else:

            target_idx = merge_idx - 1
            source_idx = merge_idx

        summary.loc[target_idx, ['observed', 'expected', 'n_total']] += \
            summary.loc[source_idx, ['observed', 'expected', 'n_total']]

        summary = summary.drop(source_idx).reset_index(drop=True)


    g = len(summary)
    if g <= 2:
        return np.nan, np.nan


    numerator = (summary['observed'] - summary['expected'])**2
    denominator = summary['expected'] * (1 - summary['expected'] / summary['n_total'])


    denominator[denominator < 1e-8] = 1e-8

    hl_statistic = (numerator / denominator).sum()


    df_hl = g - 2
    p_value = 1 - chi2.cdf(hl_statistic, df_hl)

    return float(hl_statistic), float(p_value)

def fixed_bin_calibration(y_true: np.ndarray,
                         y_prob: np.ndarray,
                         bin_edges: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:

    inds = np.digitize(y_prob, bin_edges, right=False) - 1
    inds = np.clip(inds, 0, len(bin_edges) - 2)

    n_bins = len(bin_edges) - 1
    prob_true = np.full(n_bins, np.nan)
    prob_pred = np.full(n_bins, np.nan)

    for b in range(n_bins):
        mask = inds == b
        if np.any(mask):
            prob_true[b] = np.mean(y_true[mask])
            prob_pred[b] = np.mean(y_prob[mask])
        else:

            prob_pred[b] = (bin_edges[b] + bin_edges[b + 1]) / 2.0

    return prob_true, prob_pred



def compute_bootstrap_ci_parallel(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    curve_kind: str,
    grid: np.ndarray,
    n_bootstraps: int = 1000,
    bin_edges: Optional[np.ndarray] = None,
    n_jobs: int = -1,
    random_seed: int = 42
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:

    logger.info(f"Computing bootstrap CI for {curve_kind} "
               f"({n_bootstraps} iterations, {n_jobs} jobs)")

    rng = np.random.default_rng(random_seed)
    n = len(y_true)

    def run_one_bootstrap(seed: int) -> np.ndarray:

        local_rng = np.random.default_rng(seed)
        idx = local_rng.integers(0, n, n)
        yt = y_true[idx]
        yp = y_proba[idx]

        try:
            if curve_kind == 'roc':
                x, yv, _ = roc_curve(yt, yp)
            elif curve_kind == 'pr':
                precision, recall, _ = precision_recall_curve(yt, yp)
                x, yv = recall, precision
            elif curve_kind == 'cal':
                if bin_edges is None:
                    return np.full_like(grid, np.nan)
                prob_true, prob_pred = fixed_bin_calibration(yt, yp, bin_edges)
                mask = ~np.isnan(prob_true)
                x, yv = prob_pred[mask], prob_true[mask]
            else:
                raise ValueError(f"Unknown curve kind: {curve_kind}")


            if len(x) < 2:
                return np.full_like(grid, np.nan)


            order = np.argsort(x)
            x, yv = x[order], yv[order]

            xu, idx_unique = np.unique(x, return_index=True)
            yu = yv[idx_unique]

            if len(xu) > 1:
                return np.interp(grid, xu, yu,
                               left=yu[0], right=yu[-1])
            else:
                return np.full_like(grid, yu[0])

        except Exception as e:
            logger.debug(f"Bootstrap iteration failed: {e}")
            return np.full_like(grid, np.nan)


    seeds = rng.integers(0, 1_000_000, n_bootstraps)


    boot_curves = Parallel(n_jobs=n_jobs, verbose=0)(
        delayed(run_one_bootstrap)(seed)
        for seed in tqdm(seeds, desc=f"Bootstrapping {curve_kind}",
                        disable=len(seeds) < 100)
    )

    boot_curves = np.array(boot_curves)

    valid_mask = ~np.any(np.isnan(boot_curves), axis=1)
    n_valid = np.sum(valid_mask)

    if n_valid < n_bootstraps:
        logger.warning(f"{n_bootstraps - n_valid} bootstrap iterations failed")

    if n_valid == 0:
        raise RuntimeError("All bootstrap iterations failed")

    boot_curves = boot_curves[valid_mask]


    mean_curve = np.nanmean(boot_curves, axis=0)
    lower_ci = np.nanpercentile(boot_curves, 2.5, axis=0)
    upper_ci = np.nanpercentile(boot_curves, 97.5, axis=0)

    return mean_curve, lower_ci, upper_ci

def bootstrap_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_proba: np.ndarray,
    n_bootstrap: int = 1000
) -> Dict[str, float]:

    logger.info(f"Bootstrapping metrics ({n_bootstrap} iterations)")

    boot_metrics = []
    rng = np.random.default_rng(CONFIG['bootstrap']['random_seed'])

    for i in tqdm(range(n_bootstrap), desc="Bootstrapping metrics"):
        idx = rng.choice(len(y_true), size=len(y_true), replace=True)
        metrics = calculate_all_metrics(
            y_true[idx], y_pred[idx], y_proba[idx]
        )
        boot_metrics.append(metrics)

    df_boot = pd.DataFrame(boot_metrics)
    ci_results = {}

    for col in df_boot.columns:
        if df_boot[col].notna().any():
            ci_results[f"{col}_lower"] = float(df_boot[col].quantile(0.025))
            ci_results[f"{col}_upper"] = float(df_boot[col].quantile(0.975))

    return ci_results



def cohens_d(group1: np.ndarray, group2: np.ndarray) -> float:

    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2: return np.nan

    m1, m2 = np.mean(group1), np.mean(group2)
    s1, s2 = np.std(group1, ddof=1), np.std(group2, ddof=1)

    pooled_std = np.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))

    if pooled_std == 0: return np.nan
    return (m1 - m2) / pooled_std

def cliffs_delta(group1: np.ndarray, group2: np.ndarray) -> float:

    if len(group1) == 0 or len(group2) == 0: return np.nan


    comparisons = np.sign(np.subtract.outer(group1, group2))
    dominance = np.sum(comparisons)

    return dominance / (len(group1) * len(group2))

def interpret_cohens_d(d: float) -> str:

    abs_d = abs(d)
    if np.isnan(abs_d): return "Unknown"
    elif abs_d < 0.2: return "Negligible"
    elif abs_d < 0.5: return "Small"
    elif abs_d < 0.8: return "Medium"
    else: return "Large"

def interpret_cliffs_delta(delta: float) -> str:

    abs_delta = abs(delta)
    if np.isnan(abs_delta): return "Unknown"
    elif abs_delta < 0.147: return "Negligible"
    elif abs_delta < 0.33: return "Small"
    elif abs_delta < 0.474: return "Medium"
    else: return "Large"

def calculate_power_analysis(cohens_d_val: float, n_sample: int, alpha: float = 0.05) -> Dict[str, Any]:

    if np.isnan(cohens_d_val) or n_sample <= 0:
        return {'observed_power': np.nan, 'power_interpretation': 'Cannot calculate', 'recommended_n_80_power': np.nan}

    z_alpha = norm.ppf(1 - alpha / 2)
    delta = abs(cohens_d_val) * np.sqrt(n_sample / 2)
    power = 1 - norm.cdf(z_alpha - delta) + norm.cdf(-z_alpha - delta)

    if power >= 0.8: interp = "Adequate (≥80%)"
    elif power >= 0.6: interp = "Moderate (60-79%)"
    else: interp = "Low (<60%)"


    if abs(cohens_d_val) > 1e-10:
        n_80 = 2 * ((z_alpha + norm.ppf(0.8)) / abs(cohens_d_val)) ** 2
        n_80 = int(np.ceil(n_80))
    else:
        n_80 = np.inf

    return {
        'observed_power': float(power),
        'power_interpretation': interp,
        'recommended_n_80_power': n_80
    }

def mcnemar_test_with_effect_size(
    y_true: np.ndarray,
    y_pred1: np.ndarray,
    y_pred2: np.ndarray
) -> Tuple[float, float, str]:

    try:
        table = mcnemar_table(y_target=y_true, y_model1=y_pred1, y_model2=y_pred2)
        result = mcnemar(table, exact=False, correction=True)

        b, c = table[0, 1], table[1, 0]

        if c == 0:
            odds_ratio = 1.0 if b == 0 else np.inf
        else:
            odds_ratio = b / c


        if np.isnan(odds_ratio):
            interpretation = "Unknown"
        elif np.isinf(odds_ratio):
            interpretation = "Complete Dominance"
        elif odds_ratio == 0:
            interpretation = "Complete Dominance (Negative)"

        elif odds_ratio >= 2.0 or odds_ratio <= 0.5:
            interpretation = "Large"
        elif odds_ratio >= 1.25 or odds_ratio <= 0.8:
            interpretation = "Medium"
        elif odds_ratio >= 1.05 or odds_ratio <= 0.95:
            interpretation = "Small"
        else:
            interpretation = "Negligible"

        return float(result.pvalue), float(odds_ratio), interpretation

    except Exception as e:
        logger.error(f"McNemar test failed: {e}")
        return np.nan, np.nan, "Unknown"


def plot_effect_sizes(
    effect_results: Dict[str, Any],
    output_path: Optional[str] = None
) -> None:

    logger.info("Generating improved effect size plots...")

    plt.rcParams.update({
        "font.family": CONFIG['plotting']['font_family'],
        "font.size": 9,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "xtick.labelsize": 8,
        "legend.fontsize": 8,
    })

    fig, axes = plt.subplots(1, 3, figsize=CONFIG['plotting']['figsize'],
                             dpi=CONFIG['plotting']['dpi'])
    ax1, ax2, ax3 = axes

    model_a = effect_results.get('Model_A', 'Model A')

    color_map = {
        'Negligible': '#D3D3D3', 'Small': '#87CEEB',
        'Medium': '#FFA500', 'Large': '#FF6347',
        'Complete Dominance': '#8B0000', 'Unknown': '#808080'
    }


    metrics = ['AUROC', 'AUPRC']
    vals = [effect_results.get(f'{m}_Cohens_d', np.nan) for m in metrics]
    interps = [effect_results.get(f'{m}_Cohens_d_interpretation', 'Unknown') for m in metrics]
    colors = [color_map.get(i, '#808080') for i in interps]

    bars1 = ax1.bar(metrics, vals, color=colors, alpha=0.85, width=0.6, edgecolor='black', linewidth=0.5)
    ax1.axhline(0, color='black', lw=0.8)


    for bar, val, interp in zip(bars1, vals, interps):
        if not np.isnan(val):
            y_pos = val + 0.1 if val >= 0 else val - 0.2
            ax1.text(bar.get_x() + bar.get_width()/2, y_pos,
                     f'{val:.2f}\n({interp})', ha='center', fontsize=7)

    ax1.set_ylabel("Cohen's d")

    ax1.set_title(f"Effect Sizes (Cohen's d)\nPositive favors {model_a}", fontsize=8, fontweight='bold')

    max_val = max([abs(v) for v in vals if not np.isnan(v)] + [1])
    ax1.set_ylim(-(max_val+0.5), max_val+0.5)
    ax1.grid(True, alpha=0.2, axis='y')


    vals = [effect_results.get(f'{m}_Cliffs_delta', np.nan) for m in metrics]
    interps = [effect_results.get(f'{m}_Cliffs_delta_interpretation', 'Unknown') for m in metrics]
    colors = [color_map.get(i, '#808080') for i in interps]

    bars2 = ax2.bar(metrics, vals, color=colors, alpha=0.85, width=0.6, edgecolor='black', linewidth=0.5)
    ax2.axhline(0, color='black', lw=0.8)

    for bar, val, interp in zip(bars2, vals, interps):
        if not np.isnan(val):
            y_pos = val + 0.05 if val >= 0 else val - 0.1
            ax2.text(bar.get_x() + bar.get_width()/2, y_pos,
                     f'{val:.2f}\n({interp})', ha='center', fontsize=7)

    ax2.set_ylabel("Cliff's Delta")

    ax2.set_title(f"Effect Sizes (Cliff's Delta)\nPositive favors {model_a}", fontsize=8, fontweight='bold')
    ax2.set_ylim(-1.2, 1.2)
    ax2.grid(True, alpha=0.2, axis='y')


    or_val = effect_results.get('McNemar_Odds_ratio', np.nan)
    interp = effect_results.get('McNemar_effect_interpretation', 'Unknown')

    if not np.isnan(or_val) and not np.isinf(or_val) and or_val > 0:
        log_or = np.log(or_val)
        color = color_map.get(interp, '#808080')

        ax3.bar(['McNemar'], [log_or], color=color, alpha=0.85, width=0.5, edgecolor='black', linewidth=0.5)
        ax3.axhline(0, color='black', lw=0.8, label='OR=1')


        for thresh in [np.log(1.25), np.log(2.0)]:
            ax3.axhline(thresh, color='gray', linestyle=':', alpha=0.3)
            ax3.axhline(-thresh, color='gray', linestyle=':', alpha=0.3)

        ax3.text(0, log_or + (0.1 if log_or>0 else -0.2),
                 f'OR={or_val:.2f}\n({interp})', ha='center', fontsize=7)


        ax3_twin = ax3.twinx()
        ticks = [0.5, 0.8, 1, 1.25, 2, 3, 5]
        ax3_twin.set_yticks(np.log(ticks))
        ax3_twin.set_yticklabels(ticks, fontsize=7)
        ax3_twin.set_ylabel("Odds Ratio")

    ax3.set_ylabel("Log(Odds Ratio)")

    ax3.set_title(f"Classification Difference\nOR > 1 favors {model_a}", fontsize=8, fontweight='bold')
    ax3.grid(True, alpha=0.2, axis='y')

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=CONFIG['plotting']['dpi'], bbox_inches='tight')
        plt.savefig(str(output_path).replace('.png', '.pdf'), bbox_inches='tight')
        plt.close()
        logger.info(f"Effect size plot saved to {output_path}")


def get_onehot_feature_names_from_ct(column_transformer) -> List[str]:

    names = []
    for name, transformer, columns in column_transformer.transformers_:
        if name == "remainder" and transformer == "drop":
            continue

        if (name == "cat" and hasattr(transformer, "named_steps") and
            "onehot" in transformer.named_steps):
            encoder = transformer.named_steps["onehot"]
            for col, categories in zip(columns, encoder.categories_):
                names.extend([f"{col}_{cat}" for cat in categories])
        else:
            names.extend(list(columns))

    return names

def load_and_evaluate_model(
    model_path: str,
    X_raw: pd.DataFrame,
    y_true: np.ndarray
) -> Tuple[str, np.ndarray, np.ndarray, float]:

    logger.info(f"Loading model from {model_path}")

    try:

        bundle = joblib.load(model_path)

        model_name = bundle.get("metadata", {}).get(
            "model_name",
            Path(model_path).stem
        )

        model = bundle["model"]
        preprocessor = bundle.get("preprocessor")
        selected_features = bundle.get("selected_features")
        threshold = bundle.get("youden_j_threshold", 0.5)

        if preprocessor is None:
            raise ValueError(f"Model {model_name} missing preprocessor")

        if selected_features is None:
            raise ValueError(f"Model {model_name} missing selected_features")


        X_transformed = preprocessor.transform(X_raw)
        feature_names = get_onehot_feature_names_from_ct(preprocessor)
        X_df = pd.DataFrame(X_transformed, columns=feature_names, index=X_raw.index)


        missing_features = [f for f in selected_features if f not in X_df.columns]
        if missing_features:
            raise ValueError(
                f"Model {model_name} missing features after transform: "
                f"{missing_features[:5]}..."
            )

        X_selected = X_df[selected_features]


        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_selected)[:, 1]
        else:

            scores = model.decision_function(X_selected)
            y_proba = 1 / (1 + np.exp(-scores))

        y_pred = (y_proba >= threshold).astype(int)

        logger.info(f"Model '{model_name}' evaluated: "
                   f"{len(y_proba)} samples, threshold={threshold:.3f}")

        return model_name, y_proba, y_pred, threshold

    except Exception as e:
        logger.error(f"Failed to load/evaluate model {model_path}: {e}")
        raise

def calculate_all_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_proba: np.ndarray
) -> Dict[str, Any]:

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (2, 2):
        tn, fp, fn, tp = cm.ravel()
    else:
        tn = fp = fn = tp = 0


    cal_slope, cal_intercept = calculate_calibration_slope_intercept(y_true, y_proba)
    spiegel_z, spiegel_p = calculate_spiegelhalter_z(y_true, y_proba)
    hl_stat, hl_p = hosmer_lemeshow_test(y_true, y_proba,
                                        CONFIG['calibration']['n_bins'])


    metrics = {
        'Test_Samples': len(y_true),
        'Positive_Rate': float(np.mean(y_true)),
        'Intercept': float(cal_intercept),
        'Slope': float(cal_slope),
        'MCC': float(matthews_corrcoef(y_true, y_pred)),
        'Kappa': float(cohen_kappa_score(y_true, y_pred)),
        'Brier_Score': float(brier_score_loss(y_true, y_proba)),
        'Log_Loss': float(log_loss(y_true, y_proba)),
        'ECE': float(calculate_ece(y_true, y_proba,
                                  CONFIG['calibration']['n_bins'])),
        'MCE': float(calculate_mce(y_true, y_proba,
                                  CONFIG['calibration']['n_bins'])),
        'Spiegelhalter_Z': float(spiegel_z),
        'Spiegelhalter_p': float(spiegel_p),
        'HL_statistic': float(hl_stat),
        'HL_p_value': float(hl_p),
        'AUROC': float(roc_auc_score(y_true, y_proba)),
        'AUPRC': float(average_precision_score(y_true, y_proba)),
        'F1': float(f1_score(y_true, y_pred, zero_division=0)),
        'Precision': float(precision_score(y_true, y_pred, zero_division=0)),
        'Recall': float(recall_score(y_true, y_pred, zero_division=0)),
        'Accuracy': float(accuracy_score(y_true, y_pred)),
        'Balanced_Accuracy': float(balanced_accuracy_score(y_true, y_pred)),
        'TN': int(tn),
        'FP': int(fp),
        'FN': int(fn),
        'TP': int(tp),
    }


    if (tn + fp) > 0:
        metrics['Specificity'] = tn / (tn + fp)
    else:
        metrics['Specificity'] = np.nan

    if (tn + fn) > 0:
        metrics['NPV'] = tn / (tn + fn)
    else:
        metrics['NPV'] = np.nan

    if (tp + fp) > 0:
        metrics['PPV'] = tp / (tp + fp)
    else:
        metrics['PPV'] = np.nan

    if (fp + tn) > 0:
        metrics['FPR'] = fp / (fp + tn)
    else:
        metrics['FPR'] = np.nan

    if (fn + tp) > 0:
        metrics['FNR'] = fn / (fn + tp)
    else:
        metrics['FNR'] = np.nan

    return metrics

def compare_models_effect_size(
    model1_data: Tuple[str, np.ndarray, np.ndarray],
    model2_data: Tuple[str, np.ndarray, np.ndarray],
    y_true: np.ndarray,
    n_bootstrap: int = 1000
) -> Dict[str, Any]:

    name1, proba1, pred1 = model1_data
    name2, proba2, pred2 = model2_data

    logger.info(f"Comparing models: {name1} vs {name2}")

    results = {
        'Model_A': name1,
        'Model_B': name2,
        'N_samples': len(y_true)
    }


    try:
        auroc1 = roc_auc_score(y_true, proba1)
        auroc2 = roc_auc_score(y_true, proba2)
        auprc1 = average_precision_score(y_true, proba1)
        auprc2 = average_precision_score(y_true, proba2)

        results.update({
            'AUROC_A': float(auroc1), 'AUROC_B': float(auroc2),
            'AUROC_diff': float(auroc1 - auroc2),
            'AUPRC_A': float(auprc1), 'AUPRC_B': float(auprc2),
            'AUPRC_diff': float(auprc1 - auprc2),
        })
    except Exception as e:
        logger.error(f"Failed to calculate point metrics: {e}")


    rng = np.random.default_rng(CONFIG['bootstrap']['random_seed'])
    auroc_s1, auroc_s2 = [], []
    auprc_s1, auprc_s2 = [], []

    for _ in tqdm(range(n_bootstrap), desc="Bootstrapping effect sizes"):
        idx = rng.choice(len(y_true), len(y_true), replace=True)

        if len(np.unique(y_true[idx])) < 2: continue

        try:
            auroc_s1.append(roc_auc_score(y_true[idx], proba1[idx]))
            auroc_s2.append(roc_auc_score(y_true[idx], proba2[idx]))
            auprc_s1.append(average_precision_score(y_true[idx], proba1[idx]))
            auprc_s2.append(average_precision_score(y_true[idx], proba2[idx]))
        except Exception: continue


    for metric, s1, s2 in [('AUROC', auroc_s1, auroc_s2), ('AUPRC', auprc_s1, auprc_s2)]:
        if len(s1) == 0: continue

        s1, s2 = np.array(s1), np.array(s2)
        d_val = cohens_d(s1, s2)
        delta_val = cliffs_delta(s1, s2)
        power_res = calculate_power_analysis(d_val, len(y_true))

        results[f'{metric}_Cohens_d'] = float(d_val)
        results[f'{metric}_Cohens_d_interpretation'] = interpret_cohens_d(d_val)
        results[f'{metric}_Cliffs_delta'] = float(delta_val)
        results[f'{metric}_Cliffs_delta_interpretation'] = interpret_cliffs_delta(delta_val)


        results[f'{metric}_observed_power'] = power_res['observed_power']
        results[f'{metric}_power_interpretation'] = power_res['power_interpretation']
        results[f'{metric}_recommended_n_80_power'] = power_res['recommended_n_80_power']


    p_mcnemar, or_mcnemar, interp_mcnemar = mcnemar_test_with_effect_size(
        y_true, pred1, pred2
    )

    results.update({
        'McNemar_p_value': float(p_mcnemar),
        'McNemar_Odds_ratio': float(or_mcnemar),
        'McNemar_effect_interpretation': interp_mcnemar
    })

    return results


def plot_with_ci(
    models_data: List[Tuple[str, np.ndarray, np.ndarray]],
    y_true: np.ndarray,
    output_path: Optional[str] = None
) -> None:

    logger.info("Generating comparison plots")


    plt.rcParams.update({
        "font.family": CONFIG['plotting']['font_family'],
        "font.size": 9,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        "legend.fontsize": 8,
        "lines.linewidth": 1.2,
        "axes.linewidth": 0.8,
        "grid.linewidth": 0.3,
        "grid.alpha": 0.3,
    })


    fig, axes = plt.subplots(1, 3, figsize=CONFIG['plotting']['figsize'],
                            dpi=CONFIG['plotting']['dpi'])
    ax_roc, ax_pr, ax_cal = axes


    colors = plt.cm.tab10(np.linspace(0, 1, len(models_data)))

    grid_roc = np.linspace(0, 1, 201)
    grid_pr = np.linspace(0, 1, 201)
    grid_cal = np.linspace(0, 1, 100)

    bin_edges = np.linspace(0, 1, CONFIG['calibration']['n_bins'] + 1)


    logger.info("Plotting ROC curves...")
    for idx, (name, proba, _) in enumerate(models_data):

        fpr, tpr, _ = roc_curve(y_true, proba)
        auc_val = roc_auc_score(y_true, proba)

        mean_tpr, lo_tpr, hi_tpr = compute_bootstrap_ci_parallel(
            y_true, proba, 'roc', grid_roc,
            n_bootstraps=CONFIG['bootstrap']['n_bootstraps'],
            n_jobs=CONFIG['bootstrap']['n_jobs'],
            random_seed=CONFIG['bootstrap']['random_seed']
        )

        ax_roc.plot(fpr, tpr, '--', alpha=0.4, lw=0.8, color=colors[idx])
        ax_roc.plot(grid_roc, mean_tpr, lw=1.5, label=f"{name} (AUC={auc_val:.3f})",
                   color=colors[idx])
        ax_roc.fill_between(grid_roc, lo_tpr, hi_tpr, alpha=0.2, color=colors[idx])

    ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.3, lw=1)
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title("ROC Curves with 95% CI")
    ax_roc.legend(loc="lower right", frameon=True, fancybox=True,
                 framealpha=0.8, fontsize=7)
    ax_roc.grid(True, alpha=0.2)
    ax_roc.set_xlim([-0.02, 1.02])
    ax_roc.set_ylim([-0.02, 1.02])


    logger.info("Plotting PR curves...")
    baseline = float(np.mean(y_true))

    for idx, (name, proba, _) in enumerate(models_data):

        precision, recall, _ = precision_recall_curve(y_true, proba)
        ap = average_precision_score(y_true, proba)


        mean_prec, lo_prec, hi_prec = compute_bootstrap_ci_parallel(
            y_true, proba, 'pr', grid_pr,
            n_bootstraps=CONFIG['bootstrap']['n_bootstraps'],
            n_jobs=CONFIG['bootstrap']['n_jobs'],
            random_seed=CONFIG['bootstrap']['random_seed']
        )

        ax_pr.step(recall, precision, '--', alpha=0.4, lw=0.8, color=colors[idx])
        ax_pr.plot(grid_pr, mean_prec, lw=1.5, label=f"{name} (AP={ap:.3f})",
                  color=colors[idx])
        ax_pr.fill_between(grid_pr, lo_prec, hi_prec, alpha=0.2, color=colors[idx])

    ax_pr.axhline(baseline, color='k', linestyle='--', alpha=0.5,
                 lw=1, label=f'Baseline (P={baseline:.2f})')
    ax_pr.set_xlabel("Recall")
    ax_pr.set_ylabel("Precision")
    ax_pr.set_title("Precision-Recall Curves with 95% CI")
    ax_pr.legend(loc="lower left", frameon=True, fancybox=True,
                framealpha=0.8, fontsize=7)
    ax_pr.grid(True, alpha=0.2)
    ax_pr.set_xlim([-0.02, 1.02])
    ax_pr.set_ylim([-0.02, 1.02])

    logger.info("Plotting calibration curves...")
    for idx, (name, proba, _) in enumerate(models_data):

        prob_true, prob_pred = fixed_bin_calibration(y_true, proba, bin_edges)
        ece = calculate_ece(y_true, proba, CONFIG['calibration']['n_bins'])

        mean_cal, lo_cal, hi_cal = compute_bootstrap_ci_parallel(
            y_true, proba, 'cal', grid_cal,
            n_bootstraps=CONFIG['bootstrap']['n_bootstraps'],
            bin_edges=bin_edges,
            n_jobs=CONFIG['bootstrap']['n_jobs'],
            random_seed=CONFIG['bootstrap']['random_seed']
        )

        mask = ~np.isnan(prob_true)
        if np.any(mask):
            ax_cal.plot(prob_pred[mask], prob_true[mask], 'o--', alpha=0.8,
                       markersize=4, lw=1, color=colors[idx],
                       label=f"{name} (ECE={ece:.3f})")

        ax_cal.plot(grid_cal, mean_cal, lw=1.5, color=colors[idx])
        ax_cal.fill_between(grid_cal, lo_cal, hi_cal, alpha=0.2, color=colors[idx])

    ax_cal.plot([0, 1], [0, 1], 'k:', lw=1.5, label='Perfect calibration')
    ax_cal.set_xlabel("Mean Predicted Probability")
    ax_cal.set_ylabel("Fraction of Positives")
    ax_cal.set_title("Calibration Curves with 95% CI")
    ax_cal.legend(loc="upper left", frameon=True, fancybox=True,
                 framealpha=0.8, fontsize=7)
    ax_cal.grid(True, alpha=0.2)
    ax_cal.set_xlim([-0.02, 1.02])
    ax_cal.set_ylim([-0.02, 1.02])


    plt.tight_layout(pad=2.0)

    if output_path is None:
        output_path = Path(CONFIG['paths']['output_dir']) / "model_comparison_plots.png"

    plt.savefig(output_path, dpi=CONFIG['plotting']['dpi'], bbox_inches='tight')
    plt.savefig(str(output_path).replace('.png', '.pdf'), bbox_inches='tight')
    plt.close()

    logger.info(f"Plots saved to {output_path}")

def plot_effect_sizes(
    effect_results: Dict[str, Any],
    output_path: Optional[str] = None
) -> None:

    logger.info("Generating improved effect size plots...")

    plt.rcParams.update({
        "font.family": CONFIG['plotting']['font_family'],
        "font.size": 9,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "xtick.labelsize": 8,
        "legend.fontsize": 8,
    })

    fig, axes = plt.subplots(1, 3, figsize=CONFIG['plotting']['figsize'],
                             dpi=CONFIG['plotting']['dpi'])
    ax1, ax2, ax3 = axes


    model_a = effect_results.get('Model_A', 'Model A')


    color_map = {
        'Negligible': '#D3D3D3', 'Small': '#87CEEB',
        'Medium': '#FFA500', 'Large': '#FF6347',
        'Complete Dominance': '#8B0000', 'Unknown': '#808080'
    }


    metrics = ['AUROC', 'AUPRC']
    vals = [effect_results.get(f'{m}_Cohens_d', np.nan) for m in metrics]
    interps = [effect_results.get(f'{m}_Cohens_d_interpretation', 'Unknown') for m in metrics]
    colors = [color_map.get(i, '#808080') for i in interps]

    bars1 = ax1.bar(metrics, vals, color=colors, alpha=0.85, width=0.6, edgecolor='black', linewidth=0.5)
    ax1.axhline(0, color='black', lw=0.8)


    for bar, val, interp in zip(bars1, vals, interps):
        if not np.isnan(val):
            y_pos = val + 0.1 if val >= 0 else val - 0.2
            ax1.text(bar.get_x() + bar.get_width()/2, y_pos,
                     f'{val:.2f}\n({interp})', ha='center', fontsize=7)

    ax1.set_ylabel("Cohen's d")

    ax1.set_title(f"Effect Sizes (Cohen's d)\nPositive favors {model_a}", fontsize=8, fontweight='bold')

    max_val = max([abs(v) for v in vals if not np.isnan(v)] + [1])
    ax1.set_ylim(-(max_val+0.5), max_val+0.5)
    ax1.grid(True, alpha=0.2, axis='y')


    vals = [effect_results.get(f'{m}_Cliffs_delta', np.nan) for m in metrics]
    interps = [effect_results.get(f'{m}_Cliffs_delta_interpretation', 'Unknown') for m in metrics]
    colors = [color_map.get(i, '#808080') for i in interps]

    bars2 = ax2.bar(metrics, vals, color=colors, alpha=0.85, width=0.6, edgecolor='black', linewidth=0.5)
    ax2.axhline(0, color='black', lw=0.8)

    for bar, val, interp in zip(bars2, vals, interps):
        if not np.isnan(val):
            y_pos = val + 0.05 if val >= 0 else val - 0.1
            ax2.text(bar.get_x() + bar.get_width()/2, y_pos,
                     f'{val:.2f}\n({interp})', ha='center', fontsize=7)

    ax2.set_ylabel("Cliff's Delta")

    ax2.set_title(f"Effect Sizes (Cliff's Delta)\nPositive favors {model_a}", fontsize=8, fontweight='bold')
    ax2.set_ylim(-1.2, 1.2)
    ax2.grid(True, alpha=0.2, axis='y')


    or_val = effect_results.get('McNemar_Odds_ratio', np.nan)
    interp = effect_results.get('McNemar_effect_interpretation', 'Unknown')

    if not np.isnan(or_val) and not np.isinf(or_val) and or_val > 0:
        log_or = np.log(or_val)
        color = color_map.get(interp, '#808080')

        ax3.bar(['McNemar'], [log_or], color=color, alpha=0.85, width=0.5, edgecolor='black', linewidth=0.5)
        ax3.axhline(0, color='black', lw=0.8, label='OR=1')


        for thresh in [np.log(1.25), np.log(2.0)]:
            ax3.axhline(thresh, color='gray', linestyle=':', alpha=0.3)
            ax3.axhline(-thresh, color='gray', linestyle=':', alpha=0.3)

        ax3.text(0, log_or + (0.1 if log_or>0 else -0.2),
                 f'OR={or_val:.2f}\n({interp})', ha='center', fontsize=7)


        ax3_twin = ax3.twinx()
        ticks = [0.5, 0.8, 1, 1.25, 2, 3, 5]
        ax3_twin.set_yticks(np.log(ticks))
        ax3_twin.set_yticklabels(ticks, fontsize=7)
        ax3_twin.set_ylabel("Odds Ratio")

    ax3.set_ylabel("Log(Odds Ratio)")

    ax3.set_title(f"Classification Difference\nOR > 1 favors {model_a}", fontsize=8, fontweight='bold')
    ax3.grid(True, alpha=0.2, axis='y')

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=CONFIG['plotting']['dpi'], bbox_inches='tight')
        plt.savefig(str(output_path).replace('.png', '.pdf'), bbox_inches='tight')
        plt.close()
        logger.info(f"Effect size plot saved to {output_path}")


def main() -> None:

    logger = setup_logging()

    try:

        output_dir = Path(CONFIG['paths']['output_dir'])
        output_dir.mkdir(parents=True, exist_ok=True)

        logger.info("Starting model comparison analysis")

        logger.info(f"Loading test data from {CONFIG['paths']['test_data']}")
        test_df = pd.read_csv(CONFIG['paths']['test_data'])

        if len(test_df) == 0:
            raise ValueError("Test dataset is empty")


        if 'MDR status' in test_df.columns:
            test_df = test_df.rename(columns={'MDR status': 'outcome'})
        elif 'outcome' not in test_df.columns:
            raise ValueError("Test data must contain 'outcome' or 'MDR status' column")

        X_test = test_df.drop(columns=['outcome'])
        y_test = test_df['outcome'].values

        logger.info(f"Test data loaded: {len(test_df)} samples, "
                   f"positive rate = {y_test.mean():.1%}")

        model_files = {
            'CatBoost': CONFIG['paths']['model_catboost'],
            'Logistic Regression': CONFIG['paths']['model_lr']
        }


        models_data = []
        all_results = []

        for model_name, model_path in model_files.items():
            logger.info(f"Evaluating {model_name}...")

            try:
                name, proba, pred, threshold = load_and_evaluate_model(
                    model_path, X_test, y_test
                )

                models_data.append((name, proba, pred))

                metrics = calculate_all_metrics(y_test, pred, proba)

                ci = bootstrap_metrics(
                    y_test, pred, proba,
                    n_bootstrap=CONFIG['bootstrap']['n_bootstraps']
                )


                model_results = {
                    'Model': name,
                    'Threshold': threshold,
                    **metrics,
                    **ci
                }

                all_results.append(model_results)

                logger.info(f"✓ {name}: AUC={metrics['AUROC']:.3f}, "
                           f"AP={metrics['AUPRC']:.3f}")

            except Exception as e:
                logger.error(f"Failed to evaluate {model_name}: {e}")
                continue

        if not models_data:
            raise RuntimeError("No models were successfully evaluated")


        results_df = pd.DataFrame(all_results)


        priority_cols = ['Model', 'Threshold', 'Test_Samples', 'Positive_Rate',
                        'AUROC', 'AUPRC', 'Accuracy', 'F1', 'Precision', 'Recall',
                        'Balanced_Accuracy', 'MCC', 'Kappa', 'Brier_Score',
                        'Log_Loss', 'ECE', 'MCE', 'Slope', 'Intercept',
                        'Spiegelhalter_Z', 'Spiegelhalter_p', 'HL_statistic',
                        'HL_p_value', 'Specificity', 'NPV', 'PPV', 'FPR', 'FNR',
                        'TN', 'FP', 'FN', 'TP']

        remaining_cols = [c for c in results_df.columns if c not in priority_cols]
        final_cols = priority_cols + remaining_cols

        results_df = results_df[final_cols]
        results_path = output_dir / "model_comparison_results.csv"
        results_df.to_csv(results_path, index=False)
        logger.info(f"Results saved to {results_path}")


        plot_path = output_dir / "model_comparison_plots.png"
        plot_with_ci(models_data, y_test, str(plot_path))

        if len(models_data) == 2:
            logger.info("Calculating effect sizes between models...")

            effect_results = compare_models_effect_size(
                models_data[0], models_data[1], y_test,
                n_bootstrap=CONFIG['bootstrap']['n_bootstraps']
            )

            effect_df = pd.DataFrame([effect_results])
            effect_path = output_dir / "effect_size_analysis.csv"
            effect_df.to_csv(effect_path, index=False)


            effect_plot_path = output_dir / "effect_sizes_plot.png"
            plot_effect_sizes(effect_results, str(effect_plot_path))

            logger.info("\n" + "="*60)
            logger.info("EFFECT SIZE SUMMARY")
            logger.info("="*60)

            logger.info(f"AUROC - Cohen's d: {effect_results.get('AUROC_Cohens_d', 'N/A'):.3f} "
                       f"({effect_results.get('AUROC_Cohens_d_interpretation', 'N/A')})")
            logger.info(f"AUPRC - Cohen's d: {effect_results.get('AUPRC_Cohens_d', 'N/A'):.3f} "
                       f"({effect_results.get('AUPRC_Cohens_d_interpretation', 'N/A')})")

            or_val = effect_results.get('McNemar_Odds_ratio', np.nan)
            if not np.isnan(or_val):
                logger.info(f"McNemar - Odds Ratio: {or_val:.3f} "
                           f"({effect_results.get('McNemar_effect_interpretation', 'N/A')})")

            if 'AUROC_observed_power' in effect_results:
                logger.info(f"\nStatistical Power:")
                logger.info(f"AUROC Power: {effect_results['AUROC_observed_power']:.1%} "
                           f"({effect_results['AUROC_power_interpretation']})")
                logger.info(f"AUPRC Power: {effect_results.get('AUPRC_observed_power', 'N/A')}")

                n_rec = effect_results.get('AUROC_recommended_n_80_power', 'N/A')
                logger.info(f"Recommended n for 80% power: {n_rec}")

        logger.info("\n" + "="*60)
        logger.info("ANALYSIS COMPLETE")
        logger.info("="*60)
        logger.info(f"Results:      {output_dir / 'model_comparison_results.csv'}")
        logger.info(f"Plots:        {output_dir / 'model_comparison_plots.png'}")

        if len(models_data) == 2:
            logger.info(f"Effect Sizes: {output_dir / 'effect_size_analysis.csv'}")
            logger.info(f"Effect Plot:  {output_dir / 'effect_sizes_plot.png'}")

        logger.info(f"Log file:     {output_dir / 'logs'} directory")

    except Exception as e:
        logger.critical(f"Fatal error in model comparison: {e}", exc_info=True)
        raise

    finally:
        logger.info("Analysis finished")
        logging.shutdown()

if __name__ == "__main__":
    main()

Bootstrapping metrics:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping metrics:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping roc:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping roc:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping pr:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping pr:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping cal:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping cal:   0%|          | 0/1000 [00:00<?, ?it/s]

Bootstrapping effect sizes:   0%|          | 0/1000 [00:00<?, ?it/s]

**12. Decision Curve Analysis**  
Perform decision curve analysis (DCA) to evaluate clinical benefit across threshold probabilities.

In [18]:
import os
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, AutoMinorLocator
from tqdm import tqdm
from sklearn.utils import resample


os.makedirs("model_comparison", exist_ok=True)
N_BOOT = 1000

CLIN_THRESH = np.linspace(0.05, 0.5, 100)
RNG = np.random.default_rng(42)
COLORS = {
    "CatBoost": "#0173B2",
    "Logistic Regression": "#DE8F05"
}
Y_OFFSETS = {"CatBoost": 0.02, "Logistic Regression": -0.015}
TEXTBOX = dict(boxstyle="round,pad=0.14", fc="white", ec="gray", alpha=0.81, linewidth=0.46)
YOUDEN_TEXTBOX = dict(boxstyle="round,pad=0.09", fc="lightgray", ec="gray", alpha=0.73, linewidth=0.3)


print("Loading test_set.csv...")

try:
    test_df = pd.read_csv("test_set.csv")
    if "MDR status" in test_df.columns:
        test_df = test_df.rename(columns={"MDR status": "outcome"})
    X_test = test_df.drop("outcome", axis=1)
    y_test = test_df["outcome"].values
    print(f"✅ Test data loaded: {len(test_df)} samples")
except FileNotFoundError:
    print("❌ Error: test_set.csv not found.")
    exit()

def get_onehot_feature_names_from_ct(column_transformer):
    names = []
    for nm, pipe, cols in column_transformer.transformers_:
        if nm == "remainder" and pipe == "drop": continue
        if nm == "cat" and hasattr(pipe, "named_steps") and "onehot" in pipe.named_steps:
            cats = pipe.named_steps["onehot"].categories_
            for col, cat_list in zip(cols, cats):
                names += [f"{col}_{cat}" for cat in cat_list]
        else:
            names += list(cols)
    return names

model_paths = {
    "CatBoost": "deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl",
    "Logistic Regression": "deployment_artifacts/LogisticRegression_with_thresholds_v1.0.0.pkl"
}

loaded_models = {}
for name, path in model_paths.items():
    if not os.path.exists(path):
        print(f"⚠️ Warning: {path} not found. Skipping {name}.")
        continue

    try:
        bundle = joblib.load(path)
        model = bundle["model"]
        pre = bundle.get("preprocessor")
        sel_feats = bundle.get("selected_features")

        thr = bundle.get("youden_j_threshold") or bundle.get("metadata", {}).get("threshold", 0.5)

        if pre is None or sel_feats is None:
            raise ValueError("Missing preprocessor or selected_features")
        loaded_models[name] = {"model": model, "preprocessor": pre, "selected_features": sel_feats, "youden_threshold": thr}
        print(f"✅ {name} model loaded")
    except Exception as e:
        print(f"❌ Error loading {name}: {e}")

if not loaded_models:
    print("❌ No models loaded. Exiting.")
    exit()

def get_predictions(model_info, X_raw):
    pre = model_info["preprocessor"]
    sel_feats = model_info["selected_features"]

    X_trans = pre.transform(X_raw)
    feat_names = get_onehot_feature_names_from_ct(pre)


    if hasattr(X_trans, "toarray"): X_trans = X_trans.toarray()
    X_df = pd.DataFrame(X_trans, columns=feat_names, index=X_raw.index)

    missing = [f for f in sel_feats if f not in X_df.columns]
    if missing: raise ValueError(f"Missing transformed columns: {missing[:3]}...")

    X_eval = X_df[sel_feats]
    mdl = model_info["model"]

    try:
        proba = mdl.predict_proba(X_eval)[:, 1]
    except AttributeError:
        scores = mdl.decision_function(X_eval)
        proba = 1 / (1 + np.exp(-scores))
    return proba

print("\nGenerating predictions...")
predictions, youden_thresholds = {}, {}
for name, info in loaded_models.items():
    try:
        predictions[name] = get_predictions(info, X_test)
        youden_thresholds[name] = info["youden_threshold"]
        print(f"✅ {name} predictions generated (Youden={youden_thresholds[name]:.3f})")
    except Exception as e:
        print(f"❌ Failed for {name}: {e}")



def compute_net_benefit(y_true, y_pred_prob, thresholds):
    n = len(y_true)
    nb = []
    for t in thresholds:
        tp = ((y_pred_prob >= t) & (y_true == 1)).sum()
        fp = ((y_pred_prob >= t) & (y_true == 0)).sum()

        weight = t / (1 - t) if t < 1.0 else 0

        val = (tp / n) - (fp / n) * weight
        nb.append(val)
    return np.array(nb)

def compute_clinical_utility_indexes(y_true, y_pred_prob, thresholds):
    results = {'NNT': [], 'FP_per_TP': []}
    for t in thresholds:
        pred_pos = y_pred_prob >= t
        tp = (pred_pos & (y_true == 1)).sum()
        fp = (pred_pos & (y_true == 0)).sum()

        if tp == 0:
            nnt, fp_per_tp = np.nan, np.nan
        else:

            nnt = np.sum(pred_pos) / tp
            fp_per_tp = fp / tp

        results['NNT'].append(nnt)
        results['FP_per_TP'].append(fp_per_tp)
    return {k: np.array(v) for k, v in results.items()}

def bootstrap_utility_curves(y_true, y_pred_prob, thresholds, n_boot=1000, rng=None):
    if rng is None: rng = np.random.default_rng(42)
    n = len(y_true)


    nnt_boot = np.full((n_boot, len(thresholds)), np.nan)
    fppt_boot = np.full((n_boot, len(thresholds)), np.nan)

    for i in range(n_boot):
        idx = rng.integers(0, n, n)
        yt, yp = y_true[idx], y_pred_prob[idx]


        if len(np.unique(yt)) < 2: continue

        for j, t in enumerate(thresholds):
            pred_pos = yp >= t
            tp = np.sum(pred_pos & (yt == 1))
            fp = np.sum(pred_pos & (yt == 0))

            if tp > 0:
                nnt_boot[i, j] = np.sum(pred_pos) / tp
                fppt_boot[i, j] = fp / tp

    return {
        'nnt_mean': np.nanmean(nnt_boot, axis=0),
        'nnt_low':  np.nanpercentile(nnt_boot, 2.5, axis=0),
        'nnt_high': np.nanpercentile(nnt_boot, 97.5, axis=0),
        'fppt_mean': np.nanmean(fppt_boot, axis=0),
        'fppt_low':  np.nanpercentile(fppt_boot, 2.5, axis=0),
        'fppt_high': np.nanpercentile(fppt_boot, 97.5, axis=0),
    }

results, utility_bands, clinical_utilities = {}, {}, {}

for name, proba in predictions.items():
    print(f"\nBootstrapping {name} Net Benefit and Utility Indices...")
    boot_mat = np.full((N_BOOT, len(CLIN_THRESH)), np.nan)
    skip = 0

    for i in tqdm(range(N_BOOT)):
        idx = RNG.choice(len(y_test), len(y_test), replace=True)
        ys, ps = y_test[idx], proba[idx]

        if len(np.unique(ys)) < 2:
            skip += 1
            continue

        boot_mat[i] = compute_net_benefit(ys, ps, CLIN_THRESH)

    if skip: print(f"⚠️ Skipped {skip}/{N_BOOT} bootstraps")


    valid_mask = ~np.isnan(boot_mat).all(axis=1)
    valid = boot_mat[valid_mask]

    mean_nb = np.nanmean(valid, axis=0)
    low_ci = np.nanpercentile(valid, 2.5, axis=0)
    high_ci = np.nanpercentile(valid, 97.5, axis=0)


    opt_idx = int(np.nanargmax(mean_nb))

    results[name] = {
        "thresholds": CLIN_THRESH,
        "net_benefit": mean_nb,
        "lower_ci": low_ci,
        "upper_ci": high_ci,
        "optimal_threshold": float(CLIN_THRESH[opt_idx]),
        "optimal_net_benefit": float(mean_nb[opt_idx]),
        "youden_threshold": float(youden_thresholds.get(name, np.nan)),
    }

    clinical_utilities[name] = compute_clinical_utility_indexes(y_test, proba, CLIN_THRESH)
    utility_bands[name] = bootstrap_utility_curves(y_test, proba, CLIN_THRESH, n_boot=N_BOOT, rng=RNG)


print("\nPlotting DCA and Clinical Utility Indices...")
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 7, 'axes.titlesize': 7, 'axes.labelsize': 7,
    'xtick.labelsize': 6, 'ytick.labelsize': 6, 'legend.fontsize': 6,
    'axes.linewidth': 0.7, 'lines.linewidth': 1.2, 'figure.dpi': 600
})

fig, axes = plt.subplots(1, 3, figsize=(10, 3.5), dpi=600)
ax_dca, ax_nnt, ax_fppt = axes


prev = float(y_test.mean())

treat_all = prev - (1 - prev) * (CLIN_THRESH / (1 - CLIN_THRESH))

ax_dca.axhline(0, linestyle="-", color="black", alpha=0.7, linewidth=0.8, label="Treat None")
ax_dca.plot(CLIN_THRESH, treat_all, "-.", color="tab:red", alpha=0.82, linewidth=1.0, label="Treat All")

for name, res in results.items():
    color = COLORS.get(name, "#333333")

    ax_dca.plot(res["thresholds"], res["net_benefit"], color=color, label=name, linewidth=1.2)
    ax_dca.fill_between(res["thresholds"], res["lower_ci"], res["upper_ci"], color=color, alpha=0.14)


    ot, nb_opt = res["optimal_threshold"], res["optimal_net_benefit"]

    if 0 < ot < 0.5:
        ax_dca.axvline(ot, linestyle=":", color=color, alpha=0.85, linewidth=0.8)
        ax_dca.text(ot + 0.004, nb_opt + 0.012 + Y_OFFSETS.get(name, 0),
                   f"Opt: {ot:.2f}", color=color, fontsize=6.2, bbox=TEXTBOX)

ax_dca.set_xlabel('Threshold probability', fontsize=7)
ax_dca.set_ylabel('Net benefit', fontsize=7)
ax_dca.set_xlim(CLIN_THRESH[0], CLIN_THRESH[-1])


y_max = max([max(res["upper_ci"]) for res in results.values()] + [max(treat_all)])
ax_dca.set_ylim(-0.05, y_max + 0.05)

ax_dca.set_title(f"Decision Curve Analysis\n(n={len(y_test):,}, prevalence={prev:.1%})", pad=9)
ax_dca.legend(loc="upper right", frameon=False, fontsize=6)
ax_dca.grid(True, alpha=0.13, lw=0.4, linestyle='--')
ax_dca.spines['top'].set_visible(False)
ax_dca.spines['right'].set_visible(False)


for name in results.keys():
    color = COLORS.get(name, "#333333")
    ax_nnt.plot(CLIN_THRESH, utility_bands[name]['nnt_mean'], color=color, label=name)
    ax_nnt.fill_between(CLIN_THRESH, utility_bands[name]['nnt_low'], utility_bands[name]['nnt_high'], color=color, alpha=0.13)

ax_nnt.set_xlabel("Threshold probability", fontsize=7)
ax_nnt.set_ylabel("Number Needed to Evaluate", fontsize=7)
ax_nnt.set_title("Number Needed to Evaluate\n(to find 1 positive)", pad=9, fontsize=7)
ax_nnt.grid(True, alpha=0.14, lw=0.4, linestyle='--')
ax_nnt.spines['top'].set_visible(False)
ax_nnt.spines['right'].set_visible(False)


for name in results.keys():
    color = COLORS.get(name, "#333333")
    ax_fppt.plot(CLIN_THRESH, utility_bands[name]['fppt_mean'], color=color, label=name)
    ax_fppt.fill_between(CLIN_THRESH, utility_bands[name]['fppt_low'], utility_bands[name]['fppt_high'], color=color, alpha=0.13)

ax_fppt.set_xlabel("Threshold probability", fontsize=7)
ax_fppt.set_ylabel("FP per TP", fontsize=7)
ax_fppt.set_title("False Positives per True Positive", pad=9, fontsize=7)
ax_fppt.grid(True, alpha=0.14, lw=0.4, linestyle='--')
ax_fppt.spines['top'].set_visible(False)
ax_fppt.spines['right'].set_visible(False)

plt.tight_layout(pad=0.5)
plt.savefig("model_comparison/dca_comprehensive_plot.png", dpi=600, bbox_inches="tight", facecolor='white')
plt.close()


dca_results_df = pd.DataFrame({'threshold': CLIN_THRESH})
for name in results:
    dca_results_df[f'{name}_net_benefit'] = results[name]['net_benefit']
    dca_results_df[f'{name}_lower_ci'] = results[name]['lower_ci']
    dca_results_df[f'{name}_upper_ci'] = results[name]['upper_ci']
    dca_results_df[f'{name}_NNT_mean'] = utility_bands[name]['nnt_mean']
    dca_results_df[f'{name}_FPPT_mean'] = utility_bands[name]['fppt_mean']

dca_results_df.to_csv('model_comparison/dca_results_with_utility.csv', index=False)
print("✅ Done.")

Loading test_set.csv...
✅ Test data loaded: 383 samples
✅ CatBoost model loaded
✅ Logistic Regression model loaded

Generating predictions...
✅ CatBoost predictions generated (Youden=0.510)
✅ Logistic Regression predictions generated (Youden=0.500)

Bootstrapping CatBoost Net Benefit and Utility Indices...


100%|██████████| 1000/1000 [00:01<00:00, 709.67it/s]



Bootstrapping Logistic Regression Net Benefit and Utility Indices...


100%|██████████| 1000/1000 [00:01<00:00, 700.23it/s]



Plotting DCA and Clinical Utility Indices...
✅ Done.


**13. Model Interpretability: SHAP Analysis**  
Apply SHAP values to explain global and local feature contributions to model predictions.

In [19]:
import json
import logging
import os
import pickle
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.calibration import CalibratedClassifierCV
from tqdm.auto import tqdm
from scipy.stats import norm

warnings.filterwarnings('ignore')

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


if "DISPLAY" not in os.environ:
    matplotlib.use("Agg")

@dataclass
class SHAPConfig:
    model_bundle_path: str
    test_data_path: str
    output_dir: str = "shap_results"
    n_bootstrap: int = 200
    sample_size: Optional[int] = None
    top_features: int = 10
    n_force_plots: int = 5
    n_dependence_plots: int = 5
    random_state: int = 42
    plot_dpi: int = 600
    plot_style: str = "seaborn-v0_8-whitegrid"
    cache_shap: bool = True
    cache_dir: str = "shap_cache"
    plot_color: str = "#0173B2"
    error_color: str = "#DE8F05"

    def __post_init__(self):
        if not Path(self.model_bundle_path).exists():
            raise FileNotFoundError(f"Model bundle not found: {self.model_bundle_path}")
        if not Path(self.test_data_path).exists():
            raise FileNotFoundError(f"Test data not found: {self.test_data_path}")

        output_path = Path(self.output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        (output_path / "force_plots").mkdir(exist_ok=True)
        (output_path / "tables").mkdir(exist_ok=True)
        (output_path / "bootstrap").mkdir(exist_ok=True)

        if self.cache_shap:
            Path(self.cache_dir).mkdir(exist_ok=True)

class SHAPAnalyzer:

    def __init__(self, config: SHAPConfig):
        self.config = config
        self.rng = np.random.default_rng(config.random_state)
        self.results = {}


        try:
            plt.style.use(config.plot_style)
        except OSError:
            plt.style.use("seaborn-whitegrid")

        plt.rcParams.update({
            "font.family": "DejaVu Sans",
            "font.size": 14,
            "axes.titlesize": 18,
            "axes.labelsize": 16,
            "xtick.labelsize": 12,
            "ytick.labelsize": 12,
            "figure.dpi": 600,
            "savefig.bbox": "tight"
        })


    def _get_short_names_map(self) -> Dict[str, str]:
        return {
            "Institution Type_Hospital": "Setting: Hospital",
            "Institution Type_Lab": "Setting: Lab",
            "Gender_M": "Male",
            "Gender_F": "Female",
            "Bacteria type_E. coli": "E. coli",
            "Bacteria type_Klebsiella Spp": "Klebsiella",
            "Bacteria type_Pseudomonas Spp": "Pseudomonas",
            "Healthcare Sector_Governmental": "Sector: Gov",
            "Healthcare Sector_Private": "Sector: Private",
        }

    def load_model_bundle(self) -> Dict:
        bundle_path = Path(self.config.model_bundle_path)
        try:
            bundle = joblib.load(bundle_path)
        except Exception as e:
            raise RuntimeError(f"Failed to load model bundle: {e}") from e
        return bundle

    def extract_base_model(self, model):
        if isinstance(model, CalibratedClassifierCV):
            if hasattr(model, 'calibrated_classifiers_'):
                return model.calibrated_classifiers_[0].estimator
            return model.base_estimator
        return model

    def get_feature_names(self, preprocessor) -> List[str]:
        try:
            return list(preprocessor.get_feature_names_out())
        except AttributeError:
            return self._get_feature_names_fallback(preprocessor)

    def _get_feature_names_fallback(self, column_transformer) -> List[str]:
        names = []
        for name, transformer, columns in column_transformer.transformers_:
            if name == "remainder" and transformer == "drop": continue
            if (name == "cat" and hasattr(transformer, "named_steps") and
                "onehot" in transformer.named_steps):
                encoder = transformer.named_steps["onehot"]
                for col, categories in zip(columns, encoder.categories_):
                    names.extend([f"{col}_{cat}" for cat in categories])
            else:
                names.extend(list(columns))
        return names

    def clean_feature_names(self, feature_names: List[str]) -> List[str]:
        cleaned = []
        for name in feature_names:
            if "__" in name: name = name.split("__")[-1]
            cleaned.append(name)
        return cleaned

    def load_and_preprocess_data(self, bundle: Dict) -> Tuple[pd.DataFrame, pd.Series]:
        logger.info("Loading test data...")
        test_df = pd.read_csv(self.config.test_data_path)
        target_col = "MDR status" if "MDR status" in test_df.columns else "outcome"

        X_raw = test_df.drop(columns=[target_col])
        y = test_df[target_col]

        preprocessor = bundle['preprocessor']
        X_trans = preprocessor.transform(X_raw)
        if hasattr(X_trans, 'toarray'): X_trans = X_trans.toarray()

        raw_feature_names = self.get_feature_names(preprocessor)
        cleaned_names = self.clean_feature_names(raw_feature_names)
        feature_names = cleaned_names if X_trans.shape[1] == len(cleaned_names) else raw_feature_names

        X_full = pd.DataFrame(X_trans, columns=feature_names, index=X_raw.index)
        selected_features = bundle['selected_features']
        X_selected = X_full[selected_features]
        return X_selected, y

    def compute_shap_values(self, bundle, X: pd.DataFrame, use_cache: bool = True) -> Tuple[np.ndarray, float, float]:
        full_model = bundle['model']
        if use_cache and self.config.cache_shap:
            cache_file = self._get_cache_path(full_model, X)
            if cache_file.exists():
                logger.info(f"Loading cached SHAP values from {cache_file}")
                with open(cache_file, 'rb') as f:
                    return pickle.load(f)

        base_model = self.extract_base_model(full_model)
        try:
            explainer = shap.TreeExplainer(base_model)
        except Exception as e:
            logger.warning(f"TreeExplainer failed, falling back to KernelExplainer: {e}")
            explainer = shap.KernelExplainer(full_model.predict_proba, X.iloc[:100])

        shap_values = explainer.shap_values(X)
        base_expected_value = explainer.expected_value

        if isinstance(shap_values, list):
            if len(shap_values) == 2:
                shap_values = shap_values[1]
                if isinstance(base_expected_value, (list, np.ndarray)):
                    base_expected_value = base_expected_value[1]
            else:
                shap_values = shap_values if len(shap_values) > 0 else shap_values

        shap_values = np.asarray(shap_values, dtype=np.float32)
        base_expected_value = float(base_expected_value)

        calibrated_expected_value = np.nan
        if hasattr(full_model, "predict_proba"):
            probs = full_model.predict_proba(X)[:, 1]
            calibrated_expected_value = float(np.mean(probs))

        if use_cache and self.config.cache_shap:
            with open(self._get_cache_path(full_model, X), 'wb') as f:
                pickle.dump((shap_values, base_expected_value, calibrated_expected_value), f)

        return shap_values, base_expected_value, calibrated_expected_value

    def _get_cache_path(self, model, X: pd.DataFrame) -> Path:
        import hashlib
        from pandas.util import hash_pandas_object
        model_hash = hashlib.md5(str(type(model)).encode()).hexdigest()[:8]
        data_hash = str(hash_pandas_object(X).sum())[:8]
        return Path(self.config.cache_dir) / f"shap_{model_hash}_{data_hash}.pkl"

    def bootstrap_shap_importance(self, shap_values: np.ndarray, sample_size: Optional[int] = None) -> Dict:
        n_samples, n_features = shap_values.shape
        if sample_size is None: sample_size = min(n_samples, self.config.sample_size or n_samples)

        logger.info(f"Bootstrapping SHAP importance (n={self.config.n_bootstrap})")
        feat_imp_boot = np.zeros((self.config.n_bootstrap, n_features), dtype=np.float32)
        shap_mean_boot = np.zeros((self.config.n_bootstrap, n_features), dtype=np.float32)
        all_indices = self.rng.integers(0, n_samples, size=(self.config.n_bootstrap, sample_size))

        for i in tqdm(range(self.config.n_bootstrap), desc="Bootstrapping"):
            indices = all_indices[i]
            sample_shap = shap_values[indices]
            feat_imp_boot[i] = np.mean(np.abs(sample_shap), axis=0)
            shap_mean_boot[i] = np.mean(sample_shap, axis=0)

        return {
            'feature_importance_mean': np.mean(feat_imp_boot, axis=0),
            'feature_importance_ci_lower': np.percentile(feat_imp_boot, 2.5, axis=0),
            'feature_importance_ci_upper': np.percentile(feat_imp_boot, 97.5, axis=0),
            'shap_mean_mean': np.mean(shap_mean_boot, axis=0),
            'n_bootstrap': self.config.n_bootstrap
        }

    def plot_bootstrap_importance(self, bootstrap_results: Dict, feature_names: List[str], top_n: Optional[int] = None):
        if top_n is None: top_n = self.config.top_features
        mean_importance = bootstrap_results['feature_importance_mean']
        sorted_idx = np.argsort(mean_importance)[::-1][:top_n]

        plot_names = [feature_names[i] for i in sorted_idx]
        plot_means = mean_importance[sorted_idx]
        plot_ci_lower = bootstrap_results['feature_importance_ci_lower'][sorted_idx]
        plot_ci_upper = bootstrap_results['feature_importance_ci_upper'][sorted_idx]
        plot_signed = bootstrap_results['shap_mean_mean'][sorted_idx]

        fig, ax = plt.subplots(figsize=(12, max(6, top_n * 0.6)), dpi=self.config.plot_dpi)
        y_pos = np.arange(len(plot_names))
        xerr = [plot_means - plot_ci_lower, plot_ci_upper - plot_means]

        bars = ax.barh(y_pos, plot_means, xerr=xerr, color=self.config.plot_color,
                      alpha=0.8, ecolor=self.config.error_color, capsize=6, error_kw={'linewidth': 2})

        max_val = np.max(plot_ci_upper)
        for i, (bar, mean_val) in enumerate(zip(bars, plot_means)):
            sign_char = "+" if plot_signed[i] > 0 else "-"
            label_text = f'{mean_val:.4f} {sign_char}'
            ax.text(plot_ci_upper[i] + (max_val * 0.02), bar.get_y() + bar.get_height()/2,
                    label_text, va='center', ha='left', fontsize=12,
                    bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.7, edgecolor='none'))

        ax.set_yticks(y_pos)
        ax.set_yticklabels(plot_names)
        ax.set_xlabel('Mean Absolute SHAP Value (95% CI)')
        ax.set_title(f'Top {top_n} Feature Importance (Bootstrap N={bootstrap_results["n_bootstrap"]})', pad=20)
        ax.set_xlim(0, max_val * 1.25)
        ax.invert_yaxis()
        ax.grid(True, alpha=0.3, axis='x')

        plt.savefig(Path(self.config.output_dir) / "bootstrap_feature_importance.png", bbox_inches='tight')
        plt.close()

    def save_force_plots(self, shap_values: np.ndarray, expected_value: float, X: pd.DataFrame, y: pd.Series):
        output_dir = Path(self.config.output_dir) / "force_plots"
        output_dir.mkdir(parents=True, exist_ok=True)
        n_samples = min(self.config.n_force_plots, X.shape[0])

        logger.info(f"Generating {n_samples} High-Resolution Force Plots...")

        for idx in range(n_samples):
            try:

                plt.figure(figsize=(24, 6), dpi=600)

                shap.force_plot(
                    expected_value,
                    shap_values[idx],
                    X.iloc[idx],
                    matplotlib=True,
                    show=False,
                    text_rotation=30,
                    contribution_threshold=0.02
                )


                status = "MDR" if int(y.iloc[idx]) == 1 else "Non-MDR"
                plt.title(f"Force Plot (Sample {idx}) - True Label: {status}", fontsize=20, pad=40, weight='bold')

                plt.tight_layout()

                save_path = output_dir / f"sample_{idx}_force_plot.png"
                plt.savefig(save_path, dpi=600, bbox_inches="tight")
                plt.close()

            except Exception as e:
                logger.warning(f"Failed to generate static force plot for sample {idx}: {e}")
                plt.close()


            try:
                plot_html = shap.force_plot(
                    expected_value, shap_values[idx], X.iloc[idx],
                    matplotlib=False, link="logit"
                )
                shap.save_html(str(output_dir / f"sample_{idx}_prob.html"), plot_html)
            except Exception: pass

        logger.info(f"✅ Saved High-Res force plots to {output_dir}")

    def plot_shap_summary(self, shap_values: np.ndarray, X: pd.DataFrame):
        output_dir = Path(self.config.output_dir)


        plt.figure(figsize=(14, 10))
        shap.summary_plot(shap_values, X, show=False, max_display=15, plot_size=(14, 10))
        plt.title('SHAP Feature Importance Summary', fontsize=20, pad=20)
        plt.tight_layout()
        plt.savefig(output_dir / "shap_summary_beeswarm.png", dpi=600, bbox_inches='tight')
        plt.close()


        plt.figure(figsize=(14, 10))
        shap.summary_plot(shap_values, X, plot_type='bar', show=False, max_display=15, plot_size=(14, 10))
        plt.title('Mean Absolute SHAP Values', fontsize=20, pad=20)
        plt.tight_layout()
        plt.savefig(output_dir / "shap_summary_bar.png", dpi=600, bbox_inches='tight')
        plt.close()

    def run_analysis(self):
        logger.info("Starting SHAP analysis pipeline")
        try:
            bundle = self.load_model_bundle()
            X, y = self.load_and_preprocess_data(bundle)

            shap_values, base_ev, calib_ev = self.compute_shap_values(bundle, X, use_cache=self.config.cache_shap)

            bootstrap_results = self.bootstrap_shap_importance(shap_values)
            mapping = self._get_short_names_map()
            X_display = X.rename(columns=mapping)
            display_names = list(X_display.columns)

            pd.DataFrame(shap_values, columns=X.columns, index=X.index).to_csv(
                Path(self.config.output_dir) / "tables" / "shap_values_raw.csv")

            self.plot_bootstrap_importance(bootstrap_results, display_names)
            self.plot_shap_summary(shap_values, X_display)
            self.save_force_plots(shap_values, base_ev, X_display, y)

            logger.info(f"Analysis complete! Results saved to {self.config.output_dir}")

        except Exception as e:
            logger.error(f"Analysis failed: {e}", exc_info=True)
            raise

def main():
    config = SHAPConfig(
        model_bundle_path="deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl",
        test_data_path="test_set.csv",
        output_dir="shap_analysis_results",
        n_bootstrap=200,
        top_features=10,
        n_force_plots=5,
        cache_shap=True,
        plot_style="seaborn-v0_8-whitegrid"
    )

    analyzer = SHAPAnalyzer(config)
    analyzer.run_analysis()

if __name__ == "__main__":
    main()

Bootstrapping:   0%|          | 0/200 [00:00<?, ?it/s]

**14. Model Interpretability: LIME Analysis**  
Use LIME to provide local explanations of individual predictions and validate model behavior.

In [20]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
import lime
import lime.lime_tabular

MODEL_BUNDLE_PATH = "deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl"
TEST_DATA_PATH = "test_set.csv"
OUTPUT_DIR = "lime_explanations"
os.makedirs(OUTPUT_DIR, exist_ok=True)


plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "font.weight": "normal",
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "figure.dpi": 600,
    "axes.linewidth": 1.0,
})


print("Loading data...")
df = pd.read_csv(TEST_DATA_PATH)
if "MDR status" not in df.columns:
    raise ValueError("'MDR status' column missing in test data.")
X_raw = df.drop(columns=["MDR status"])
y_test = df["MDR status"].astype(int)

print("Loading model bundle...")
bundle = joblib.load(MODEL_BUNDLE_PATH)
model = bundle["model"]
preprocessor = bundle.get("preprocessor")
selected_features = bundle.get("selected_features")

def get_onehot_feature_names_from_ct(ct):
    names = []
    for nm, pipe, cols in ct.transformers_:
        if nm == "remainder" and pipe == "drop": continue
        if nm == "cat" and hasattr(pipe, "named_steps") and "onehot" in pipe.named_steps:
            cats = pipe.named_steps["onehot"].categories_
            for col, cat_list in zip(cols, cats):
                names += [f"{col}_{cat}" for cat in cat_list]
        else: names += list(cols)
    return names

X_trans = preprocessor.transform(X_raw)
all_feat_names = get_onehot_feature_names_from_ct(preprocessor)
X_full = pd.DataFrame(X_trans, columns=all_feat_names, index=X_raw.index)
X_use = X_full[selected_features].copy().astype(np.float64)
feature_names = list(X_use.columns)

rename_map = {
    "Institution Type_Hospital": "Setting: Hospital",
    "Institution Type_Lab": "Setting: Lab",
    "Gender_M": "Male", "Gender_F": "Female",
    "Bacteria type_E. coli": "Pathogen: E. coli",
    "Bacteria type_Klebsiella Spp": "Pathogen: Klebsiella",
    "Bacteria type_Pseudomonas Spp": "Pathogen: Pseudomonas",
    "Healthcare Sector_Governmental": "Sector: Gov",
    "Healthcare Sector_Private": "Sector: Private",
}

display_names = []
for n in feature_names:
    clean_n = str(n)
    if clean_n in rename_map: clean_n = rename_map[clean_n]
    display_names.append(clean_n)

_seen, dedup_names = {}, []
for n in display_names:
    if n in _seen:
        _seen[n] += 1
        dedup_names.append(f"{n}_{_seen[n]}")
    else:
        _seen[n] = 0
        dedup_names.append(n)
display_names = dedup_names

def predict_fn(X_array):
    X_arr = np.asarray(X_array, dtype=np.float64)
    if X_arr.ndim == 1: X_arr = X_arr.reshape(1, -1)
    try: probs = model.predict_proba(X_arr)
    except:
        X_df = pd.DataFrame(X_arr, columns=feature_names)
        probs = model.predict_proba(X_df)

    probs = np.asarray(probs, dtype=np.float64)
    if probs.ndim == 1: probs = probs.reshape(-1, 1)
    if probs.shape[1] == 1:
        p1 = probs[:, 0]
        probs = np.column_stack([1.0 - p1, p1])
    return probs

explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_use.to_numpy(dtype=np.float64),
    feature_names=display_names,
    class_names=['Non-MDR', 'MDR'],
    mode='classification',
    discretize_continuous=True,
    random_state=42
)

np.random.seed(42)
sample_indices = []
for cls in [0, 1]:
    idxs = np.where(y_test.values == cls)[0]
    k = min(3, len(idxs))
    if k > 0: sample_indices.extend(np.random.choice(idxs, size=k, replace=False))

sample_indices = list(dict.fromkeys(sample_indices))

for idx in sample_indices:
    try:
        x_row = X_use.iloc[idx].to_numpy(dtype=np.float64)
        exp = explainer.explain_instance(
            data_row=x_row,
            predict_fn=predict_fn,
            num_features=min(10, X_use.shape[1]),
            top_labels=1
        )
        top_label = int(np.asarray(exp.top_labels))


        fig = exp.as_pyplot_figure(label=top_label)

        fig.set_size_inches(8, 5)

        plt.title(f"Sample {idx} (True: {'MDR' if int(y_test.iloc[idx]) == 1 else 'Non-MDR'})",
                  fontsize=14, pad=15, weight='bold')

        plt.tight_layout()

        png_path = os.path.join(OUTPUT_DIR, f"sample_{idx}.png")
        plt.savefig(png_path, dpi=600, bbox_inches='tight')
        plt.close(fig)

        print(f"✅ Saved Clear LIME plot: {png_path}")

    except Exception as e:
        print(f"❌ Failed to explain sample {idx}: {str(e)}")

print("\n🎉 LIME analysis completed!")

Loading data...
Loading model bundle...
✅ Saved Clear LIME plot: lime_explanations/sample_46.png
✅ Saved Clear LIME plot: lime_explanations/sample_103.png
✅ Saved Clear LIME plot: lime_explanations/sample_322.png
✅ Saved Clear LIME plot: lime_explanations/sample_252.png
✅ Saved Clear LIME plot: lime_explanations/sample_28.png
✅ Saved Clear LIME plot: lime_explanations/sample_9.png

🎉 LIME analysis completed!


**15. SHAP Analysis on LIME-selected Samples**  
Perform SHAP analysis specifically on cases identified by LIME for deeper interpretability.

In [21]:
import os
import joblib
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV

MODEL_BUNDLE_PATH = "deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl"
TEST_DATA_PATH = "test_set.csv"

RENAME_MAP = {
    "Institution Type_Hospital": "Setting: Hospital",
    "Institution Type_Lab": "Setting: Lab",
    "Gender_M": "Male",
    "Gender_F": "Female",
    "Bacteria type_E. coli": "Pathogen: E. coli",
    "Bacteria type_Klebsiella Spp": "Pathogen: Klebsiella",
    "Bacteria type_Pseudomonas Spp": "Pathogen: Pseudomonas",
    "Healthcare Sector_Governmental": "Sector: Gov",
    "Healthcare Sector_Private": "Sector: Private",
}

def get_onehot_feature_names_from_ct(ct):
    names = []
    for nm, pipe, cols in ct.transformers_:
        if nm == "remainder" and pipe == "drop":
            continue
        if nm == "cat" and hasattr(pipe, "named_steps") and "onehot" in pipe.named_steps:
            cats = pipe.named_steps["onehot"].categories_
            for col, cat_list in zip(cols, cats):
                names += [f"{col}_{cat}" for cat in cat_list]
        else:
            names += list(cols)
    return names

def extract_base_model(model):
    if isinstance(model, CalibratedClassifierCV):
        return model.calibrated_classifiers_[0].estimator
    return model

def compute_shap_values(model, X_df):
    base_model = extract_base_model(model)
    explainer = shap.TreeExplainer(base_model)
    shap_values = explainer.shap_values(X_df)
    expected_value = explainer.expected_value

    if isinstance(shap_values, list):
        if len(shap_values) == 2:
            shap_values = shap_values[1]
            if isinstance(expected_value, (list, tuple, np.ndarray)):
                expected_value = expected_value[1]
        else:
            shap_values = shap_values if len(shap_values) > 0 else shap_values

    shap_values = np.asarray(shap_values)
    expected_value = float(np.asarray(expected_value))
    return shap_values, expected_value

def generate_shap_for_lime_samples(
    bundle_path=MODEL_BUNDLE_PATH,
    test_path=TEST_DATA_PATH,
    lime_sample_indices=None,
    output_dir="explanations"
):

    if lime_sample_indices is None:
        lime_sample_indices = []

    df = pd.read_csv(test_path)
    if "MDR status" not in df.columns:
        raise ValueError("'MDR status' column missing from test_set.csv.")
    X_raw = df.drop(columns=["MDR status"])

    bundle = joblib.load(bundle_path)
    model = bundle["model"]
    pre = bundle.get("preprocessor")
    sel_feats = bundle.get("selected_features")

    if pre is None or sel_feats is None:
        raise KeyError("Bundle missing 'preprocessor' or 'selected_features'")

    X_trans = pre.transform(X_raw)
    all_names = get_onehot_feature_names_from_ct(pre)
    X_full = pd.DataFrame(X_trans, columns=all_names, index=X_raw.index)

    missing = [f for f in sel_feats if f not in X_full.columns]
    if missing:
        raise ValueError(f"Missing transformed columns: {missing}")

    X = X_full[sel_feats].copy()
    short_feature_names = [RENAME_MAP.get(col, col) for col in X.columns]
    shap_values, expected_value = compute_shap_values(model, X)

    os.makedirs(output_dir, exist_ok=True)


    plt.rcParams.update({
        "font.family": "DejaVu Sans",
        "font.weight": "normal",
        "font.size": 12,
        "axes.titlesize": 14,
        "axes.labelsize": 12,
        "xtick.labelsize": 11,
        "ytick.labelsize": 11,
        "figure.dpi": 600,
        "axes.linewidth": 1.0,
    })

    for i, sample_idx in enumerate(lime_sample_indices):
        try:
            row_X = X.loc[sample_idx] if sample_idx in X.index else X.iloc[sample_idx]
            pos = X.index.get_loc(sample_idx) if sample_idx in X.index else sample_idx

            plt.figure(figsize=(8, 5))

            shap.plots.waterfall(
                shap.Explanation(
                    values=shap_values[pos],
                    base_values=expected_value,
                    data=row_X,
                    feature_names=short_feature_names
                ),
                show=False,
                max_display=10
            )

            plt.title(f"SHAP for sample {sample_idx}", fontsize=16, pad=20, weight='bold')

            plt.tight_layout()

            png_path = os.path.join(output_dir, f"shap_sample_{sample_idx}.png")


            plt.savefig(png_path, dpi=600, bbox_inches='tight')
            plt.close()
            print(f"✅ Saved High-Res waterfall plot: {png_path}")


            force_plot = shap.plots.force(
                expected_value,
                shap_values[pos],
                row_X,
                feature_names=short_feature_names,
                matplotlib=False
            )
            html_path = os.path.join(output_dir, f"shap_force_{sample_idx}.html")
            shap.save_html(html_path, force_plot)
            print(f"✅ Saved force plot: {html_path}")

        except Exception as e:
            print(f"❌ Error processing sample {sample_idx}: {e}")

    print("\n🎉 SHAP analysis complete for all LIME samples")

if __name__ == "__main__":
    lime_sample_indices = [9, 103, 46, 28, 252, 322]
    generate_shap_for_lime_samples(
        bundle_path=MODEL_BUNDLE_PATH,
        test_path=TEST_DATA_PATH,
        lime_sample_indices=lime_sample_indices,
        output_dir="explanations"
    )

✅ Saved High-Res waterfall plot: explanations/shap_sample_9.png
✅ Saved force plot: explanations/shap_force_9.html
✅ Saved High-Res waterfall plot: explanations/shap_sample_103.png
✅ Saved force plot: explanations/shap_force_103.html
✅ Saved High-Res waterfall plot: explanations/shap_sample_46.png
✅ Saved force plot: explanations/shap_force_46.html
✅ Saved High-Res waterfall plot: explanations/shap_sample_28.png
✅ Saved force plot: explanations/shap_force_28.html
✅ Saved High-Res waterfall plot: explanations/shap_sample_252.png
✅ Saved force plot: explanations/shap_force_252.html
✅ Saved High-Res waterfall plot: explanations/shap_sample_322.png
✅ Saved force plot: explanations/shap_force_322.html

🎉 SHAP analysis complete for all LIME samples


**16. Subgroup / Fairness Performance Analysis**  
Evaluate model performance across patient subgroups to assess fairness and potential biases.

In [23]:
import pandas as pd
import numpy as np
import joblib
import os
import logging
import warnings
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score,
    accuracy_score, average_precision_score
)
from sklearn.utils import resample

warnings.filterwarnings('ignore')

log_filename = f'subgroup_analysis_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)


plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 14,
    'axes.titlesize': 20,
    'axes.labelsize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
    'figure.dpi': 600,
    'lines.linewidth': 2,
    'lines.markersize': 8,
    'savefig.bbox': 'tight'
})
# ----------------------------------------

class HybridFeatureSelector:
    def __init__(self, selected_features=None):
        self.selected_features_ = selected_features if selected_features is not None else []
    def fit(self, X, y=None): return self
    def transform(self, X): return X[self.selected_features_]
    def fit_transform(self, X, y=None): return self.fit(X, y).transform(X)

def get_feature_names_from_column_transformer(ct):
    names = []
    for nm, pipe, cols in ct.transformers_:
        if nm == 'remainder' and pipe == 'drop': continue
        if nm == 'cat' and hasattr(pipe, 'named_steps') and 'onehot' in pipe.named_steps:
            cats = pipe.named_steps['onehot'].categories_
            for c, cl in zip(cols, cats): names += [f"{c}_{v}" for v in cl]
        else: names += list(cols)
    return names

def load_group_specific_thresholds(filepath):
    logging.info(f"📋 Loading group-specific thresholds from {filepath}...")
    try:
        df = pd.read_csv(filepath)
        group_thresholds = {
            row['group'].strip(): row['youden_j']
            for _, row in df.iterrows()
        }
        logging.info(f"✅ Found Youden thresholds for {len(group_thresholds)} groups.")
        return group_thresholds
    except Exception as e:
        logging.warning(f"⚠️ Could not load thresholds: {e}")
        return {}

def load_and_prepare_data():
    logging.info("📂 Loading test data and model...")
    test_set = pd.read_csv("test_set.csv")
    X_test = test_set.drop(columns=["MDR status"])
    y_true = test_set["MDR status"]
    bundle = joblib.load("deployment_artifacts/CatBoost_with_thresholds_v1.0.0.pkl")
    return test_set, X_test, y_true, bundle

def preprocess_and_predict(X_test, bundle):
    logging.info("🔧 Preprocessing data...")
    model = bundle['model']
    preprocessor = bundle.get('preprocessor')
    selected_features = bundle.get('selected_features')

    X_processed = preprocessor.transform(X_test)
    all_features = get_feature_names_from_column_transformer(preprocessor)
    X_processed_df = pd.DataFrame(X_processed, columns=all_features)
    X_final = X_processed_df[selected_features] if selected_features else X_processed_df
    y_proba = model.predict_proba(X_final)[:, 1]
    return y_proba, X_processed_df

def define_subgroups_from_preprocessed_data(test_data_with_probs, preprocessed_df):
    logging.info("🔍 Defining subgroups...")
    combined_df = pd.concat([test_data_with_probs.reset_index(drop=True), preprocessed_df.reset_index(drop=True)], axis=1)
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated(keep='last')]

    subgroups = {'All': combined_df}

    if 'Age' in combined_df.columns:
        age_median = combined_df['Age'].median()
        subgroups['Age < Median'] = combined_df[combined_df['Age'] < age_median]
        subgroups['Age >= Median'] = combined_df[combined_df['Age'] >= age_median]

    categorical_prefixes = ['Gender', 'Institution Type', 'Healthcare Sector', 'Bacteria type']
    non_redundant_cols = ["_F", "_Lab", "_Private"]

    for prefix in categorical_prefixes:
        cols = [col for col in combined_df.columns if col.startswith(prefix) and combined_df[col].isin([0, 1]).all()]
        for col in cols:
            if prefix == 'Bacteria type' or any(sub in col for sub in non_redundant_cols):
                if "Gender_F" in col: name_1, name_0 = "Female", "Male"
                elif "Institution Type_Lab" in col: name_1, name_0 = "Setting: Lab", "Setting: Hospital"
                elif "Healthcare Sector_Private" in col: name_1, name_0 = "Sector: Private", "Sector: Gov"
                elif "Bacteria type" in col:
                    clean_bac = col.replace("Bacteria type_", "").replace(" Spp", "")
                    name_1, name_0 = f"Pathogen: {clean_bac}", f"Non-{clean_bac}"
                else: name_1, name_0 = f"{col} = 1", f"{col} = 0"

                subgroups[name_1] = combined_df[combined_df[col] == 1]
                subgroups[name_0] = combined_df[combined_df[col] == 0]

    return {name: group for name, group in subgroups.items() if len(group) > 0}

def bootstrap_ci(y_true, y_scores, metric_fn, n_boot=1000, alpha=0.05, seed=42):
    np.random.seed(seed)
    scores = []
    for _ in range(n_boot):
        idx = resample(np.arange(len(y_true)), replace=True)
        if len(np.unique(y_true[idx])) < 2 and metric_fn.__name__ in ['roc_auc_score', 'average_precision_score']: continue
        try: scores.append(metric_fn(y_true[idx], y_scores[idx]))
        except: continue
    if not scores: return np.nan, np.nan, np.nan
    return np.mean(scores), np.percentile(scores, 100 * alpha / 2), np.percentile(scores, 100 * (1 - alpha / 2))

def compute_metrics(y_true, y_pred, y_prob, n_boot=1000):
    if len(y_true) == 0: return {}
    results = {}
    y_true, y_pred, y_prob = np.array(y_true), np.array(y_pred), np.array(y_prob)
    metric_defs = {"AUROC": (roc_auc_score, y_prob), "PR_AUC": (average_precision_score, y_prob),
                   "Accuracy": (accuracy_score, y_pred), "F1": (lambda yt, yp: f1_score(yt, yp, zero_division=0), y_pred),
                   "Recall": (lambda yt, yp: recall_score(yt, yp, zero_division=0), y_pred)}
    for name, (fn, scores) in metric_defs.items():
        mean, low, high = bootstrap_ci(y_true, scores, fn, n_boot=n_boot)
        results[name], results[f"{name}_lowCI"], results[f"{name}_highCI"] = mean, low, high
    return results

def apply_thresholds_and_compute_metrics(subgroups, bundle, group_specific_thresholds):
    logging.info("📊 Computing metrics for each subgroup...")
    global_threshold = bundle.get('youden_j_threshold', 0.5)
    metrics_dict, predicted_subgroups = {}, {}

    for name, df in subgroups.items():
        if df.empty: continue
        cleaned_name = name.strip()
        threshold = group_specific_thresholds.get(cleaned_name, global_threshold)

        df = df.copy()
        df['predicted'] = (df['prob'] >= threshold).astype(int)
        metrics = compute_metrics(df['MDR status'], df['predicted'], df['prob'])
        metrics.update({'Threshold': threshold, 'N_samples': len(df)})
        metrics_dict[name] = metrics
        predicted_subgroups[name] = df

    return metrics_dict, predicted_subgroups

METRIC_COLORS = {"AUROC": "blue", "Accuracy": "green", "Recall": "purple"}

def _get_ci_columns(metric, df):
    low_cols = [f"{metric}_lowCI", f"{metric}_low_ci", f"{metric}_lo"]
    high_cols = [f"{metric}_highCI", f"{metric}_high_ci", f"{metric}_hi"]
    low_col = next((c for c in low_cols if c in df.columns), None)
    high_col = next((c for c in high_cols if c in df.columns), None)
    if not low_col or not high_col: raise KeyError(f"Missing CI columns for {metric}")
    return low_col, high_col

def plot_subgroup_metrics(df, metrics_to_plot, output_dir, single_plots=False):
    if df.empty: return
    if "Subgroup" not in df.columns: df["Subgroup"] = df.index
    df = df.sort_values("Subgroup", ascending=False)

    if single_plots:
        for metric in metrics_to_plot:
            try:
                lo_col, hi_col = _get_ci_columns(metric, df)

                plt.figure(figsize=(14, 12))

                xerr = [df[metric] - df[lo_col], df[hi_col] - df[metric]]
                plt.errorbar(df[metric], df["Subgroup"], xerr=xerr, fmt="o",
                             capsize=8, elinewidth=3, markersize=10,
                             color=METRIC_COLORS.get(metric, 'gray'))

                plt.axvline(0.5, color="red", linestyle="--", alpha=0.6, linewidth=2)
                plt.title(f"Subgroup Analysis: {metric} with 95% CI", pad=20)
                plt.grid(True, linestyle='--', alpha=0.5)
                plt.tight_layout()
                plt.savefig(output_dir / f"subgroup_analysis_{metric}.png")
                plt.close()
            except KeyError: pass
    else:

        plt.figure(figsize=(16, 14))

        offsets = np.linspace(-0.25, 0.25, len(metrics_to_plot))
        for i, metric in enumerate(metrics_to_plot):
            try:
                lo_col, hi_col = _get_ci_columns(metric, df)
                plt.errorbar(df[metric], np.arange(len(df)) + offsets[i],
                             xerr=[df[metric] - df[lo_col], df[hi_col] - df[metric]],
                             fmt="o", capsize=8, elinewidth=3, markersize=10,
                             color=METRIC_COLORS.get(metric, 'gray'), label=metric)
            except KeyError: pass

        plt.yticks(np.arange(len(df)), df["Subgroup"])
        plt.axvline(0.5, color="red", linestyle="--", alpha=0.6, linewidth=2)
        plt.title("Subgroup Analysis: Model Performance with 95% CI", pad=20)
        plt.xlabel("Performance Metric Value")
        plt.grid(True, linestyle='--', alpha=0.5)


        plt.legend(title="Metrics", bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(output_dir / "subgroup_analysis_combined.png")
        plt.close()

def main():
    output_dir = Path("subgroup_analysis_results")
    output_dir.mkdir(parents=True, exist_ok=True)
    plots_dir = output_dir / "plots"
    plots_dir.mkdir(exist_ok=True)

    try:
        threshold_file = "Threshold_results/reports/CatBoost_subgroups_optimal_thresholds_summary.csv"
        group_thresholds = load_group_specific_thresholds(threshold_file)

        test_set, X_test, y_true, bundle = load_and_prepare_data()
        y_proba, X_processed = preprocess_and_predict(X_test, bundle)

        test_set_with_probs = test_set.copy()
        test_set_with_probs['prob'] = y_proba

        subgroups = define_subgroups_from_preprocessed_data(test_set_with_probs, X_processed)
        metrics_dict, predicted_subgroups = apply_thresholds_and_compute_metrics(subgroups, bundle, group_thresholds)

        metrics_df = pd.DataFrame(metrics_dict).T.round(4)
        metrics_df.to_csv(output_dir / "subgroup_metrics_results.csv")

        plot_subgroup_metrics(metrics_df, ["AUROC", "Accuracy", "Recall"], plots_dir, single_plots=True)
        plot_subgroup_metrics(metrics_df, ["AUROC", "Accuracy", "Recall"], plots_dir, single_plots=False)

        print("✅ Analysis Complete. High-resolution plots saved.")

    except Exception as e:
        print(f"❌ Error: {e}")

if __name__ == "__main__":
    main()

✅ Analysis Complete. High-resolution plots saved.
