In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    accuracy_score, brier_score_loss
)

# Configuration: one seed reused by every estimator, the CV splitter and the bootstrap.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)  # seeds NumPy's legacy global RNG as well

# Location of the processed table, kept as a named constant so it can be
# changed in one place.
COMBINED_DF_PATH = "/app/data/processed/combined_df.csv"

# Load combined_df from CSV in /app/data/processed/
combined_df = pd.read_csv(COMBINED_DF_PATH)
# Fail fast here rather than deep inside scikit-learn if the file has no rows.
assert not combined_df.empty, f"No rows loaded from {COMBINED_DF_PATH}"

In [None]:
# Predictors are the columns whose name starts with "original_";
# the label is the "target" column cast to int.
feature_cols = [c for c in combined_df.columns if c.startswith("original_")]
# An empty selection would only surface later as an opaque estimator error.
assert feature_cols, "No columns starting with 'original_' found in combined_df"

X = combined_df[feature_cols].copy()
y = combined_df["target"].astype(int).values

print(f"Rows: {len(y)}, Radiomics features: {len(feature_cols)}")
# Print header and table separately: passing both to one print() inserts a
# separator space that shifts the first row of the table out of alignment.
print("Class balance (target):")
print(pd.Series(y).value_counts(normalize=True).rename("pct").round(3))

# 1) Preprocessors
# Built by factories so that every model pipeline owns its own imputer/scaler.
# Pipeline.fit fits its steps in place, so one preprocessor object nested in two
# pipelines would be silently refit whenever either of those pipelines is fitted.
def make_linear_preprocessor():
    """Return a new, unfitted median-impute + standardize pipeline (for linear models)."""
    return Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler())
    ])


def make_tree_preprocessor():
    """Return a new, unfitted median-impute pipeline (no scaling for tree-based models)."""
    return Pipeline([
        ("impute", SimpleImputer(strategy="median")),
    ])


# Standalone instances kept under their original names for any other cell that uses them.
preprocess_linear = make_linear_preprocessor()
preprocess_tree = make_tree_preprocessor()

# 2) Models to compare
# Each entry is preprocessing + classifier, so imputation/scaling statistics are
# learned inside whatever data the pipeline is fitted on.
models = {
    "LogReg_L2": Pipeline([
        ("prep", make_linear_preprocessor()),
        ("clf", LogisticRegression(
            C=1.0, penalty="l2", solver="liblinear",
            class_weight="balanced", random_state=RANDOM_STATE
        ))
    ]),
    "SVM_linear": Pipeline([
        ("prep", make_linear_preprocessor()),
        ("clf", SVC(
            # probability=True is required because evaluation calls predict_proba
            kernel="linear", C=1.0, probability=True,
            class_weight="balanced", random_state=RANDOM_STATE
        ))
    ]),
    "RandomForest": Pipeline([
        ("prep", make_tree_preprocessor()),
        ("clf", RandomForestClassifier(
            n_estimators=300, max_depth=None, min_samples_leaf=2,
            n_jobs=-1, class_weight="balanced", random_state=RANDOM_STATE
        ))
    ]),
    "GradBoost": Pipeline([
        ("prep", make_tree_preprocessor()),
        ("clf", GradientBoostingClassifier(
            learning_rate=0.05, n_estimators=300, max_depth=3,
            random_state=RANDOM_STATE
        ))
    ]),
}

# 3) CV scheme - stratified KFold (shuffled, seeded so folds are reproducible)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def evaluate_oof(model, X, y, cv):
    """
    Score a model on its pooled out-of-fold predictions.

    Column 1 of the cross-validated ``predict_proba`` output is taken as the
    score, and hard labels are obtained by thresholding it at 0.5. Each metric
    is computed once over the whole out-of-fold vector.

    Returns
    -------
    tuple
        ``(oof_proba, oof_pred, metrics)`` where ``metrics`` maps
        ROC_AUC, PR_AUC, F1, ACC and Brier to their values.
    """
    fold_proba = cross_val_predict(
        model, X, y, cv=cv, method="predict_proba", n_jobs=-1
    )
    oof_proba = fold_proba[:, 1]
    oof_pred = (oof_proba >= 0.5).astype(int)

    # (label, scorer, predictions it consumes) - order fixes the dict key order
    scorers = (
        ("ROC_AUC", roc_auc_score, oof_proba),
        ("PR_AUC", average_precision_score, oof_proba),
        ("F1", f1_score, oof_pred),
        ("ACC", accuracy_score, oof_pred),
        ("Brier", brier_score_loss, oof_proba),
    )
    metrics = {label: scorer(y, preds) for label, scorer, preds in scorers}
    return oof_proba, oof_pred, metrics

def bootstrap_ci(y, oof_proba, oof_pred, B=300, seed=RANDOM_STATE):
    """
    Percentile 95% CIs around metrics using bootstrap resamples of OOF predictions.
    Keeps data amount fixed; no extra holdout.

    Parameters
    ----------
    y : array-like
        True labels, one per sample; must contain at least two distinct classes.
    oof_proba : array-like
        Out-of-fold scores, one per sample.
    oof_pred : array-like or None
        Out-of-fold hard labels, one per sample. They are resampled with the
        same indices as ``oof_proba`` so the intervals describe the predictions
        that were actually scored. ``None`` falls back to thresholding
        ``oof_proba`` at 0.5.
    B : int
        Number of valid bootstrap resamples to collect.
    seed : int
        Seed for the NumPy ``Generator`` that draws the resamples.

    Returns
    -------
    dict
        ``{metric: (2.5th percentile, 97.5th percentile)}`` for
        ROC_AUC, PR_AUC, F1, ACC and Brier.

    Raises
    ------
    ValueError
        If the inputs differ in length, ``B < 1``, or ``y`` has a single class.
    RuntimeError
        If too few two-class resamples could be drawn.
    """
    # Plain arrays make y[idx] positional even when a pandas Series is passed.
    y = np.asarray(y)
    oof_proba = np.asarray(oof_proba)
    if oof_pred is None:
        oof_pred = (oof_proba >= 0.5).astype(int)
    else:
        oof_pred = np.asarray(oof_pred)

    n = len(y)
    if not (len(oof_proba) == len(oof_pred) == n):
        raise ValueError("y, oof_proba and oof_pred must have the same length")
    if B < 1:
        raise ValueError("B must be a positive integer")
    if np.unique(y).size < 2:
        raise ValueError("y must contain at least two classes to bootstrap ROC AUC")

    rng = np.random.default_rng(seed)
    stats = defaultdict(list)
    kept = 0
    draws = 0
    max_draws = 100 * B  # safety cap for extremely imbalanced / tiny inputs
    while kept < B:
        if draws >= max_draws:
            raise RuntimeError(
                f"Only {kept} of {B} bootstrap resamples contained both classes"
            )
        draws += 1
        idx = rng.integers(0, n, n)
        yt = y[idx]
        # ROC AUC is undefined on a single-class resample; draw again instead of
        # crashing or letting an undefined value distort the percentiles.
        if np.unique(yt).size < 2:
            continue
        pt = oof_proba[idx]
        ht = oof_pred[idx]
        # collect
        stats["ROC_AUC"].append(roc_auc_score(yt, pt))
        stats["PR_AUC"].append(average_precision_score(yt, pt))
        stats["F1"].append(f1_score(yt, ht))
        stats["ACC"].append(accuracy_score(yt, ht))
        stats["Brier"].append(brier_score_loss(yt, pt))
        kept += 1
    ci = {k: (np.percentile(v, 2.5), np.percentile(v, 97.5)) for k, v in stats.items()}
    return ci

# 4) Run
# Score every candidate on out-of-fold predictions and attach bootstrap CIs.
# NOTE: the "*_mean" columns hold the single value evaluate_oof computes over the
# pooled out-of-fold predictions; they are not averages across folds.
rows = []
oof_store = {}
for name, pipe in models.items():
    proba, pred, m = evaluate_oof(pipe, X, y, cv)
    ci = bootstrap_ci(y, proba, pred, B=300)

    # One summary row per model: point estimates first, then formatted intervals.
    row = {"model": name}
    row.update({f"{k}_mean": v for k, v in m.items()})
    row.update({f"{k}_ci": f"[{ci[k][0]:.3f}, {ci[k][1]:.3f}]" for k in m})
    rows.append(row)

    oof_store[name] = proba  # keep the OOF scores for later use
    print(f"Done: {name}")

# Best ROC AUC first.
summary = pd.DataFrame(rows).sort_values(by="ROC_AUC_mean", ascending=False)
display(summary)

# 5) Feature importance
def top_features_linear(pipe, cols, k=10):
    """
    Refit ``pipe`` on the notebook-level ``X`` and ``y`` and rank features by
    the absolute value of the "clf" step's coefficients.

    ``pipe`` is fitted in place. Returns a Series of the ``k`` largest
    magnitudes indexed by ``cols``, or ``None`` when the classifier exposes
    no ``coef_`` attribute.
    """
    pipe.fit(X, y)
    estimator = pipe.named_steps["clf"]
    if not hasattr(estimator, "coef_"):
        return None
    # Flatten the coefficient array so it lines up with the feature names.
    magnitudes = np.abs(estimator.coef_).ravel()
    ranked = pd.Series(magnitudes, index=cols).sort_values(ascending=False)
    return ranked.head(k)

def top_features_tree(pipe, cols, k=10):
    """
    Refit ``pipe`` on the notebook-level ``X`` and ``y`` and rank features by
    the "clf" step's ``feature_importances_``.

    ``pipe`` is fitted in place. Returns a Series of the ``k`` largest
    importances indexed by ``cols``, or ``None`` when the classifier exposes
    no ``feature_importances_`` attribute.
    """
    pipe.fit(X, y)
    estimator = pipe.named_steps["clf"]
    if not hasattr(estimator, "feature_importances_"):
        return None
    importances = pd.Series(estimator.feature_importances_, index=cols)
    return importances.sort_values(ascending=False).head(k)

# Models whose name contains "LogReg" or "SVM" are ranked by coefficient
# magnitude; the others by the classifier's feature_importances_.
for name in ("LogReg_L2", "SVM_linear", "RandomForest", "GradBoost"):
    if name in models:
        is_linear = "LogReg" in name or "SVM" in name
        ranker = top_features_linear if is_linear else top_features_tree
        tf = ranker(models[name], feature_cols, k=10)
        if tf is not None:
            print(f"\nTop features — {name}")
            display(tf)

Rows: 1905, Radiomics features: 102
Class balance (target):
 0    0.504
1    0.496
Name: pct, dtype: float64
Done: LogReg_L2
Done: LogReg_L2
Done: SVM_linear
Done: SVM_linear
Done: RandomForest
Done: RandomForest
Done: GradBoost
Done: GradBoost


Unnamed: 0,model,ROC_AUC_mean,PR_AUC_mean,F1_mean,ACC_mean,Brier_mean,ROC_AUC_ci,PR_AUC_ci,F1_ci,ACC_ci,Brier_ci
2,RandomForest,0.608025,0.607382,0.555008,0.573228,0.241794,"[0.583, 0.634]","[0.578, 0.646]","[0.530, 0.584]","[0.553, 0.597]","[0.235, 0.248]"
3,GradBoost,0.603439,0.597918,0.552255,0.572703,0.248129,"[0.579, 0.629]","[0.570, 0.635]","[0.527, 0.579]","[0.552, 0.596]","[0.240, 0.255]"
0,LogReg_L2,0.592959,0.588741,0.55236,0.566929,0.245502,"[0.569, 0.617]","[0.561, 0.625]","[0.526, 0.578]","[0.546, 0.590]","[0.240, 0.251]"
1,SVM_linear,0.592362,0.58109,0.56622,0.575328,0.245216,"[0.567, 0.616]","[0.552, 0.615]","[0.540, 0.594]","[0.555, 0.596]","[0.243, 0.247]"



Top features — LogReg_L2


original_glcm_SumEntropy                             1.192543
original_gldm_DependenceEntropy                      0.936738
original_gldm_DependenceNonUniformityNormalized      0.854285
original_gldm_DependenceNonUniformity                0.839994
original_ngtdm_Complexity                            0.682100
original_firstorder_10Percentile                     0.559423
original_gldm_SmallDependenceLowGrayLevelEmphasis    0.549907
original_gldm_SmallDependenceEmphasis                0.547473
original_glcm_Idm                                    0.517412
original_glrlm_RunLengthNonUniformityNormalized      0.501681
dtype: float64


Top features — SVM_linear


original_glcm_SumEntropy                             1.576651
original_gldm_DependenceEntropy                      1.179991
original_ngtdm_Complexity                            1.080502
original_gldm_DependenceNonUniformityNormalized      1.077276
original_glcm_Idm                                    1.007948
original_glcm_InverseVariance                        0.960473
original_firstorder_10Percentile                     0.870640
original_glcm_Id                                     0.797360
original_gldm_SmallDependenceLowGrayLevelEmphasis    0.781361
original_glrlm_RunPercentage                         0.776784
dtype: float64


Top features — RandomForest


original_shape2D_PerimeterSurfaceRatio    0.020754
original_shape2D_Sphericity               0.017280
original_shape2D_Elongation               0.016873
original_firstorder_Mean                  0.013996
original_shape2D_Perimeter                0.012589
original_shape2D_MinorAxisLength          0.012481
original_ngtdm_Coarseness                 0.012436
original_firstorder_10Percentile          0.012415
original_shape2D_MaximumDiameter          0.012274
original_glcm_SumSquares                  0.012238
dtype: float64


Top features — GradBoost


original_shape2D_PerimeterSurfaceRatio    0.096191
original_shape2D_Elongation               0.049375
original_shape2D_Sphericity               0.035246
original_firstorder_Median                0.029244
original_firstorder_Mean                  0.027488
original_firstorder_10Percentile          0.024287
original_firstorder_Kurtosis              0.023467
original_firstorder_Range                 0.021780
original_gldm_GrayLevelVariance           0.021445
original_firstorder_Entropy               0.020359
dtype: float64