In [None]:
# Install the survival-analysis dependencies into the environment of the running
# kernel (sys.executable makes pip target this exact interpreter).
# The versions are pinned to the same ones the `%pip install` line in the import
# cell requests, so a fresh environment does not first pull the latest
# scikit-survival release and then have it replaced a cell later.
import subprocess
import sys

subprocess.check_call(
    [
        sys.executable, "-m", "pip", "install",
        "scikit-learn==1.3.2",
        "scikit-survival==0.22.2",
    ]
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pip install scikit-learn==1.3.2 scikit-survival==0.22.2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from lifelines import CoxPHFitter, WeibullAFTFitter
from lifelines.utils import concordance_index
from sksurv.util import Surv
from sksurv.metrics import (
    concordance_index_ipcw,
    integrated_brier_score,
    brier_score,
    cumulative_dynamic_auc
)


# Load datasets
# The three censoring-method datasets live in a single directory. Its location
# can be overridden with the CENSORING_DATA_DIR environment variable; when the
# variable is unset the original absolute path is used, so existing setups keep
# working unchanged.
import os

DATA_DIR = os.environ.get(
    "CENSORING_DATA_DIR",
    "C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\ipcw_and_other_censoring\\data\\censoring_methods",
)

df_zero    = pd.read_csv(os.path.join(DATA_DIR, "data_zero.csv"))
df_discard = pd.read_csv(os.path.join(DATA_DIR, "data_discard.csv"))
df_ipcw    = pd.read_csv(os.path.join(DATA_DIR, "data_ipcw.csv"))

# Keyed by censoring-handling method; the evaluation loop iterates in this order.
datasets = {"zero": df_zero, "discard": df_discard, "ipcw": df_ipcw}

X_COLS = ["age_at_entry", "income_level", "health_score", "pension_contrib_rate"]  # feature columns
DUR = "time_to_event"    # duration column
EVT = "event_observed"   # event-indicator column
T_STAR = 15.0            # horizon used to build the binary classification target

# Evaluation time points, given as an explicit grid. Per the original note they
# must stay within the valid follow-up range [0.04; 25.0[.
# NOTE(review): the previous comment referred to "(5, 30, 6)" / "(1, 24, 6)"
# settings, which no longer match the array below - confirm this grid is intended.
TIMES = np.array([1.0, 5.0, 10.0, 15.0, 17.0])


In [None]:
from lifelines import KaplanMeierFitter
from sksurv.metrics import concordance_index_ipcw, integrated_brier_score, cumulative_dynamic_auc
from sksurv.util import Surv
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import numpy as np

# ------------------------------------------------------
# Convert dataframe into sksurv-compatible survival object
# ------------------------------------------------------
def make_surv(df):
    """
    Build a structured survival array from a pandas DataFrame.

    Parameters:
    - df: DataFrame containing the event-indicator column (EVT) and the
      duration column (DUR)

    Returns:
    - The structured array produced by Surv.from_arrays, with the event
      indicator cast to bool and the durations taken as-is
    """
    event_flags = df[EVT].astype(bool)
    durations = df[DUR]
    return Surv.from_arrays(event=event_flags, time=durations)







In [None]:
# ----------------- Classification Model Evaluation Function -----------------
def _fit_accepts_sample_weight(estimator):
    """Tell whether ``estimator.fit`` declares a ``sample_weight`` parameter."""
    import inspect

    fit_method = getattr(estimator, "fit", None)
    if fit_method is None:
        return False
    try:
        # inspect.signature follows functools.wraps (__wrapped__), so a decorated
        # fit method is described by its real parameters. Looking at
        # fit.__code__.co_varnames instead describes the wrapper function (and
        # also lists local variables), which makes weight support undetectable
        # for wrapped fit methods.
        return "sample_weight" in inspect.signature(fit_method).parameters
    except (TypeError, ValueError):
        # Signature cannot be introspected: treat the estimator as unweighted.
        return False


def _nan_classifier_metrics(model_name):
    """Build a result row for ``model_name`` in which every metric is NaN."""
    return {
        "Model": model_name,
        "C_index": np.nan,
        "C_index_IPCW": np.nan,
        "IBS": np.nan,
        "AUC@15": np.nan,
        "Calibration": np.nan,
        "Accuracy": np.nan,
        "AUC": np.nan,
        "F1": np.nan,
        "NRI_Events": np.nan,
        "NRI_Non_Events": np.nan,
        "NRI_Total": np.nan
    }


def evaluate_classifier(model_name, model, X_train, y_train, X_test, y_test, sample_weight=None, baseline_proba=None):
    """
    Fit a classifier and evaluate it on a test set.

    Reported metrics:
    - Accuracy, F1 score
    - AUC (ROC-AUC); AUC@15 is set to the same value
    - Calibration, computed as 1 - Brier score
    - Net Reclassification Improvement (NRI) at a 0.5 cutoff, if baseline
      probabilities of matching length are provided

    Parameters
    ----------
    model_name : str
        Name of the classification model (for reporting purposes).
    model : estimator or pipeline
        Object exposing ``fit`` and ``predict`` and, optionally,
        ``predict_proba`` or ``decision_function``. An object with a non-empty
        ``steps`` attribute is treated as a pipeline.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Test features and binary (0/1) labels.
    sample_weight : array-like, optional
        Case weights (e.g. IPCW). They are used only when the estimator's
        ``fit`` (the final step's ``fit`` for a pipeline) declares a
        ``sample_weight`` parameter; otherwise, or when the weighted fit raises
        TypeError/AttributeError/ValueError, the model is fitted without
        weights and a RuntimeWarning is emitted.
    baseline_proba : array-like, optional
        Positive-class probabilities from a baseline model for computing NRI.
        Ignored when its length differs from the number of test predictions.

    Returns
    -------
    results : dict
        Dictionary containing evaluation metrics. The survival-only entries
        (C_index, C_index_IPCW, IBS) are always NaN here.
    y_pred_proba : np.ndarray
        Predicted probabilities for the positive class. If the evaluation
        fails, every metric is NaN, this array is empty, and a RuntimeWarning
        describing the failure is emitted.
    """
    import warnings

    try:
        model_fitted = False

        # Attempt to fit the model using sample weights (if provided and supported)
        if sample_weight is not None:
            steps = getattr(model, "steps", None)
            try:
                if steps is not None and len(steps) > 0:
                    # Pipeline: weights are routed to the final step using the
                    # <stepname>__sample_weight fit-parameter syntax.
                    final_name, final_estimator = steps[-1][0], steps[-1][1]
                    if _fit_accepts_sample_weight(final_estimator):
                        model.fit(X_train, y_train, **{final_name + "__sample_weight": sample_weight})
                        model_fitted = True
                elif _fit_accepts_sample_weight(model):
                    model.fit(X_train, y_train, sample_weight=sample_weight)
                    model_fitted = True

                if not model_fitted:
                    warnings.warn(
                        f"{model_name}: estimator does not accept sample_weight; fitting without weights.",
                        RuntimeWarning,
                        stacklevel=2,
                    )
            except (TypeError, AttributeError, ValueError) as fit_error:
                # Best effort: fall back to an unweighted fit, but say so.
                warnings.warn(
                    f"{model_name}: weighted fit failed ({fit_error!r}); fitting without weights.",
                    RuntimeWarning,
                    stacklevel=2,
                )

        # If model was not fitted using weights, fit normally
        if not model_fitted:
            model.fit(X_train, y_train)

        # Hard predictions and threshold-based metrics
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # Predicted probabilities for positive class
        if hasattr(model, 'predict_proba'):
            y_pred_proba = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            # Convert decision_function to probabilities via logistic transform
            scores = model.decision_function(X_test)
            y_pred_proba = 1 / (1 + np.exp(-scores))
        else:
            y_pred_proba = y_pred.astype(float)

        # Plain array view of the labels so the arithmetic and boolean masks
        # below are positional regardless of the input container or its index.
        y_true = np.asarray(y_test)

        # ROC-AUC computation (undefined with a single class -> report 0.5)
        try:
            if len(np.unique(y_true)) > 1:
                roc_auc = roc_auc_score(y_test, y_pred_proba)
            else:
                roc_auc = 0.5
        except Exception:
            roc_auc = 0.5

        auc_15 = roc_auc  # Reported under both names

        # Calibration approximation = 1 - Brier score
        try:
            calibration = 1 - np.mean((y_pred_proba - y_true) ** 2)
        except Exception:
            calibration = np.nan

        # -------------- NRI Computation --------------
        nri_events = nri_non_events = nri_total = np.nan

        if baseline_proba is not None and len(baseline_proba) == len(y_pred_proba):
            try:
                cutoff = 0.5
                baseline_class = (np.asarray(baseline_proba) >= cutoff).astype(int)
                new_class = (np.asarray(y_pred_proba) >= cutoff).astype(int)

                # Event cases (y=1): moving up to the positive class is an improvement
                events_mask = (y_true == 1)
                n_events = np.sum(events_mask)
                if n_events > 0:
                    up_events = np.sum((new_class[events_mask] == 1) & (baseline_class[events_mask] == 0))
                    down_events = np.sum((new_class[events_mask] == 0) & (baseline_class[events_mask] == 1))
                    nri_events = (up_events - down_events) / n_events

                # Non-event cases (y=0): moving down to the negative class is an improvement
                nonevents_mask = (y_true == 0)
                n_nonevents = np.sum(nonevents_mask)
                if n_nonevents > 0:
                    up_nonevents = np.sum((new_class[nonevents_mask] == 0) & (baseline_class[nonevents_mask] == 1))
                    down_nonevents = np.sum((new_class[nonevents_mask] == 1) & (baseline_class[nonevents_mask] == 0))
                    nri_non_events = (up_nonevents - down_nonevents) / n_nonevents

                # Total NRI = sum of event and non-event improvement
                if not (np.isnan(nri_events) or np.isnan(nri_non_events)):
                    nri_total = nri_events + nri_non_events

            except Exception as nri_error:
                warnings.warn(
                    f"{model_name}: NRI computation failed ({nri_error!r}).",
                    RuntimeWarning,
                    stacklevel=2,
                )

        # -------------- Final Output Dictionary --------------
        results = _nan_classifier_metrics(model_name)
        results.update({
            "AUC@15": auc_15,
            "Calibration": calibration,
            "Accuracy": accuracy,
            "AUC": roc_auc,
            "F1": f1,
            "NRI_Events": nri_events,
            "NRI_Non_Events": nri_non_events,
            "NRI_Total": nri_total
        })
        return results, y_pred_proba

    except Exception as e:
        # If something goes wrong, report it and return all NaN metrics
        warnings.warn(
            f"{model_name}: evaluation failed ({e!r}); returning NaN metrics.",
            RuntimeWarning,
            stacklevel=2,
        )
        return _nan_classifier_metrics(model_name), np.array([])


In [None]:
# ----------------- Main Evaluation Loop -----------------


all_results = []
baseline_probabilities = {}  # classifier name -> test-set probabilities from the "zero" method (NRI baseline)

# Lower bound applied to case weights before fitting the lifelines models, so
# neither of them is handed a zero weight.
MIN_WEIGHT = 1e-6
# Weight given to non-event rows in the IPCW classification fits.
IPCW_NON_EVENT_WEIGHT = 0.1

print("Starting comprehensive model evaluation...")

for method, df in datasets.items():
    print(f"\nMETHOD: {method.upper()}")

    # ----------------- Train / Test Splitting -----------------
    train_df, test_df = train_test_split(
        df,
        test_size=0.3,
        random_state=42,
        stratify=df[EVT]   # Keeps the event/non-event ratio equal in both splits
    )

    # Dataframe for fitting survival models (lifelines format)
    df_fit = train_df[[DUR, EVT] + X_COLS].copy()

    # Pick the weight column that belongs to this method, if the data has it
    if method == "ipcw" and "ipcw" in train_df.columns:
        w_col = "ipcw"
    elif method == "discard" and "discard_weight" in train_df.columns:
        w_col = "discard_weight"
    else:
        w_col = None

    if w_col is not None:
        # Same floor for both survival models (previously only the AFT fit was
        # protected against zero weights).
        df_fit[w_col] = np.maximum(train_df[w_col], MIN_WEIGHT)

    # ===================== SURVIVAL MODELS =====================

    # ------ Cox Proportional Hazards ------
    try:
        cph = CoxPHFitter()
        cph.fit(
            df_fit,
            duration_col=DUR,
            event_col=EVT,
            weights_col=w_col,
            robust=True
        )
        res_cph = evaluate_survival_model("Cox PH", cph, train_df, test_df)
        res_cph["Method"] = method
        res_cph["Model_Type"] = "Survival"
        all_results.append(res_cph)
    except Exception as exc:
        # Keep going, but make the failure visible instead of dropping it silently
        print(f"  [skipped] Cox PH ({method}): {type(exc).__name__}: {exc}")

    # ------ Weibull AFT ------
    try:
        aft = WeibullAFTFitter()
        aft.fit(df_fit, duration_col=DUR, event_col=EVT, weights_col=w_col)

        res_aft = evaluate_survival_model("Weibull AFT", aft, train_df, test_df)
        res_aft["Method"] = method
        res_aft["Model_Type"] = "Survival"
        all_results.append(res_aft)
    except Exception as exc:
        print(f"  [skipped] Weibull AFT ({method}): {type(exc).__name__}: {exc}")

    # ===================== CLASSIFICATION MODELS =====================

    # Binary target: event occurs within T_STAR years
    y_train = ((train_df[DUR] <= T_STAR) & (train_df[EVT] == 1)).astype(int)
    y_test = ((test_df[DUR] <= T_STAR) & (test_df[EVT] == 1)).astype(int)
    X_train, X_test = train_df[X_COLS], test_df[X_COLS]

    # Sample weights for classification (IPCW or DISCARD); unclipped originals
    sw_train = None
    if w_col is not None:
        sw_train = train_df[w_col].copy()
        if method == "ipcw":
            # Non-events get a fixed small weight instead of their IPCW weight
            sw_train[y_train == 0] = IPCW_NON_EVENT_WEIGHT

    # Define standard classifiers (fresh instances for every method)
    classifiers = {
        "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=42)),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        "SVM (RBF)": make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)),
        "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=25))
    }

    # Evaluate each classifier
    for name, clf in classifiers.items():
        try:
            # Use baseline probabilities only for NRI comparison in non-zero methods.
            # NOTE(review): evaluate_classifier matches the baseline to the current
            # test set by length only - confirm the "zero" split and this method's
            # split contain the same rows in the same order.
            baseline_proba = baseline_probabilities.get(name, None) if method != "zero" else None

            res_clf, y_pred_proba = evaluate_classifier(
                name, clf, X_train, y_train, X_test, y_test,
                sample_weight=sw_train,
                baseline_proba=baseline_proba
            )
            res_clf["Method"] = method
            res_clf["Model_Type"] = "Classification"
            all_results.append(res_clf)

            # Save baseline probabilities from ZERO method for NRI comparison
            if method == "zero":
                baseline_probabilities[name] = y_pred_proba

        except Exception as exc:
            # Continue evaluating other models even if one fails
            print(f"  [skipped] {name} ({method}): {type(exc).__name__}: {exc}")

# ----------------- Save and Print Results -----------------
res = pd.DataFrame(all_results)
res.to_csv("model_evaluation_results.csv", index=False)

print("\nCOMPREHENSIVE MODEL EVALUATION RESULTS")
print("=" * 80)

# When nothing at all was collected the frame has no columns; use an empty
# selector so the tables below come out empty instead of raising KeyError.
if "Model_Type" in res.columns:
    model_type = res["Model_Type"]
else:
    model_type = pd.Series(index=res.index, dtype=object)

# Survival Results Table (reindex tolerates columns that no row provided)
survival_results = res.loc[model_type == 'Survival'].reindex(
    columns=['Model', 'Method', 'C_index', 'C_index_IPCW', 'IBS', 'AUC@15', 'Calibration']
)
if not survival_results.empty:
    print("\nSURVIVAL MODELS:")
    print(survival_results.round(3).to_string(index=False))

# Classification Results Table
class_results = res.loc[model_type == 'Classification'].reindex(
    columns=['Model', 'Method', 'Accuracy', 'AUC', 'F1', 'AUC@15', 'Calibration', 'NRI_Total']
)
if not class_results.empty:
    print("\nCLASSIFICATION MODELS:")
    print(class_results.round(3).to_string(index=False))

print("\nResults saved to 'model_evaluation_results.csv'")
print("Evaluation completed.")
