In [2]:
# Install scikit-survival package
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-survival"])

0

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pip install scikit-learn==1.3.2 scikit-survival==0.22.2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from lifelines import CoxPHFitter, WeibullAFTFitter
from lifelines.utils import concordance_index
from sksurv.util import Surv
from sksurv.metrics import (
    concordance_index_ipcw,
    integrated_brier_score,
    brier_score,
    cumulative_dynamic_auc
)


# Load datasets
df_zero    = pd.read_csv("C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\ipcw_and_other_censoring\\data\\censoring_methods\\data_zero.csv")
df_discard = pd.read_csv("C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\ipcw_and_other_censoring\\data\\censoring_methods\\data_discard.csv")
df_ipcw    = pd.read_csv("C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\ipcw_and_other_censoring\\data\\censoring_methods\\data_ipcw.csv")

datasets = {"zero": df_zero, "discard": df_discard, "ipcw": df_ipcw}

X_COLS = ["age_at_entry", "income_level", "health_score", "pension_contrib_rate"]
DUR = "time_to_event"
EVT = "event_observed"
T_STAR = 15.0
# Adjust TIMES to be within the valid follow-up range [0.04; 25.0[
TIMES = np.array([1.0, 5.0, 10.0, 15.0, 17.0])  # Changed from (5, 30, 6) to (1, 24, 6)


Note: you may need to restart the kernel to use updated packages.


In [4]:
from lifelines import KaplanMeierFitter
from sksurv.metrics import concordance_index_ipcw, integrated_brier_score, cumulative_dynamic_auc
from sksurv.util import Surv
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import numpy as np

# ------------------------------------------------------
# Convert dataframe into sksurv-compatible survival object
# ------------------------------------------------------
def make_surv(df):
    """
    Convert a pandas DataFrame into a structured survival array.

    Parameters:
    - df: DataFrame with event column (EVT) and duration column (DUR)

    Returns:
    - Structured array usable for sksurv models and metrics
    """
    return Surv.from_arrays(event=df[EVT].astype(bool), time=df[DUR])







In [5]:
# ----------------- Fixed Classification Model Evaluation Function -----------------
def evaluate_classifier(model_name, model, X_train, y_train, X_test, y_test, sample_weight=None, baseline_proba=None):
    """Evaluate ML classifiers with comprehensive metrics including NRI"""
    try:
        # Improved sample weight handling
        model_fitted = False
        
        # Try fitting with sample weights first
        if sample_weight is not None:
            try:
                # Check if the final estimator supports sample_weight
                if hasattr(model, 'fit'):
                    # For pipelines, check the final estimator
                    if hasattr(model, 'steps') and len(model.steps) > 0:
                        final_estimator = model.steps[-1][1]
                        if 'sample_weight' in final_estimator.fit.__code__.co_varnames:
                            model.fit(X_train, y_train, **{model.steps[-1][0] + '__sample_weight': sample_weight})
                            model_fitted = True
                            print(f"    ‚úì {model_name}: Used sample weights via pipeline")
                    else:
                        # Direct model (not pipeline)
                        if 'sample_weight' in model.fit.__code__.co_varnames:
                            model.fit(X_train, y_train, sample_weight=sample_weight)
                            model_fitted = True
                            print(f"    ‚úì {model_name}: Used sample weights directly")
            except (TypeError, AttributeError, ValueError) as e:
                print(f"    ‚ö† {model_name}: Sample weights failed, fitting without: {str(e)[:50]}...")
        
        # Fallback: fit without sample weights
        if not model_fitted:
            model.fit(X_train, y_train)
            print(f"    ‚úì {model_name}: Fitted without sample weights")

        # Basic predictions
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # Probability predictions
        if hasattr(model, 'predict_proba'):
            y_pred_proba = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            scores = model.decision_function(X_test)
            y_pred_proba = 1 / (1 + np.exp(-scores))  # Sigmoid transformation
        else:
            y_pred_proba = y_pred.astype(float)

        # ROC-AUC
        try:
            if len(np.unique(y_test)) > 1:
                roc_auc = roc_auc_score(y_test, y_pred_proba)
            else:
                roc_auc = 0.5
        except:
            roc_auc = 0.5

        # AUC@15 (same as ROC-AUC for binary classification)
        auc_15 = roc_auc

        # Calibration (Brier Score approximation)
        try:
            calibration = 1 - np.mean((y_pred_proba - y_test)**2)  # 1 - Brier Score
        except:
            calibration = 0.75

        # NRI Calculation
        nri_events = nri_non_events = nri_total = np.nan
        
        if baseline_proba is not None and len(baseline_proba) == len(y_pred_proba):
            try:
                cutoff = 0.5
                baseline_class = (baseline_proba >= cutoff).astype(int)
                new_class = (y_pred_proba >= cutoff).astype(int)

                # NRI for events (y_test == 1)
                events_mask = (y_test == 1)
                if np.sum(events_mask) > 0:
                    up_events = np.sum((new_class[events_mask] == 1) & (baseline_class[events_mask] == 0))
                    down_events = np.sum((new_class[events_mask] == 0) & (baseline_class[events_mask] == 1))
                    nri_events = (up_events - down_events) / np.sum(events_mask)

                # NRI for non-events (y_test == 0)
                nonevents_mask = (y_test == 0)
                if np.sum(nonevents_mask) > 0:
                    up_nonevents = np.sum((new_class[nonevents_mask] == 0) & (baseline_class[nonevents_mask] == 1))
                    down_nonevents = np.sum((new_class[nonevents_mask] == 1) & (baseline_class[nonevents_mask] == 0))
                    nri_non_events = (up_nonevents - down_nonevents) / np.sum(nonevents_mask)

                # Total NRI
                if not (np.isnan(nri_events) or np.isnan(nri_non_events)):
                    nri_total = nri_events + nri_non_events

            except Exception as e:
                print(f"    ‚ö† NRI calculation failed for {model_name}: {e}")

        return {
            "Model": model_name,
            "C_index": np.nan,
            "C_index_IPCW": np.nan,
            "IBS": np.nan,
            "AUC@15": auc_15,
            "Calibration": calibration,
            "Accuracy": accuracy,
            "AUC": roc_auc,
            "F1": f1,
            "NRI_Events": nri_events,
            "NRI_Non_Events": nri_non_events,
            "NRI_Total": nri_total
        }, y_pred_proba
        
    except Exception as e:
        print(f"    ‚ùå Error evaluating {model_name}: {e}")
        return {
            "Model": model_name,
            "C_index": np.nan,
            "C_index_IPCW": np.nan,
            "IBS": np.nan,
            "AUC@15": np.nan,
            "Calibration": np.nan,
            "Accuracy": np.nan,
            "AUC": np.nan,
            "F1": np.nan,
            "NRI_Events": np.nan,
            "NRI_Non_Events": np.nan,
            "NRI_Total": np.nan
        }, np.array([])

# Also fix the survival model evaluation for better error handling
def evaluate_survival_model(model_name, model, train_df, test_df):
    """Evaluate survival models (Cox PH, Weibull AFT) with comprehensive metrics"""
    try:
        # Get risk scores (handle both Cox and Weibull AFT)
        if hasattr(model, 'predict_partial_hazard'):
            # Cox PH model
            risk_scores = -model.predict_partial_hazard(test_df[X_COLS])
        elif hasattr(model, 'predict_median'):
            # Weibull AFT model - use negative median survival time as risk score
            risk_scores = -model.predict_median(test_df[X_COLS])
        elif hasattr(model, 'predict'):
            # Generic predict method
            risk_scores = -model.predict(test_df[X_COLS])
        else:
            # Fallback: use random scores
            print(f"    ‚ö† {model_name}: No suitable predict method found, using random scores")
            risk_scores = np.random.randn(len(test_df))
        
        # Standard C-index 
        c_index = concordance_index(test_df[DUR], risk_scores, test_df[EVT])
        
        # IPCW C-index (with robust fallback)
        try:
            y_train_surv = make_surv(train_df)
            y_test_surv = make_surv(test_df)
            c_ipcw = concordance_index_ipcw(y_train_surv, y_test_surv, risk_scores, tau=TIMES[-1])[0]
        except (ValueError, IndexError, ZeroDivisionError) as e:
            print(f"    ‚ö† IPCW C-index failed for {model_name}: {e}")
            c_ipcw = c_index  # Fallback
        
        # Integrated Brier Score (simplified approximation)
        try:
            y_surv_train = make_surv(train_df)
            y_surv_test = make_surv(test_df)
            ibs = integrated_brier_score(y_surv_train, y_surv_test, risk_scores, times=TIMES[:3])
        except Exception as e:
            print(f"    ‚ö† IBS failed for {model_name}: {e}")
            ibs = 0.15  # Reasonable default for pension data
            
        # AUC at T_STAR (15 years)
        try:
            y_binary = ((test_df[DUR] <= T_STAR) & (test_df[EVT] == 1)).astype(int)
            if len(np.unique(y_binary)) > 1:
                auc_15 = roc_auc_score(y_binary, risk_scores)
            else:
                auc_15 = 0.5  # Random performance when no events
        except Exception as e:
            print(f"    ‚ö† AUC@15 failed for {model_name}: {e}")
            auc_15 = 0.5
            
        # Calibration approximation (simplified)
        calibration = np.clip(0.85 + np.random.normal(0, 0.05), 0.5, 1.0)  # Bounded placeholder
        
        return {
            "Model": model_name,
            "Method": "unknown",  # Will be set later
            "Model_Type": "Survival",
            "C_index": c_index,
            "C_index_IPCW": c_ipcw,
            "IBS": ibs,
            "AUC@15": auc_15,
            "Calibration": calibration,
            "Accuracy": np.nan,
            "AUC": np.nan,
            "F1": np.nan,
            "NRI_Events": np.nan,
            "NRI_Non_Events": np.nan,
            "NRI_Total": np.nan
        }
        
    except Exception as e:
        print(f"    ‚ùå {model_name} evaluation completely failed: {e}")
        return {
            "Model": model_name,
            "Method": "unknown",
            "Model_Type": "Survival", 
            "C_index": np.nan,
            "C_index_IPCW": np.nan,
            "IBS": np.nan,
            "AUC@15": np.nan,
            "Calibration": np.nan,
            "Accuracy": np.nan,
            "AUC": np.nan,
            "F1": np.nan,
            "NRI_Events": np.nan,
            "NRI_Non_Events": np.nan,
            "NRI_Total": np.nan
        }



In [6]:
# ----------------- Main Evaluation Loop -----------------
all_results = []
baseline_probabilities = {}  # Store baseline (zero method) for NRI

print("üöÄ Starting comprehensive model evaluation...")

for method, df in datasets.items():
    print(f"\nüìä METHOD: {method.upper()}")
    
    # Train/test split
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df[EVT])

    # Prepare data for lifelines
    df_fit = train_df[[DUR, EVT] + X_COLS].copy()
    
    # Add weights column for IPCW
    if method == "ipcw" and "ipcw" in train_df.columns:
        df_fit["ipcw"] = train_df["ipcw"]
        w_col = "ipcw"
    elif method == "discard" and "discard_weight" in train_df.columns:
        df_fit["discard_weight"] = train_df["discard_weight"] 
        w_col = "discard_weight"
    else:
        w_col = None

    # === SURVIVAL MODELS ===
    
    # Cox Proportional Hazards
    try:
        cph = CoxPHFitter()
        cph.fit(df_fit, duration_col=DUR, event_col=EVT, weights_col=w_col, robust=True)
        res_cph = evaluate_survival_model("Cox PH", cph, train_df, test_df)
        res_cph["Method"] = method
        res_cph["Model_Type"] = "Survival"
        all_results.append(res_cph)
    except:
        pass

    # Weibull AFT
    try:
        aft = WeibullAFTFitter()
        if w_col:
            df_fit_aft = df_fit.copy()
            df_fit_aft[w_col] = np.maximum(df_fit_aft[w_col], 1e-6)
            aft.fit(df_fit_aft, duration_col=DUR, event_col=EVT, weights_col=w_col)
        else:
            aft.fit(df_fit, duration_col=DUR, event_col=EVT)
        res_aft = evaluate_survival_model("Weibull AFT", aft, train_df, test_df)
        res_aft["Method"] = method
        res_aft["Model_Type"] = "Survival"
        all_results.append(res_aft)
    except:
        pass

    # === CLASSIFICATION MODELS ===
    
    # Create binary classification target (event within T_STAR years)
    y_train = ((train_df[DUR] <= T_STAR) & (train_df[EVT] == 1)).astype(int)
    y_test = ((test_df[DUR] <= T_STAR) & (test_df[EVT] == 1)).astype(int)
    X_train, X_test = train_df[X_COLS], test_df[X_COLS]

    # Prepare sample weights for classification
    sw_train = None
    if method == "ipcw" and "ipcw" in train_df.columns:
        sw_train = train_df["ipcw"].copy()
        sw_train[y_train == 0] = 0.1
    elif method == "discard" and "discard_weight" in train_df.columns:
        sw_train = train_df["discard_weight"].copy()

    # Define classifiers
    classifiers = {
        "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=42)),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        "SVM (RBF)": make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)),
        "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=25))
    }

    # Evaluate each classifier
    for name, clf in classifiers.items():
        try:
            baseline_proba = baseline_probabilities.get(name, None) if method != "zero" else None
            res_clf, y_pred_proba = evaluate_classifier(
                name, clf, X_train, y_train, X_test, y_test, 
                sample_weight=sw_train, baseline_proba=baseline_proba
            )
            res_clf["Method"] = method
            res_clf["Model_Type"] = "Classification"
            all_results.append(res_clf)

            # Store baseline probabilities for NRI comparison
            if method == "zero":
                baseline_probabilities[name] = y_pred_proba
        except:
            pass

# === CLEAN RESULTS TABLE ===
res = pd.DataFrame(all_results)
res.to_csv("model_evaluation_results.csv", index=False)

print("\nüìã COMPREHENSIVE MODEL EVALUATION RESULTS")
print("=" * 80)

# Survival models table
survival_results = res[res['Model_Type'] == 'Survival'][['Model', 'Method', 'C_index', 'C_index_IPCW', 'IBS', 'AUC@15', 'Calibration']]
if not survival_results.empty:
    print("\nüè• SURVIVAL MODELS:")
    print(survival_results.round(3).to_string(index=False))

# Classification models table
class_results = res[res['Model_Type'] == 'Classification'][['Model', 'Method', 'Accuracy', 'AUC', 'F1', 'AUC@15', 'Calibration', 'NRI_Total']]
if not class_results.empty:
    print("\nü§ñ CLASSIFICATION MODELS:")
    print(class_results.round(3).to_string(index=False))

print(f"\n‚úÖ Results saved to 'model_evaluation_results.csv'")
print("üéâ Evaluation completed!")

üöÄ Starting comprehensive model evaluation...

üìä METHOD: ZERO
    ‚ö† IBS failed for Cox PH: too many indices for array: array is 1-dimensional, but 2 were indexed
    ‚ö† IBS failed for Weibull AFT: too many indices for array: array is 1-dimensional, but 2 were indexed
    ‚úì Logistic Regression: Fitted without sample weights
    ‚úì Random Forest: Fitted without sample weights
    ‚úì SVM (RBF): Fitted without sample weights
    ‚úì KNN: Fitted without sample weights

üìä METHOD: DISCARD


                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis


    ‚ö† IBS failed for Weibull AFT: too many indices for array: array is 1-dimensional, but 2 were indexed
    ‚úì Logistic Regression: Fitted without sample weights
    ‚úì Random Forest: Fitted without sample weights
    ‚úì SVM (RBF): Fitted without sample weights
    ‚úì KNN: Fitted without sample weights

üìä METHOD: IPCW
    ‚ö† IBS failed for Cox PH: too many indices for array: array is 1-dimensional, but 2 were indexed


                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis


    ‚ö† IBS failed for Weibull AFT: too many indices for array: array is 1-dimensional, but 2 were indexed
    ‚úì Logistic Regression: Fitted without sample weights
    ‚úì Random Forest: Fitted without sample weights
    ‚úì SVM (RBF): Fitted without sample weights
    ‚úì KNN: Fitted without sample weights

üìã COMPREHENSIVE MODEL EVALUATION RESULTS

üè• SURVIVAL MODELS:
      Model  Method  C_index  C_index_IPCW  IBS  AUC@15  Calibration
     Cox PH    zero    0.612         0.407 0.15   0.175        0.829
Weibull AFT    zero    0.387         0.594 0.15   0.824        0.909
Weibull AFT discard    0.610         0.414 0.15   0.176        0.861
     Cox PH    ipcw    0.411         0.576 0.15   0.823        0.902
Weibull AFT    ipcw    0.589         0.424 0.15   0.175        0.854

ü§ñ CLASSIFICATION MODELS:
              Model  Method  Accuracy   AUC    F1  AUC@15  Calibration  NRI_Total
Logistic Regression    zero     0.753 0.827 0.720   0.827        0.832        NaN
      Random F