In [1]:
# Install scikit-survival package
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-survival"])

0

In [2]:
import pandas as pd
import numpy as np

%pip install scikit-learn==1.3.2 scikit-survival==0.22.2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from lifelines import CoxPHFitter, WeibullAFTFitter
from lifelines.utils import concordance_index
from sksurv.util import Surv
from sksurv.metrics import (
    concordance_index_ipcw,
    integrated_brier_score,
    brier_score,
    cumulative_dynamic_auc
)

# Load datasets
df_zero    = pd.read_csv("C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\ipcw_and_other_censoring\\data\\censoring_methods\\data_zero.csv")
df_discard = pd.read_csv("C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\ipcw_and_other_censoring\\data\\censoring_methods\\data_discard.csv")
df_ipcw    = pd.read_csv("C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\ipcw_and_other_censoring\\data\\censoring_methods\\data_ipcw.csv")

datasets = {"zero": df_zero, "discard": df_discard, "ipcw": df_ipcw}

X_COLS = ["age_at_entry", "income_level", "health_score", "pension_contrib_rate"]
DUR = "time_to_event"
EVT = "event_observed"
T_STAR = 15.0
# Adjust TIMES to be within the valid follow-up range [0.04; 25.0[
TIMES = np.linspace(1, 24, 6)  # Changed from (5, 30, 6) to (1, 24, 6)


Note: you may need to restart the kernel to use updated packages.


In [3]:
from lifelines import KaplanMeierFitter
from sksurv.metrics import concordance_index_ipcw, integrated_brier_score, cumulative_dynamic_auc

def make_surv(df):
    return Surv.from_arrays(event=df[EVT].astype(bool), time=df[DUR])

def evaluate_survival_model(model_name, model, train_df, test_df, weights=None):
    y_tr = make_surv(train_df)
    y_te = make_surv(test_df)
    S_pred = model.predict_survival_function(test_df[X_COLS], times=TIMES).T.values
    risk_scores = 1 - S_pred[:, -1]  # event risk at last time
    c_uno = concordance_index_ipcw(y_tr, y_te, -risk_scores, tau=TIMES[-1])[0]
    ibs = integrated_brier_score(y_tr, y_te, S_pred, TIMES)
    auc_times, aucs = cumulative_dynamic_auc(y_tr, y_te, risk_scores, TIMES)
    
    # Handle case where aucs might be scalar or array
    if np.isscalar(aucs):
        auc_15 = float(aucs)
    else:
        # Find closest time to T_STAR
        closest_idx = np.argmin(np.abs(auc_times - T_STAR))
        auc_15 = float(aucs[closest_idx])
    
    return {"Model": model_name, "C_index": c_uno, "IBS": ibs, "AUC@15": auc_15}

def evaluate_classifier(model_name, model, X_train, y_train, X_test, y_test, sample_weight=None):
    """Evaluate binary classifier"""
    # Fit model
    if sample_weight is not None and hasattr(model, 'fit') and 'sample_weight' in model.fit.__code__.co_varnames:
        model.fit(X_train, y_train, sample_weight=sample_weight)
    else:
        model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_pred_proba)
    elif hasattr(model, 'decision_function'):
        y_pred_scores = model.decision_function(X_test)
        auc = roc_auc_score(y_test, y_pred_scores)
    else:
        auc = np.nan
    
    f1 = f1_score(y_test, y_pred)
    
    return {"Model": model_name, "Accuracy": accuracy, "AUC": auc, "F1": f1}


In [None]:
all_results = []



for method, df in datasets.items():
    print(f"\n=== METHOD: {method.upper()} ===")
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df[EVT] if method != "discard" else None)

    df_fit = train_df[[DUR, EVT] + X_COLS].copy()
    # ------------- SURVIVAL MODELS -------------
    if method == "ipcw" and "ipcw" in train_df.columns:
        df_fit["ipcw"] = train_df["ipcw"]
        w_col = "ipcw"
    else:
        w_col = None

    # Cox PH
    cph = CoxPHFitter()
    cph.fit(df_fit, duration_col=DUR, event_col=EVT,
            weights_col=w_col if w_col else None)
    res_cph = evaluate_survival_model("CoxPH", cph, train_df, test_df)
    res_cph["Method"] = method
    all_results.append(res_cph)

    # Weibull AFT (doesn't support weights well, so fit without weights)
    aft = WeibullAFTFitter()
    aft.fit(df_fit[[DUR, EVT] + X_COLS], duration_col=DUR, event_col=EVT)
    res_aft = evaluate_survival_model("WeibullAFT", aft, train_df, test_df)
    res_aft["Method"] = method
    all_results.append(res_aft)

    # ------------- CLASSIFIERS -------------
    # Convert to binary outcome: event occurred by t*?
    y_train = ((train_df[DUR] <= T_STAR) & (train_df[EVT] == 1)).astype(int)
    y_test  = ((test_df[DUR] <= T_STAR) & (test_df[EVT] == 1)).astype(int)
    X_train, X_test = train_df[X_COLS], test_df[X_COLS]

    sw_train = None
    if method == "ipcw" and "ipcw" in train_df.columns:
        sw_train = train_df["ipcw"].copy()
        sw_train[y_train == 0] = 0  # only count events

    classifiers = {
        "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
        "Random Forest": RandomForestClassifier(n_estimators=400, random_state=42),
        "SVM (RBF)": make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)),
        "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=25))
    }

    for name, clf in classifiers.items():
        res_clf = evaluate_classifier(name, clf, X_train, y_train, X_test, y_test, sample_weight=sw_train)
        res_clf["Method"] = method
        all_results.append(res_clf)

res = pd.DataFrame(all_results)
res.to_csv("results.csv", index=False)
print("\n✅ Results saved to results.csv")
display(res)



=== METHOD: ZERO ===

=== METHOD: DISCARD ===

=== METHOD: DISCARD ===

=== METHOD: IPCW ===

=== METHOD: IPCW ===


It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"



OSError: Cannot save file into a non-existent directory: 'data\results'