In [None]:
import pandas as pd
from lifelines import KaplanMeierFitter
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split




In [2]:
# Output folder for the per-method datasets written by the cells below.
os.makedirs("data/censoring_methods", exist_ok=True)

# Load the synthetic survival data.
# The location defaults to the project's data/ folder (relative to the working
# directory, like the output folder above) and can be overridden through the
# SURVIVAL_DATA_PATH environment variable, so the notebook is not tied to an
# absolute path on one user's machine.
DATA_PATH = os.environ.get(
    "SURVIVAL_DATA_PATH",
    os.path.join("data", "synthetic_survival_data.csv"),
)
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Input data not found at {DATA_PATH!r}. Run the notebook from the "
        "project root or set SURVIVAL_DATA_PATH to the CSV location."
    )
df = pd.read_csv(DATA_PATH)

# event_observed is 1 for an observed event and 0 for a censored row, so the
# censoring rate is the complement of its mean.
# NOTE(review): the file carries a leftover "Unnamed: 0" index column (visible
# in df.head()); consider index_col=0 if downstream code does not rely on it.
print("Censoring rate:", 1 - df["event_observed"].mean())
df.head()

Censoring rate: 0.4292


Unnamed: 0,id,age_at_entry,income_level,health_score,pension_contrib_rate,true_event_time,censor_time,time_to_event,event_observed
0,1,53,36794.48,78.4,0.052,8.653,7.657,7.657,0
1,2,39,45557.62,62.0,0.072,32.314,11.618,11.618,0
2,3,32,35686.13,52.1,0.048,19.388,25.0,19.388,1
3,4,45,70045.55,75.5,0.08,34.158,25.0,25.0,0
4,5,43,49361.51,86.3,0.054,15.37,25.0,15.37,1


In [3]:
# "zero" variant: a copy of the loaded data with only a method tag added.
zero_path = "data/censoring_methods/data_zero.csv"
df_zero = df.assign(method="zero")
df_zero.to_csv(zero_path, index=False)
print("Saved:", zero_path)


Saved: data/censoring_methods/data_zero.csv


In [4]:
# "discard" variant: every row is retained (censored and uncensored alike);
# a 0/1 weight column marks which rows count -- 1.0 for observed events,
# 0.0 for censored rows.
discard_path = "data/censoring_methods/data_discard.csv"
df_discard = df.assign(
    method="discard",
    discard_weight=df["event_observed"].astype(float),
)
df_discard.to_csv(discard_path, index=False)
print("Saved:", discard_path)

# Tally how many rows carry each weight value.
weight_col = df_discard["discard_weight"]
print(f"Discard weights - Events (1): {weight_col.eq(1).sum()}")
print(f"Discard weights - Censored (0): {weight_col.eq(0).sum()}")

Saved: data/censoring_methods/data_discard.csv
Discard weights - Events (1): 2854
Discard weights - Censored (0): 2146


In [None]:
# ============ PROPER IPCW: FIT ON TRAINING, APPLY TO TEST ============
# Inverse-probability-of-censoring weights (IPCW): each observed event is
# weighted by 1 / G(t), where G is the Kaplan-Meier estimate of the censoring
# survival function. G is fitted on the training split only and then applied
# to both splits, so no test-set information reaches the weights.

TEST_SIZE = 0.3
SPLIT_SEED = 42
MIN_CENSOR_SURVIVAL = 1e-4  # floor on G(t); caps any single weight at 1e4
IPCW_PATH = "data/censoring_methods/data_ipcw.csv"


def compute_ipcw(km_censoring, frame, min_survival=MIN_CENSOR_SURVIVAL):
    """Return one IPCW weight per row of ``frame``.

    Parameters
    ----------
    km_censoring : KaplanMeierFitter
        Already fitted with censoring as the event (flag = 1 - event_observed).
    frame : pandas.DataFrame
        Must contain the columns ``time_to_event`` and ``event_observed``.
    min_survival : float, default MIN_CENSOR_SURVIVAL
        Lower bound applied to G(t) before inversion.

    Returns
    -------
    numpy.ndarray
        ``1 / max(G(t), min_survival)`` where ``event_observed == 1`` and
        ``1.0`` for every other (censored) row.
    """
    g_hat = km_censoring.survival_function_at_times(frame["time_to_event"]).values
    # G(t) <= 1 and is floored at min_survival, so the inverse is already
    # positive and >= 1; no extra abs()/clip() clean-up is required.
    inverse_prob = 1 / np.clip(g_hat, min_survival, None)
    return np.where(frame["event_observed"] == 1, inverse_prob, 1.0)


# Split the data first (same parameters as the evaluation is meant to use),
# stratified on the event flag so both parts keep the same censoring rate.
df_train, df_test = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
    stratify=df["event_observed"],
)
assert len(df_train) + len(df_test) == len(df)

print(f"Training set: {len(df_train)} observations")
print(f"Test set: {len(df_test)} observations")

# FIT CENSORING MODEL ON TRAINING DATA ONLY
km_c_train = KaplanMeierFitter()
km_c_train.fit(
    durations=df_train["time_to_event"],
    event_observed=1 - df_train["event_observed"],  # flip: 1=censored
)

# Weights for both splits come from the training-fitted censoring model.
df_ipcw_train = df_train.assign(ipcw=compute_ipcw(km_c_train, df_train))
df_ipcw_test = df_test.assign(ipcw=compute_ipcw(km_c_train, df_test))

# COMBINE AND SAVE
# Rows are written training-first, then test. Because that ordering differs
# from the source file, re-running train_test_split on the saved CSV would NOT
# reproduce this partition; the explicit "split" column records membership so
# downstream code can recover exactly the rows the censoring model was fit on.
df_ipcw = pd.concat([df_ipcw_train, df_ipcw_test], ignore_index=True)
df_ipcw["method"] = "ipcw"
df_ipcw["split"] = ["train"] * len(df_ipcw_train) + ["test"] * len(df_ipcw_test)

df_ipcw.to_csv(IPCW_PATH, index=False)
print("Saved:", IPCW_PATH)

# Check the weights
all_weights = df_ipcw["ipcw"]
train_event_weights = df_ipcw_train.loc[df_ipcw_train["event_observed"] == 1, "ipcw"]
test_event_weights = df_ipcw_test.loc[df_ipcw_test["event_observed"] == 1, "ipcw"]

print("✅ Proper IPCW weights (fitted on training only):")
print(f"Weight stats: Min={all_weights.min():.4f}, Max={all_weights.max():.4f}, Mean={all_weights.mean():.4f}")
print(f"Training weights range: {train_event_weights.min():.4f} to {train_event_weights.max():.4f}")
print(f"Test weights range: {test_event_weights.min():.4f} to {test_event_weights.max():.4f}")
print(f"Negative weights: {(all_weights < 0).sum()}")
print(f"Zero weights: {(all_weights == 0).sum()}")

Training set: 3500 observations
Test set: 1500 observations
Saved: data/censoring_methods/data_ipcw.csv
✅ Proper IPCW weights (fitted on training only):
Weight stats: Min=1.0000, Max=2.2453, Mean=1.2335
Training weights range: 1.0092 to 2.2453
Test weights range: 1.0020 to 2.2453
Negative weights: 0
Zero weights: 0


In [6]:
# One overview row per censoring-handling method: row count and the share of
# censored observations (1 - mean of the event flag).
method_frames = {"zero": df_zero, "discard": df_discard, "ipcw": df_ipcw}

summary = pd.DataFrame({
    "Method": list(method_frames),
    "Rows": [len(frame) for frame in method_frames.values()],
    "Censoring rate": [
        1 - frame["event_observed"].mean() for frame in method_frames.values()
    ],
})

summary.to_csv("data/censoring_methods/summary_overview.csv", index=False)
print("Saved:", "data/censoring_methods/summary_overview.csv")

# Must be the final expression of the cell: a bare name in the middle of a
# cell is evaluated but never rendered by Jupyter.
summary


Saved: data/censoring_methods/summary_overview.csv
