In [1]:
import pandas as pd
from lifelines import KaplanMeierFitter
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split




In [2]:
# Make sure the folder that receives the per-method datasets exists.
os.makedirs("data/censoring_methods", exist_ok=True)

# Load the synthetic survival data.
# The default below is the author's local path. Set the SYNTHETIC_SURVIVAL_CSV
# environment variable to point at the file on any other machine; when it is
# unset the original path is used unchanged.
DEFAULT_DATA_PATH = "C:\\Users\\04ama\\OneDrive\\pension survival analysis\\notebooks\\Data generation\\synthetic_survival_data.csv"
data_path = os.environ.get("SYNTHETIC_SURVIVAL_CSV", DEFAULT_DATA_PATH)
df = pd.read_csv(data_path)

# Fail early, with a readable message, if the columns this notebook relies on
# are absent (they are read by the censoring-rate line below and by the
# IPCW / discard cells).
_missing_columns = {"time_to_event", "event_observed"} - set(df.columns)
assert not _missing_columns, f"Input data is missing required columns: {sorted(_missing_columns)}"

# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the CSV was written with its index. It is kept as-is here so the
# saved per-method files do not change shape; confirm whether it should be dropped.

# event_observed is 1 for an observed event, so 1 - mean is the censored share.
censoring_rate = 1 - df['event_observed'].mean()
print(f"Censoring Rate: {censoring_rate:.2%}")
df.head()

Censoring Rate: 54.70%


Unnamed: 0,id,age_at_entry,income_level,health_score,pension_contrib_rate,true_event_time,censor_time,time_to_event,event_observed
0,1,53,36794.48,78.4,0.052,4.992,8.615,4.992,1
1,2,39,45557.62,62.0,0.072,18.644,3.319,3.319,0
2,3,32,35686.13,52.1,0.048,11.186,6.974,6.974,0
3,4,45,70045.55,75.5,0.08,19.707,2.38,2.38,0
4,5,43,49361.51,86.3,0.054,8.868,3.315,3.315,0


In [3]:

# "zero" method: the data is passed through untouched and only tagged with
# the method name, then written next to the other per-method datasets.
zero_output_path = "data/censoring_methods/data_zero.csv"
df_zero = df.assign(method="zero")   # assign() returns a new frame; df is not modified
df_zero.to_csv(zero_output_path, index=False)
print("Saved:", zero_output_path)


Saved: data/censoring_methods/data_zero.csv


In [5]:

# ----------------- 0. IPCW SETTINGS -----------------
G_FLOOR = 0.01          # lower bound applied to G(t) before it is inverted
MAX_WEIGHT = 15         # upper cap on the weight of an observed event
CENSORED_WEIGHT = 0.1   # fixed weight given to every censored row


def _stabilised_ipcw(event_observed, g_hat):
    """Return one stabilised IPCW weight per row.

    Parameters
    ----------
    event_observed : array-like
        1 where the event was observed, anything else is treated as censored.
    g_hat : array-like of float
        Censoring-survival estimate G(t) at each row's time, already floored
        at ``G_FLOOR`` so the inversion cannot blow up.

    Returns
    -------
    numpy.ndarray
        ``clip(1 / g_hat, 1, MAX_WEIGHT)`` for event rows and
        ``CENSORED_WEIGHT`` for censored rows.
    """
    # G(t) <= 1, so 1/G(t) >= 1 and the lower clip is only a safeguard.
    # With G_FLOOR = 0.01 the raw weight can reach 100, so MAX_WEIGHT is the
    # bound that actually limits large weights.
    event_weights = np.clip(1 / np.asarray(g_hat, dtype=float), 1, MAX_WEIGHT)
    return np.where(np.asarray(event_observed) == 1, event_weights, CENSORED_WEIGHT)


# ----------------- 1. SPLIT DATA -----------------
# Stratify on the event indicator so both splits keep the same censoring share.
df_train, df_test = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["event_observed"]
)
print(f"Training set: {len(df_train)}, Test set: {len(df_test)}")

# ----------------- 2. KM CENSORING MODEL ON TRAIN SET ONLY -----------------
# Kaplan-Meier fit of the censoring distribution: the indicator is flipped so
# that "being censored" is the event. Fitted on the training rows only so the
# test rows do not influence the weights.
km_c_train = KaplanMeierFitter()
km_c_train.fit(
    durations=df_train["time_to_event"],
    event_observed=1 - df_train["event_observed"]   # 1 = censored
)

# ----------------- 3. G(t) DIAGNOSTICS -----------------
# Raw (unclipped) G(t) at every training time; computed once and reused below.
G_values = km_c_train.survival_function_at_times(df_train["time_to_event"]).values
print("\n=== CENSORING DISTRIBUTION DIAGNOSTICS ===")
print(f"G(t) range: [{G_values.min():.6f}, {G_values.max():.6f}]")
print(f"G(t) < 0.01: {(G_values < 0.01).sum()} entries")
print(f"G(t) < 0.001: {(G_values < 0.001).sum()} entries")

# ----------------- 4. STABILISED IPCW: TRAIN SET -----------------
G_hat_train = np.clip(G_values, G_FLOOR, 1.0)   # np.clip returns a new array; G_values is unchanged
ipcw_train = _stabilised_ipcw(df_train["event_observed"], G_hat_train)
df_ipcw_train = df_train.assign(ipcw=ipcw_train)

# ----------------- 5. STABILISED IPCW: TEST SET -----------------
# Same train-fitted censoring model and the same clipping rules as above.
G_hat_test = np.clip(
    km_c_train.survival_function_at_times(df_test["time_to_event"]).values,
    G_FLOOR,
    1.0,
)
ipcw_test = _stabilised_ipcw(df_test["event_observed"], G_hat_test)
df_ipcw_test = df_test.assign(ipcw=ipcw_test)

# ----------------- 6. SAVE DATA -----------------
# Train rows first, then test rows; the original index is discarded.
df_ipcw = pd.concat([df_ipcw_train, df_ipcw_test], ignore_index=True)
df_ipcw["method"] = "ipcw"

# Sanity checks: no rows lost, and every weight is finite and within bounds.
assert len(df_ipcw) == len(df_train) + len(df_test)
assert np.isfinite(df_ipcw["ipcw"]).all()
assert df_ipcw["ipcw"].between(min(CENSORED_WEIGHT, 1), MAX_WEIGHT).all()

df_ipcw.to_csv("data/censoring_methods/data_ipcw.csv", index=False)

print("\n✅ IPCW (Events Only) saved to data/censoring_methods/data_ipcw.csv")
print(f"Weight stats → Min={df_ipcw['ipcw'].min():.4f} | Max={df_ipcw['ipcw'].max():.4f} | Mean={df_ipcw['ipcw'].mean():.4f}")



Training set: 3500, Test set: 1500

=== CENSORING DISTRIBUTION DIAGNOSTICS ===
G(t) range: [0.005608, 1.000000]
G(t) < 0.01: 2 entries
G(t) < 0.001: 0 entries

✅ IPCW (Events Only) saved to data/censoring_methods/data_ipcw.csv
Weight stats → Min=0.1000 | Max=15.0000 | Mean=0.9597


In [6]:
# "discard" method: every row is kept, but censored rows carry weight 0 so a
# weighted fit ignores them. Events get weight 1.
discard_output_path = "data/censoring_methods/data_discard.csv"

df_discard = df.assign(
    method="discard",
    # event_observed cast to float gives the weight directly: 1.0 = event, 0.0 = censored
    discard_weight=lambda frame: frame["event_observed"].astype(float),
)

df_discard.to_csv(discard_output_path, index=False)
print("Saved:", discard_output_path)

# Report how many rows fall in each weight group.
n_event_rows = (df_discard["discard_weight"] == 1).sum()
n_censored_rows = (df_discard["discard_weight"] == 0).sum()
print(f"Discard weights - Events (1): {n_event_rows}")
print(f"Discard weights - Censored (0): {n_censored_rows}")

Saved: data/censoring_methods/data_discard.csv
Discard weights - Events (1): 2265
Discard weights - Censored (0): 2735
