### 1. Data Loading & Setup
This section loads the dataset and initializes necessary directories.  
The dataset contains information on patients, their treatment status, and survival outcomes.  
To improve clustering performance, we apply **StandardScaler** to standardize the numerical features (zero mean, unit variance) so that no single feature dominates the distance calculations.

In [1]:
# Code Block 1: Load dataset and preprocess
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

# Output folders for the two trial variants (created only if missing)
trial_pp_dir, trial_itt_dir = "trial_pp", "trial_itt"
os.makedirs(trial_pp_dir, exist_ok=True)
os.makedirs(trial_itt_dir, exist_ok=True)

# Read the observations from disk
data_censored = pd.read_csv("data_censored.csv")

# Scale the clustering covariates with StandardScaler
features = data_censored.loc[:, ["age", "x1", "x2", "x3"]]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

### 2. Clustering Implementation
To identify patterns within the dataset, we apply clustering.  
The options include **DBSCAN, Agglomerative Clustering, and Gaussian Mixture Model (GMM)**.  
The chosen method segments patients into groups, allowing for better survival analysis insights.  
Clusters are then added to the dataset for use in later modeling steps.


In [2]:
# Code Block 2: Apply Clustering
def apply_clustering(data, method='DBSCAN', **model_kwargs):
    """Cluster ``data`` with the selected algorithm and return the labels.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to the estimator's ``fit_predict``.
    method : str, default 'DBSCAN'
        One of ``'DBSCAN'``, ``'Agglomerative'`` or ``'GMM'``.
    **model_kwargs
        Optional estimator keyword arguments. They override the defaults
        below, so calling without them behaves exactly as before:
        DBSCAN(eps=0.5, min_samples=5), AgglomerativeClustering(n_clusters=3),
        GaussianMixture(n_components=3, random_state=42).

    Returns
    -------
    The label array produced by the estimator's ``fit_predict``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'DBSCAN':
        model_cls = DBSCAN
        params = {'eps': 0.5, 'min_samples': 5}
    elif method == 'Agglomerative':
        model_cls = AgglomerativeClustering
        params = {'n_clusters': 3}
    elif method == 'GMM':
        model_cls = GaussianMixture
        params = {'n_components': 3, 'random_state': 42}
    else:
        raise ValueError("Invalid clustering method")

    # Caller-supplied hyperparameters take precedence over the defaults
    params.update(model_kwargs)
    cluster_model = model_cls(**params)

    labels = cluster_model.fit_predict(data)
    return labels

# Store the DBSCAN label of every row (computed on the scaled features) as a new column
data_censored['Cluster'] = apply_clustering(data=scaled_features, method='DBSCAN')

### 3. Treatment Switching & Censoring Weight Adjustments per Cluster
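For each identified cluster, separate **logistic regression models** estimate the probability of treatment (switching) and the probability of censoring.  
The two predicted probabilities are multiplied to give each row's final weight; clusters with 10 or fewer rows receive a weight of 1 instead of a fitted probability.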


In [5]:
# Code Block 3: Logistic Regression for Treatment Switching
def logistic_regression_per_cluster(df, features, target):
    """Fit one logistic regression per cluster and return per-row probabilities.

    For every distinct value of ``df['Cluster']`` with more than 10 rows, a
    ``LogisticRegression`` is fitted on ``features`` -> ``target`` and the
    predicted probability of the second class (column 1 of ``predict_proba``)
    is stored for each row of that cluster.

    Rows keep a weight of 1.0 when their cluster has 10 or fewer rows, or when
    the cluster contains fewer than two distinct target values (a classifier
    cannot be fitted on a single class).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Cluster'`` column plus ``features`` and ``target``.
    features : list of str
        Predictor column names.
    target : str
        Name of the target column.

    Returns
    -------
    numpy.ndarray
        One value per row of ``df``, in the same row order as ``df`` so it can
        be assigned straight back as a column.
    """
    # Pre-fill with the fallback weight; fitted clusters overwrite their rows.
    weights = np.ones(len(df), dtype=float)
    cluster_labels = df['Cluster'].to_numpy()

    for cluster in df['Cluster'].unique():
        # Positional mask: results are written back to the rows they came from,
        # instead of being concatenated in cluster order.
        mask = cluster_labels == cluster
        if mask.sum() <= 10:  # Ensure sufficient data points
            continue

        subset = df[mask]
        if subset[target].nunique() < 2:
            # Only one class present: fitting would raise, keep weight 1.0
            continue

        model = LogisticRegression()
        model.fit(subset[features], subset[target])
        weights[mask] = model.predict_proba(subset[features])[:, 1]

    return weights

# Per-cluster predicted probabilities for the treatment and censoring models
data_censored["switch_weight"] = logistic_regression_per_cluster(
    df=data_censored, features=["age", "x1", "x3"], target="treatment"
)
data_censored["censor_weight"] = logistic_regression_per_cluster(
    df=data_censored, features=["x2", "x1"], target="censored"
)
# NOTE(review): this multiplies predicted probabilities directly rather than
# their inverses -- confirm this is the intended weight definition.
data_censored["final_weight"] = data_censored["switch_weight"].mul(data_censored["censor_weight"])

print(data_censored.columns)


Index(['id', 'period', 'treatment', 'x1', 'x2', 'x3', 'x4', 'age', 'age_s',
       'outcome', 'censored', 'eligible', 'Cluster', 'switch_weight',
       'censor_weight', 'final_weight'],
      dtype='object')


### 4. Outcome Model Using Logistic Regression
A logistic regression model estimates the treatment effect on survival probability.  
This helps analyze how different factors contribute to the likelihood of survival.

In [None]:
# Derive the modelling columns from the raw ones when they are not present yet
if "assigned_treatment" not in data_censored:
    data_censored["assigned_treatment"] = data_censored["treatment"]
if "followup_time" not in data_censored:
    data_censored["followup_time"] = data_censored["period"]

# Outcome model: logit of `outcome` on an intercept plus three covariates
y_outcome = data_censored["outcome"]
X_outcome = sm.add_constant(
    data_censored[["assigned_treatment", "x2", "followup_time"]]
)
outcome_model = sm.Logit(endog=y_outcome, exog=X_outcome).fit()
print(outcome_model.summary())


Index(['id', 'period', 'treatment', 'x1', 'x2', 'x3', 'x4', 'age', 'age_s',
       'outcome', 'censored', 'eligible', 'Cluster', 'switch_weight',
       'censor_weight', 'final_weight'],
      dtype='object')


KeyError: "['assigned_treatment', 'followup_time'] not in index"

### 5. Expanding Trial Periods
To simulate real-world patient follow-up, we expand trial periods.  
Each period represents a step in patient monitoring.

In [None]:
# Code Block 5: Expanding Trial Periods
def expand_trials(data):
    """Return a copy of ``data`` with trial bookkeeping columns added.

    ``trial_period`` mirrors ``period`` and ``followup_time`` is
    ``period + 1``. The input frame is left unmodified.
    """
    return data.assign(
        trial_period=data["period"],
        followup_time=data["period"] + 1,
    )

# Both trial variants start from the same weighted dataset
trial_pp_expanded = expand_trials(data=data_censored)
trial_itt_expanded = expand_trials(data=data_censored)

### 6. Creating Sequential Trials
Each patient-period row is expanded into five consecutive trial periods, starting at the row's own period (follow-up times 0 to 4).  
This creates a sequence of follow-ups and provides richer insights into long-term survival trends.

In [None]:
# Code Block 6: Creating Sequential Trials
def create_sequence_of_trials(data, n_periods=5):
    """Expand every row of ``data`` into ``n_periods`` consecutive trial rows.

    For a row whose ``period`` is ``p`` (truncated to an integer), the output
    holds ``n_periods`` copies of that row with ``trial_period`` running over
    ``p, p + 1, ..., p + n_periods - 1`` and ``followup_time`` running over
    ``0, 1, ..., n_periods - 1``.

    The expansion is vectorized rather than built row by row with
    ``iterrows``, so the original column dtypes are preserved and an empty
    input yields an empty frame that still has all columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``period`` column.
    n_periods : int, default 5
        Number of trial periods generated per input row.

    Returns
    -------
    pandas.DataFrame
        ``len(data) * n_periods`` rows; each input row's index label is
        repeated for its generated rows.
    """
    n_rows = len(data)
    # Positional repeat, so duplicate index labels in `data` are handled safely
    row_positions = np.repeat(np.arange(n_rows), n_periods)
    # 0..n_periods-1 for each original row: the follow-up time within a trial
    offsets = np.tile(np.arange(n_periods), n_rows)

    sequenced = data.iloc[row_positions].copy()
    period_start = sequenced["period"].astype(int).to_numpy()
    sequenced["trial_period"] = period_start + offsets
    sequenced["followup_time"] = offsets
    return sequenced

# Build the sequential-trial version of each expanded dataset
trial_pp_seq = create_sequence_of_trials(data=trial_pp_expanded)
trial_itt_seq = create_sequence_of_trials(data=trial_itt_expanded)

### 7. Weight Adjustments
To prevent extreme weighting, we apply **Winsorization** to cap the highest values at the 99th percentile.  
This stabilizes model performance and ensures fair weighting.

In [None]:
# Code Block 7: Weight Adjustments
def winsorize_weights(weights):
    """Cap ``weights`` from above at their own 99th percentile.

    Values at or below the 99th percentile are returned unchanged; larger
    values are replaced by that percentile. Only the upper tail is capped.
    """
    upper_cap = np.quantile(weights, 0.99)
    return np.minimum(weights, upper_cap)

# Draw a reproducible random sample of half the expanded rows (rounded down),
# then cap that sample's weights at its own 99th percentile
sample_size = len(trial_itt_expanded) // 2
trial_itt_sampled = trial_itt_expanded.sample(n=sample_size, random_state=1234)
trial_itt_sampled["adjusted_weight"] = winsorize_weights(weights=trial_itt_sampled["final_weight"])

### 8. MSM Model Fitting
A logistic regression model is trained using **Inverse Probability Weighting (IPW)**  
to estimate treatment effects on survival probability.


In [None]:
# Code Block 8: MSM Model Fitting
# Design matrix (with intercept) and outcome for the marginal structural model
X_msm = sm.add_constant(trial_itt_sampled[["assigned_treatment", "x2", "followup_time"]])
y_msm = trial_itt_sampled["outcome"]

# sm.Logit has no `weights` argument, so passing one does not weight the
# likelihood and the model is fitted unweighted. A binomial GLM (logit link by
# default) accepts per-row weights through `freq_weights`, so the adjusted
# inverse-probability weights are actually applied here.
# NOTE(review): consider robust standard errors (e.g. fit(cov_type="HC0")) for
# weighted estimation -- confirm the desired variance estimator.
msm_model = sm.GLM(
    y_msm,
    X_msm,
    family=sm.families.Binomial(),
    freq_weights=trial_itt_sampled["adjusted_weight"],
).fit()
print(msm_model.summary())

### 9. Kaplan-Meier Survival Analysis by Cluster
The **Kaplan-Meier estimator** is used to plot survival curves for each cluster.  
This allows for a direct comparison of how different clusters behave over time.


In [None]:
# Code Block 9: Kaplan-Meier Survival Analysis by Cluster
def survival_analysis(df):
    """Plot one Kaplan-Meier curve per cluster on a shared figure.

    Reads the ``Cluster``, ``followup_time`` and ``outcome`` columns of
    ``df``. Clusters with 5 or fewer rows are skipped. The legend is drawn
    only when at least one curve was plotted, and the figure is shown at the
    end.
    """
    plt.figure(figsize=(10, 6))
    estimator = KaplanMeierFitter()

    plotted_any = False
    for cluster_id in df['Cluster'].unique():
        members = df.loc[df['Cluster'] == cluster_id]
        if len(members) <= 5:
            # Too few rows in this cluster to plot a curve
            continue
        estimator.fit(members['followup_time'], event_observed=members['outcome'])
        estimator.plot(label=f'Cluster {cluster_id}')
        plotted_any = True

    plt.title("Kaplan-Meier Survival Curves by Cluster")
    plt.xlabel("Follow-up Time")
    plt.ylabel("Survival Probability")
    if plotted_any:
        plt.legend()
    plt.show()

# Draw the per-cluster survival curves for the full dataset
survival_analysis(df=data_censored)

### 10. Cluster Distribution Table
This table displays the number of data points assigned to each cluster.  
It provides insights into how patient groups are distributed.


In [None]:
# Code Block 10: Cluster Distribution Table
# Header line followed by the row count of every cluster label
print("Cluster Distribution Table:", data_censored.groupby("Cluster").size(), sep="\n")