# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sksurv.preprocessing import OneHotEncoder
from sksurv.util import Surv
from sklearn.inspection import permutation_importance

# Load the clinical table from the Excel workbook.
data = pd.read_excel("RADCURE_Clinical_v04_20241219.xlsx")

# Event indicator for the survival models: 1 where Status is exactly "Dead",
# 0 for every other value (missing values included).
data["event"] = data["Status"].eq("Dead").astype("int64")

# Kaplan-Meier Analysis
# One survival curve is fitted per level of `group_col`; every curve is drawn
# on the figure created here so the groups can be compared visually.
kmf = KaplanMeierFitter()
plt.figure(figsize=(10, 6))

# Define the groups for Kaplan-Meier (e.g., Chemotherapy)
group_col = "Chemo"
# Drop missing labels before enumerating groups: NaN never compares equal to
# itself, so a NaN entry would select zero rows below and the fitter would be
# handed an empty sample.
unique_groups = data[group_col].dropna().unique()

for group in unique_groups:
    group_data = data[data[group_col] == group]
    # The same fitter is re-fitted for each group; plotting right after the
    # fit captures that group's curve before the next fit overwrites it.
    kmf.fit(group_data["Length FU"], event_observed=group_data["event"], label=f"{group_col} = {group}")
    kmf.plot_survival_function()

plt.title(f'Kaplan-Meier Curve for {group_col}')
# NOTE(review): the axis label says months — confirm the unit of "Length FU"
# against the dataset's data dictionary.
plt.xlabel('Follow-up Time (months)')
plt.ylabel('Survival Probability')
plt.legend()
plt.grid()
plt.show()

# Log-rank test (e.g., Chemo Yes vs No)
# Only the first two entries of `unique_groups` are compared; any further
# levels of `group_col` are not part of this test.
first_label, second_label = unique_groups[0], unique_groups[1]
group1 = data.loc[data[group_col] == first_label]
group2 = data.loc[data[group_col] == second_label]

logrank_result = logrank_test(
    durations_A=group1["Length FU"],
    durations_B=group2["Length FU"],
    event_observed_A=group1["event"],
    event_observed_B=group2["event"],
)

print(f"Log-rank test p-value: {logrank_result.p_value:.4f}")

# Cox Proportional Hazards Regression
covariates = ["Age", "Stage", "Tx Modality"]

# Convert categorical variables to numeric (in place on `data`).
# `cat.codes` numbers the categories in their sorted order and encodes missing
# values as -1.
# NOTE(review): each coded column enters the model as a single numeric term,
# which imposes an ordering on the categories — confirm this is intended for
# nominal variables such as "Tx Modality".
for col in covariates:
    if data[col].dtype == 'object' or data[col].dtype.name == 'category':
        data[col] = data[col].astype("category").cat.codes

# Build the modelling frame once so that fitting and the diagnostics below
# operate on exactly the same columns.
cox_df = data[["Length FU", "event"] + covariates]

cph = CoxPHFitter()
cph.fit(cox_df, duration_col="Length FU", event_col="event")

# Print the summary of the model
cph.print_summary()
cph.plot()
plt.title('Cox Regression Coefficients')
plt.show()

# Validate proportional hazards assumption.
# check_assumptions expects the DataFrame used in fit(); passing the full
# `data` frame would also hand it every unrelated column of the table.
cph.check_assumptions(cox_df, p_value_threshold=0.05)

# Random Survival Forests (RSF)
# Predictors are restricted to the model covariates. Concatenating the rest of
# `data` would hand the forest the outcome columns ("Length FU", "event",
# "Status") as features — target leakage — together with every other column
# of the table.
rsf_features = data[covariates]

# sksurv's OneHotEncoder accepts no `drop`/`sparse` arguments (those belong to
# scikit-learn's encoder). It takes a DataFrame, expands each categorical
# column into indicator columns (dropping one level per column by default)
# and passes the remaining columns through unchanged.
# Text columns are cast to `category` so the encoder can read their levels.
# Covariates already replaced by integer codes earlier in this script are
# numeric at this point and are therefore passed through as-is.
text_cols = rsf_features.select_dtypes(include=["object", "category"]).columns
rsf_features = rsf_features.astype({col: "category" for col in text_cols})

encoder = OneHotEncoder()
encoded_df = encoder.fit_transform(rsf_features)
encoded_feature_names = encoded_df.columns
encoded_features = encoded_df.to_numpy()

# Prepare survival data for sksurv: feature matrix and structured outcome.
data_x = encoded_df
data_y = Surv.from_dataframe(event="event", time="Length FU", data=data)

# Train Random Survival Forest model
rsf = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=5, random_state=42)
rsf.fit(data_x, data_y)

# Compute Concordance Index for RSF.
# NOTE: this is scored on the same rows the forest was trained on, so it is an
# in-sample (optimistic) estimate.
rsf_cindex = rsf.score(data_x, data_y)
print(f"Random Survival Forest C-index: {rsf_cindex:.4f}")

# Compute Concordance Index for Cox Regression (also on its training data).
cph_cindex = cph.concordance_index_
print(f"Cox Regression C-index: {cph_cindex:.4f}")

# Permutation feature importance for the fitted forest: each feature is
# shuffled 15 times and the resulting change in the model score is recorded.
result = permutation_importance(rsf, data_x, data_y, n_repeats=15, random_state=42)

# Collect mean and standard deviation per feature, most important first.
importance_columns = ("importances_mean", "importances_std")
feature_importance = pd.DataFrame(
    {name: result[name] for name in importance_columns},
    index=data_x.columns,
)
feature_importance = feature_importance.sort_values(by="importances_mean", ascending=False)

# Horizontal bar chart with the standard deviation as error bars.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importances in Random Survival Forest")
ax.barh(
    feature_importance.index,
    feature_importance["importances_mean"],
    xerr=feature_importance["importances_std"],
    align="center",
    color="teal",
)
ax.set_xlabel("Mean Importance Score")
ax.set_ylabel("Features")
# Bars are drawn bottom-up; flip the axis so the top-ranked feature is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()
