In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from lifelines import KaplanMeierFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the survival table and rename its columns to the names used in this
# notebook. "Unnamed: 0" is the name pandas assigns to a header-less first CSV
# column; here it holds the sample identifiers.
df = pd.read_csv("./Data/Kaplan-Meier_TCGA_clusters/survdat0.csv").rename(
    columns={"Unnamed: 0": "Sample", "time": "OS.time", "status": "OS"}
)

# Index the frame by sample identifier with dots turned into dashes
# (e.g. TCGA.IB.7887.01 -> TCGA-IB-7887-01). A literal, non-regex replacement
# is used so no "\." escape is needed (that escape is invalid in a plain
# string literal and triggers a warning on recent Python versions).
df.index = df["Sample"].str.replace(".", "-", regex=False)
df

Unnamed: 0_level_0,Sample,OS.time,OS
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-IB-7887-01,TCGA.IB.7887.01,110,1
TCGA-3A-A9IH-01,TCGA.3A.A9IH.01,1021,0
TCGA-IB-A5SP-01,TCGA.IB.A5SP.01,482,0
TCGA-HZ-8317-01,TCGA.HZ.8317.01,378,1
TCGA-XD-AAUL-01,TCGA.XD.AAUL.01,498,0
...,...,...,...
TCGA-HZ-7925-01,TCGA.HZ.7925.01,614,1
TCGA-3A-A9J0-01,TCGA.3A.A9J0.01,743,0
TCGA-2L-AAQE-01,TCGA.2L.AAQE.01,684,1
TCGA-HZ-8002-01,TCGA.HZ.8002.01,366,1


In [3]:
import pickle as pkl

# Load the dictionary that maps sample identifiers (the index of `df`) to a
# group label.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted step of this project.
with open("./Results/Clustering/Smple_cluster_dict.pkl", "rb") as f:
    samplesdict = pkl.load(f)

df["Risk_group"] = df.index.map(samplesdict)

# Samples absent from the mapping receive NaN and would match neither
# `Risk_group == 0` nor `Risk_group == 1`, so report them instead of letting
# them disappear silently.
n_unmapped = int(df["Risk_group"].isna().sum())
if n_unmapped:
    print(f"Warning: {n_unmapped} sample(s) have no cluster assignment")

# Drop the now-redundant identifier column without mutating in place;
# errors="ignore" keeps the cell re-runnable (an in-place drop raises KeyError
# the second time the cell is executed because the column is already gone).
df = df.drop(columns="Sample", errors="ignore")
df

Unnamed: 0_level_0,OS.time,OS,Risk_group
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-IB-7887-01,110,1,0
TCGA-3A-A9IH-01,1021,0,0
TCGA-IB-A5SP-01,482,0,0
TCGA-HZ-8317-01,378,1,1
TCGA-XD-AAUL-01,498,0,0
...,...,...,...
TCGA-HZ-7925-01,614,1,1
TCGA-3A-A9J0-01,743,0,0
TCGA-2L-AAQE-01,684,1,0
TCGA-HZ-8002-01,366,1,1


In [4]:
# Kaplan-Meier curves for the two risk groups, with a log-rank p-value and an
# at-risk table, saved to disk as a PNG.
fig, ax = plt.subplots(1, 1, figsize=(6, 3))

# Group with Risk_group == 1: durations (T) and event indicators (E).
in_cluster_1 = df['Risk_group'] == 1
T = df.loc[in_cluster_1, 'OS.time']
E = df.loc[in_cluster_1, 'OS']
kmf = KaplanMeierFitter()
kmf.fit(durations=T, event_observed=E, label='Cluster 1')
# Draw on `ax` explicitly rather than on whatever pyplot considers current.
kmf.plot_survival_function(ax=ax)

# Group with Risk_group == 0.
in_cluster_0 = df['Risk_group'] == 0
T1 = df.loc[in_cluster_0, 'OS.time']
E1 = df.loc[in_cluster_0, 'OS']
kmf1 = KaplanMeierFitter()
kmf1.fit(durations=T1, event_observed=E1, label='Cluster 0')
kmf1.plot_survival_function(ax=ax)

# Log-rank test comparing the two groups.
results = logrank_test(T1, T, event_observed_A=E1, event_observed_B=E)

add_at_risk_counts(kmf, kmf1, ax=ax, fig=fig, labels=['Cluster 1', 'Cluster 0'], rows_to_show=['At risk'])

# Tick-label size for both axes of the main plot.
fontsize = 13
ax.tick_params(axis='both', which='major', labelsize=fontsize)

# Place the p-value at a fixed fraction of the axes (20% from the left, 10%
# from the bottom) so its position does not depend on the data range.
ax.text(0.2, 0.1, f"p.value= {results.p_value:.6f}", fontsize=10, weight='bold',
        transform=ax.transAxes)
fig.patch.set_facecolor('w')
ax.spines[['right', 'top']].set_visible(False)

# Label the main axes explicitly: add_at_risk_counts creates an additional
# twin axes, so pyplot-level plt.xlabel/plt.title may target that helper axes
# instead of `ax`.
ax.set_xlabel('Days passed')
ax.set_title('Survival of different RS group')

fig.savefig("./Results/Kaplan-Meier_TCGA_clusters/KM_plot_tcga_clusters.png", bbox_inches="tight")
plt.close(fig)