In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Root directory of the dataset; the topic-model result CSVs are read from beneath it.
data_dir = "brca"

## Survival

In [None]:
# Load the per-file metadata table; the literal string "unknown" is treated as missing.
df_files = (
    pd.read_csv("files.dat", sep=",", index_col=0)
    .replace("unknown", np.nan)
)
df_files.info()

In [None]:
# Bin age at diagnosis into 19 equal-width bins over [0, 100], labelled by bin midpoint.
bins = np.linspace(0, 100, 20)
age_raw = df_files["cases.0.diagnoses.0.age_at_diagnosis"].astype(float)
avg = age_raw.mean(skipna=True)
# Missing ages are imputed with the mean, then divided by 365 (values presumably in days).
age_scaled = age_raw.fillna(avg) / 365.
bin_midpoints = (bins[1:] + bins[:-1]) / 2
df_files["age_at_diagnosis"] = np.asarray(pd.cut(age_scaled, bins=bins, labels=bin_midpoints))

In [None]:
# Binary indicator: 1 where the recorded gender is "male", 0 otherwise (including missing).
is_male = df_files["cases.0.demographic.gender"].eq("male")
df_files["gender"] = is_male.astype(int)

In [None]:
# Collapse sub-stages ("stage iia", "stage iib", "stage iic", ...) onto their parent stage
# everywhere in the frame, using a single value -> value mapping.
substage_to_stage = {
    "stage %s%s" % (numeral, suffix): "stage %s" % numeral
    for suffix in ["a", "b", "c"]
    for numeral in ["i", "ii", "iii", "iv"]
}
df_files.replace(substage_to_stage, inplace=True)
df_files["cases.0.diagnoses.0.tumor_stage"].unique()

In [None]:
# Encode the tumor stage ordinally: "stage i" -> 1, "stage ii" -> 2, ..., "stage x" -> 6.
# The mapping is applied with one dict-based replace whose result is assigned back to the
# frame. Calling replace(..., inplace=True) on a column selection is chained assignment:
# pandas warns about it and, with copy-on-write enabled, df_files is not updated at all.
# Labels that are not listed (e.g. "not reported") are left unchanged.
stage_order = ["stage i", "stage ii", "stage iii", "stage iv", "stage v", "stage x"]
stage_codes = {stage: code for code, stage in enumerate(stage_order, start=1)}
df_files["tumor_stage"] = df_files["cases.0.diagnoses.0.tumor_stage"].replace(stage_codes)

In [None]:
# Event indicator: 1 where the recorded vital status is "Dead", 0 otherwise (0 = Alive).
is_dead = df_files["cases.0.demographic.vital_status"].eq("Dead")
df_files["vital_status"] = is_dead.astype(int)

In [None]:
def get_survival(case):
    """Return the survival time of one case (one row of the metadata table).

    Parameters
    ----------
    case : mapping (e.g. a DataFrame row)
        Must expose "cases.0.demographic.vital_status",
        "cases.0.demographic.days_to_death" and
        "cases.0.diagnoses.0.days_to_last_follow_up".

    Returns
    -------
    The days to death when the case is recorded as "Dead", otherwise the days
    to the last follow-up.
    """
    # The raw column holds the strings "Dead"/"Alive" (it is what the integer
    # `vital_status` indicator is derived from), so it must be compared with the
    # label; comparing it with 1 never matches and discards every death time.
    if case["cases.0.demographic.vital_status"] == "Dead":
        return case["cases.0.demographic.days_to_death"]
    return case["cases.0.diagnoses.0.days_to_last_follow_up"]

# Row-wise survival time: one get_survival call per case.
df_files["days_survival"] = df_files.apply(get_survival, axis=1)

## Univariate analysis (one topic at a time)

In [None]:
%load_ext autoreload
%autoreload 2
import lifelines
import sys
from survival import fit_cox, add_group_to_subset, save_plot
from topicpy.hsbmpy import get_file
from scipy.stats import pearsonr

In [None]:
# Level index used in the names of the result files.
l = 1

# Model name; it appears both as a directory and as a file-name prefix of the result files.
algorithm = "trisbm"
# Pair of cluster-kind labels; the second entry selects the "<kind>-dist.csv" file.
#kind = ("word", "topic")
kind = ("keyword", "metadatum")

In [None]:
# Per-document distribution over the selected cluster kind, indexed by the file's second column.
# `axis` is passed by keyword: the positional form drop("i_doc", 1) raises TypeError on
# pandas >= 2.0. The path is built from `algorithm` and `l` so that it stays consistent with
# the other loaders instead of hard-coding "trisbm" and level 1.
df_clusters = pd.read_csv(
    "%s/%s/%s_level_%d_%s-dist.csv" % (data_dir, algorithm, algorithm, l, kind[1]),
    index_col=1,
).drop("i_doc", axis=1)

In [None]:
# Per-document distribution table with the tumor stage of each document attached,
# then averaged per stage.
df_topics = pd.read_csv("%s/%s/%s_level_%d_%s-dist.csv" % (data_dir, algorithm, algorithm, l, kind[1]))
df_topics = df_topics.set_index('doc')
df_topics.insert(0, 'stage', '')
df_topics = df_topics.drop('i_doc', axis=1)

labels = ["cases.0.diagnoses.0.tumor_stage"]
label = labels[0]

if label not in df_files.columns:
    raise AttributeError(f"{label} not valid. Available: {df_files.columns}")

# Fold the "a"/"b" sub-stages into their parent stage across the metadata table.
for letter in ["a", "b"]:
    for numeral in ["i", "ii", "iii", "iv"]:
        df_files.replace("stage %s%s" % (numeral, letter), "stage %s" % numeral, inplace=True)

# Look up each document's metadata and store its stage as a string;
# documents without metadata keep the empty placeholder.
for sample in df_topics.index.values:
    file_data = get_file(sample, df_files)
    if file_data is not None:
        df_topics.at[sample, 'stage'] = "%s" % (file_data[label])

# Drop documents whose stage is unreported, unset, "stage x" or the string "nan".
excluded_stages = ["not reported", "", "stage x", "nan"]
mask = ~df_topics["stage"].isin(excluded_stages)
df_topic_stage_gb = df_topics[mask].groupby(["stage"]).mean()

color_generator = (color for color in sns.palettes.color_palette(n_colors=20, palette=None))

In [None]:
def get_cohort(subtype):
    """Assign a cohort letter from a subtype label.

    Returns None when the string form of `subtype` is "nan" (missing value),
    "A" when the label contains "Basal", and "B" for every other label.
    """
    if str(subtype) == "nan":
        return None
    return "A" if "Basal" in subtype else "B"
# One cohort label per row, derived from the selected subtype.
df_files["cohort"] = [get_cohort(subtype) for subtype in df_files["Subtype_Selected"]]

In [None]:
# Univariate screen: fit one Cox model per topic and keep the summaries of the fits whose
# last summary row (presumably the topic term — confirm against fit_cox) has p < 0.01.
# Summaries are collected in a list and concatenated once at the end, because
# DataFrame.append was removed in pandas 2.0.
significant_summaries = []
mask = (~df_files["days_survival"].isna()) & (df_files["cases.0.diagnoses.0.tumor_stage"]!="not reported")
subset = df_files[mask]
# NOTE(review): this rebinding moves the current index into a column, so re-running this
# cell on its own resets the index a second time. Presumably it exposes the identifier
# column that is read later — confirm before removing or guarding it.
df_files = df_files.reset_index()
subset = subset[["days_survival","vital_status", "tumor_stage", "gender", "age_at_diagnosis"]]
for topic in df_clusters.columns:
    # Every cluster column is divided by its column sum before the grouping helper sees it.
    top_set = add_group_to_subset(topic, subset, df_clusters.divide(df_clusters.sum(0), axis=1), 0.35)
    summary, cph, ax = fit_cox(top_set, topic)
    if summary is None:
        continue
    # -log2(p) / log2(10) equals -log10(p), so "> 2" keeps p < 0.01 only.
    if not (summary.at[summary.index[-1], "-log2(p)"] / (np.log2(10)) > 2):
        continue

    print(topic,"\n",summary.loc[summary.index[-1],["coef", "p"]],"\n")
    significant_summaries.append(summary)
    if ax is not None:
        ax.set_title(ax.title.get_text(), fontsize=35)
        save_plot(ax, "", topic)

    # Coefficient plot of the fitted model, saved once per significant topic.
    figcph, axcph = plt.subplots(figsize=(18,15))
    cph.plot(ax=axcph,elinewidth=15)

    axcph.tick_params(labelsize=35)
    axcph.set_xlabel(axcph.get_xlabel(), fontsize=35)
    plt.tight_layout()
    figcph.savefig(f"survival_{topic}_HR.pdf")

# An empty frame is kept when no topic passes the filter, as before.
summaries = pd.concat(significant_summaries) if significant_summaries else pd.DataFrame()

In [None]:
# Display the Cox summaries collected for the topics that passed the significance filter.
summaries

## Multivariate

In [None]:
%load_ext watermark
%watermark -a Filippo_Valle -p pandas,scanpy,requests -m -v -g

In [None]:
from lifelines import CoxPHFitter

In [None]:
# Model name and level used to locate the distribution file for the multivariate fit.
algorithm = "trisbm"
l = 1

In [None]:
# Covariate table for the multivariate Cox fit: one row per case id, one column per topic,
# each value replaced by the position of the nearest of 25 quantile cuts of that topic.
df_covariates = pd.DataFrame(index=df_files["case_id"])

# `axis` is passed by keyword: the positional form drop("i_doc", 1) raises TypeError on pandas >= 2.0.
df_topics = pd.read_csv("%s/%s/%s_level_%d_metadatum-dist.csv"%(data_dir,algorithm,algorithm,l), index_col=1).drop("i_doc", axis=1)
# Each topic column is divided by its column sum.
df_topics_std = df_topics.divide(df_topics.sum(0), axis=1)

q = np.linspace(0, 1, 25)
#q = [0, 0.35, 1]

for topic in df_topics:
    # The comprehension variable is named `cut` so that it no longer shadows the grid `q`.
    cuts = df_topics_std[topic].quantile(q).values
    quantiles = np.argmin([np.abs(df_topics_std[topic] - cut) for cut in cuts], 0)
    df_covariates = df_covariates.join(pd.Series(name=topic, index = df_topics_std.index, data = quantiles), how="outer")


# NOTE(review): these are positional assignments. They assume that, after the outer joins,
# df_covariates still has exactly the rows of df_files in the same order — confirm, since an
# outer join may extend or reorder the index.
df_covariates["vital_status"] = df_files["vital_status"].values.astype(int)
df_covariates["time"] = df_files["days_survival"].values
#df_covariates["gender"] = df_files["gender"].values
#df_covariates["age"] = df_files["age_at_diagnosis"].values
#df_covariates["stage"] = df_files["tumor_stage"].values

#mask = (~df_covariates["time"].isna()) & (df_covariates["stage"]!="not reported")
# NOTE(review): `mask` is not defined in this cell; it is whatever an earlier cell left in
# the namespace and is aligned to this frame by index label — confirm this is intended.
df_covariates = df_covariates[mask]

In [None]:
# df_covariates was already filtered with `mask` when it was built, so it is previewed
# directly; applying the mask a second time only forces pandas to re-align the boolean
# key against the already-filtered index.
df_covariates.head(5)

In [None]:
# Multivariate Cox fit on the rows that have no missing value in any column.
cph = CoxPHFitter()
complete_cases = df_covariates.dropna()
res = cph.fit(complete_cases, duration_col='time', event_col='vital_status')

cph.print_summary()

In [None]:
# Forest plot of the fitted coefficients with their 95% intervals:
# every covariate in gray, the significant ones over-plotted in red.
figcph, axcph = plt.subplots(figsize=(18, 20))
#cph.plot(ax=axcph, elinewidth=15, capsize=1)

data = cph.summary
coefs = data["coef"]
row_labels = [name.replace("Metadatum", "Topic") for name in data.index]
# Asymmetric error bars: distance from the coefficient to the lower / upper 95% bound.
interval_widths = np.vstack([coefs - data["coef lower 95%"], data["coef upper 95%"] - coefs])

axcph.errorbar(
    x=coefs,
    y=row_labels,
    xerr=interval_widths,
    lw=0,
    marker="x",
    ms=25,
    c="gray",
    elinewidth=30,
    alpha=0.8,
)

# Reference line at a coefficient of zero.
axcph.vlines(x=0, ymin=0, ymax=data.shape[0], color="black", ls="--", lw=15, alpha=0.9)


#data = cph.summary[cph.summary.index.isin(["Metadatum 3", "Metadatum 7"])]
# -log2(p) / log2(10) equals -log10(p); rows above -log10(0.05) are kept.
is_significant = cph.summary["-log2(p)"] / np.log2(10) > -np.log10(0.05)
data = cph.summary[is_significant]
coefs = data["coef"]
row_labels = [name.replace("Metadatum", "Topic") for name in data.index]
interval_widths = np.vstack([coefs - data["coef lower 95%"], data["coef upper 95%"] - coefs])

axcph.errorbar(
    x=coefs,
    y=row_labels,
    xerr=interval_widths,
    lw=0,
    c="red",
    elinewidth=15,
)


axcph.tick_params(labelsize=35)
axcph.set_xlabel("Log(HR) (95% CI)", fontsize=35)
plt.tight_layout()
figcph.savefig("survival_HR.pdf")

In [None]:
# Distribution of the fitted Cox coefficients across covariates.
# An explicit figure/axes pair is used so the histogram can never be drawn onto a
# previously active axes, and the axes are labelled so the figure stands alone.
fig_coef, ax_coef = plt.subplots()
ax_coef.hist(cph.summary["coef"])
ax_coef.set_xlabel("Cox coefficient, log(HR)")
ax_coef.set_ylabel("Number of covariates");