In [None]:
import numpy as np
import pandas as pd
import pathlib as pl 
import seaborn as sns

from statsmodels.stats.multitest import multipletests

In [None]:
# Directory containing the per-cancer association result files read by the
# cells below (`<cancer>_survival.txt`, `<cancer>_subtype_kruskal.txt`,
# `<cancer>_clinical_kruskal.txt` and `<cancer>_clinical_cor.txt`).
basedir = pl.Path("/path/to/tcga/assoc/results")

In [None]:
# Dataset identifiers; each one is used as the file-name prefix of the result
# tables loaded from `basedir` and as the cancer label in the figures/sheets.
cancer_tcga = [
    "CRC_COAD",
    "CRC_ICM_COAD",
    "HGG_GBM",
    "Neftel_GBM",
    "ESO_ESCA",
    "BRCA_BRCA",
]

In [None]:
def _fdr_bh_qvalues(p_values):
    """Return Benjamini-Hochberg q-values for a 1-D collection of p-values.

    NaN entries are left out of the correction and are returned as NaN, so a
    single missing p-value cannot turn every q-value of the family into NaN.
    The output has the same length and order as the input.
    """
    p_values = np.asarray(p_values, dtype=float)
    q_values = np.full(p_values.shape, np.nan)
    tested = ~np.isnan(p_values)
    if tested.any():
        q_values[tested] = multipletests(p_values[tested], method="fdr_bh")[1]
    return q_values


# Per-cancer pieces, concatenated after the loop. Despite the "OR" in their
# names, the `full_OR_*` lists collect the `Cox_hazard` column.
full_q_corrected, full_q_uncorrected = [], []
full_OR_corrected, full_OR_uncorrected = [], []
labels_pc = []
for cancer in cancer_tcga:

    survival_link = pd.read_csv(basedir / f"{cancer}_survival.txt", sep="\t")

    # Restrict BOTH models to the meta-signature rows: the downstream cells
    # stack the two models on top of each other and therefore need the same
    # features (and the same FDR family) in each of them.
    metasig_link = survival_link[survival_link.feature.str.startswith("MetaSig", na=False)]

    # Rows flagged `Is_corrected` (labelled "Multivariable" further down).
    df = metasig_link[metasig_link["Is_corrected"]]
    corrected_q_values = pd.DataFrame(_fdr_bh_qvalues(df["Cox_pvalue"]), index=df.feature.to_numpy())
    full_q_corrected.append(corrected_q_values)
    full_OR_corrected.append(df[["Cox_hazard"]])

    # Remaining rows (labelled "Univariable" further down).
    df = metasig_link[~metasig_link["Is_corrected"]]
    uncorrected_q_values = pd.DataFrame(_fdr_bh_qvalues(df["Cox_pvalue"]), index=df.feature.to_numpy())
    full_q_uncorrected.append(uncorrected_q_values)
    full_OR_uncorrected.append(df[["Cox_hazard"]])

    # One cancer label per meta-signature column of the final tables.
    labels = [cancer] * corrected_q_values.shape[0]
    labels_pc.append(labels)
labels_pc = np.concatenate(labels_pc)

# Transpose so that every table is a single row with one column per
# (cancer, meta-signature) pair; the hazard tables get positional column
# labels, the q-value tables keep the feature names.
full_OR_corrected = pd.concat(full_OR_corrected, ignore_index=True).T
full_OR_uncorrected = pd.concat(full_OR_uncorrected, ignore_index=True).T

full_q_corrected = pd.concat(full_q_corrected).T
full_q_uncorrected = pd.concat(full_q_uncorrected).T

In [None]:
# Stack the two Cox models: first row = rows without `Is_corrected`,
# second row = rows with `Is_corrected`.
model_names = ["Univariable", "Multivariable"]

survival_plot_q = pd.concat([full_q_uncorrected, full_q_corrected])
survival_plot_q.columns = survival_plot_q.columns.str.replace(
    "MetaSignature_", "p-val Meta-sig. "
)
survival_plot_q.index = list(model_names)

# The hazard table carries positional column labels, so it borrows the
# q-value column names (minus the "p-val " prefix).
survival_plot_or = pd.concat([full_OR_uncorrected, full_OR_corrected])
survival_plot_or.columns = survival_plot_q.columns.str.replace("p-val ", "")
survival_plot_or.index = list(model_names)

In [None]:
# Supplementary sheet 1: hazard columns ("HR ...") next to the q-value columns
# ("p-val ..."), plus a final "Cancer" row naming the dataset of each column.
df2 = survival_plot_or.rename(columns=lambda name: name.replace("Meta", "HR Meta"))

# The cancer labels are repeated once for the HR half and once for the p-val half.
sheet1_columns = df2.columns.tolist() + survival_plot_q.columns.tolist()
labels_df = pd.DataFrame(
    np.tile(labels_pc, 2)[np.newaxis, :],
    columns=sheet1_columns,
    index=["Cancer"],
)

sheet1 = pd.concat([df2, survival_plot_q], axis=1)
sheet1 = pd.concat([sheet1, labels_df])

In [None]:
# One colour of the current seaborn palette per cancer (in sorted label order),
# then one colour per heatmap column.
sns_palette = sns.color_palette()
unique_cancers = np.unique(labels_pc)
color_mapping = {name: sns_palette[position] for position, name in enumerate(unique_cancers)}
color_labels = list(map(color_mapping.__getitem__, labels_pc))

In [None]:
# Heatmap of the Cox hazard values (colour scale centred on 1), annotated with
# the rounded hazards; cells whose q-value exceeds 0.1 are masked out.
g = sns.clustermap(
    survival_plot_or.fillna(0),
    cmap="vlag",
    center=1,
    mask=(survival_plot_q > 0.1).values,
    col_cluster=False,
    row_cluster=False,
    col_colors=color_labels,
    figsize=(12, 2),
    linewidth=1,
    colors_ratio=0.15,
    annot=survival_plot_or.round(1),
)
g.ax_heatmap.set_ylabel("")
# The rotation has to be a number (or "horizontal"/"vertical"); the string "0"
# is rejected by current matplotlib versions.
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
g.fig.savefig("path/to/fig/survival_link_tcga.svg", bbox_inches="tight")

# Subtype link

In [None]:
# cancer -> table with one row per (signature, feature) pair and its
# FDR-corrected Kruskal-Wallis q-value.
subtype_link = {}
for cancer in cancer_tcga:

    sbt = pd.read_csv(basedir / f"{cancer}_subtype_kruskal.txt", sep="\t")

    # Keep only the subtype annotation(s) used for this cancer.
    if cancer == "BRCA_BRCA":
        sbt = sbt[sbt.feature == "Subtype_mRNA"]
    elif cancer in ["HGG_GBM", "Neftel_GBM"]:
        sbt = sbt[sbt.feature.isin(["Subtype_mRNA", "Subtype_DNAmeth", "Subtype_Selected"])]
        # Build a new frame instead of assigning into the filtered slice
        # (the latter triggers pandas' SettingWithCopyWarning).
        sbt = sbt.assign(feature=sbt.feature.replace({"Subtype_Selected": "Subtype_other"}))
    elif cancer in ["CRC_COAD", "CRC_ICM_COAD", "ESO_ESCA"]:
        sbt = sbt[sbt.feature.isin(["Subtype_other"])]
    sbt = sbt.dropna()

    # Collapse the rows sharing a (signature, feature) pair into their mean.
    # `numeric_only=True` keeps the pre-2.0 pandas behaviour of ignoring any
    # text column instead of raising a TypeError.
    sbt = sbt.groupby(["signature", "feature"]).mean(numeric_only=True)

    # Benjamini-Hochberg correction over all remaining tests of this cancer.
    qvalues = multipletests(sbt.pvalue.to_numpy(), method="fdr_bh")[1]
    sbt["qvalue"] = qvalues

    subtype_link[cancer] = sbt.reset_index()

In [None]:
# Build a (feature x signature) table per cancer holding -log(min q-value) when
# at least one q-value of the pair is below 0.1 and NaN otherwise, then put the
# cancers side by side.
all_assoc = []
labels_pc = []
for cancer in cancer_tcga:
    link = subtype_link[cancer]
    signatures = link.signature.unique()

    feature_rows = []
    for feature in link.feature.unique():
        scores = {}
        for ms in signatures:
            q = link.loc[(link.signature == ms) & (link.feature == feature), "qvalue"]
            scores[ms] = -np.log(q.min()) if (q < 0.1).any() else np.nan
        # One row (the feature) with one column per signature.
        feature_rows.append(pd.DataFrame(scores, index=[feature]))

    cancer_assoc = pd.concat(feature_rows)
    all_assoc.append(cancer_assoc)

    # One cancer label per signature column.
    labels_pc.append([cancer] * cancer_assoc.shape[1])

labels_pc = np.concatenate(labels_pc)
all_assoc = pd.concat(all_assoc, axis=1)
    

In [None]:
# Fix the row order of the subtype features and prettify the labels.
column_order = ["Subtype_mRNA", "Subtype_DNAmeth", "Subtype_other"]
all_assoc = all_assoc.loc[column_order]
all_assoc.columns = all_assoc.columns.str.replace("MetaSignature_", "Meta-sig. ", regex=False)
all_assoc.index = all_assoc.index.str.replace("_", " ", regex=False)

# Supplementary sheet 2: the association table plus a "Cancer" row naming the
# dataset each column belongs to.
labels_df = pd.DataFrame(
    labels_pc[np.newaxis, :],
    columns=all_assoc.columns.tolist(),
    index=["Cancer"],
)
sheet2 = pd.concat([all_assoc.copy(), labels_df])

# One palette colour per cancer (sorted label order), one entry per column.
sns_palette = sns.color_palette()
unique_cancers = np.unique(labels_pc)
color_mapping = {name: sns_palette[position] for position, name in enumerate(unique_cancers)}
color_labels = list(map(color_mapping.__getitem__, labels_pc))

In [None]:
# The table holds -log(q) values; 2.3 is roughly -log(0.1), the q-value cut-off
# applied when the table was built. Everything below it is masked.
sign_q = 2.3
subtype_scores = all_assoc.fillna(0)
subtype_mask = all_assoc.fillna(0) < sign_q
g = sns.clustermap(
    subtype_scores,
    cmap="vlag",
    center=sign_q,
    vmin=0,
    vmax=7,
    mask=subtype_mask,
    col_cluster=False,
    row_cluster=False,
    col_colors=color_labels,
    figsize=(12, 2),
    linewidth=1,
    colors_ratio=0.15,
    cbar_kws={"label": "-log(p)"},
)
g.ax_heatmap.set_ylabel("")
g.fig.savefig("path/to/fig/subtype_link_tcga.svg", bbox_inches="tight")

# Clinical link

In [None]:
# cancer -> table of clinical associations (Kruskal-Wallis rows for gender and
# tumour stage, correlation rows for age) with FDR-corrected q-values.
clinical_link = {}
for cancer in cancer_tcga:

    clindf_kruskal = pd.read_csv(basedir / f"{cancer}_clinical_kruskal.txt", sep="\t")

    clindf_kruskal = clindf_kruskal[clindf_kruskal.feature.isin(["gender", "ajcc_pathologic_tumor_stage"])]
    # Collapse the rows sharing a (signature, feature) pair into their mean.
    # `numeric_only=True` keeps the pre-2.0 pandas behaviour of ignoring any
    # text column instead of raising a TypeError.
    clindf_kruskal = (
        clindf_kruskal.groupby(["signature", "feature"]).mean(numeric_only=True).reset_index()
    )

    clindf_cor = pd.read_csv(basedir / f"{cancer}_clinical_cor.txt", sep="\t")
    clindf_cor = clindf_cor[clindf_cor.col_names.isin(["age_at_initial_pathologic_diagnosis"])]
    # Harmonise the column names with the Kruskal-Wallis table so that the two
    # can be stacked: drop the "_cor" suffix, then rename the remaining ones.
    clindf_cor.columns = clindf_cor.columns.str.replace("_cor", "")
    clindf_cor = clindf_cor.rename(columns={"Rho_coef": "Ratio/Rho", "col_names": "feature"})
    clindf_cor = clindf_cor.replace({"age_at_initial_pathologic_diagnosis": "Age"})

    clindf = pd.concat([clindf_kruskal, clindf_cor], ignore_index=True)

    # Benjamini-Hochberg correction over all tests of this cancer. Missing
    # p-values are excluded from the correction and keep a NaN q-value:
    # `multipletests` does not special-case NaN, so a single one would
    # otherwise turn every q-value of the cancer into NaN.
    tested = clindf["pvalue"].notna()
    clindf["qvalue"] = np.nan
    if tested.any():
        clindf.loc[tested, "qvalue"] = multipletests(
            clindf.loc[tested, "pvalue"].to_numpy(), method="fdr_bh"
        )[1]

    clinical_link[cancer] = clindf

In [None]:
# Build a (feature x signature) table per cancer holding -log(min q-value) when
# at least one q-value of the pair is below 0.1 and NaN otherwise, then put the
# cancers side by side.
clin_assoc = []
labels_pc = []
for cancer in cancer_tcga:
    link = clinical_link[cancer]
    signatures = link.signature.unique()

    feature_rows = []
    for feature in link.feature.unique():
        scores = {}
        for ms in signatures:
            q = link.loc[(link.signature == ms) & (link.feature == feature), "qvalue"]
            scores[ms] = -np.log(q.min()) if (q < 0.1).any() else np.nan
        # One row (the feature) with one column per signature.
        feature_rows.append(pd.DataFrame(scores, index=[feature]))

    cancer_assoc = pd.concat(feature_rows)
    clin_assoc.append(cancer_assoc)

    # One cancer label per signature column.
    labels_pc.append([cancer] * cancer_assoc.shape[1])

labels_pc = np.concatenate(labels_pc)
clin_assoc = pd.concat(clin_assoc, axis=1)
    

In [None]:
# Prettify the column (signature) and row (clinical feature) labels.
clin_assoc.columns = clin_assoc.columns.str.replace("MetaSignature_", "Meta-sig. ", regex=False)
row_names = {"ajcc_pathologic_tumor_stage": "Stage", "gender": "Gender"}
clin_assoc = clin_assoc.rename(index=row_names)

# Supplementary sheet 3: the association table plus a "Cancer" row naming the
# dataset each column belongs to.
labels_df = pd.DataFrame(
    labels_pc[np.newaxis, :],
    columns=clin_assoc.columns.tolist(),
    index=["Cancer"],
)
sheet3 = pd.concat([clin_assoc.copy(), labels_df])

# One palette colour per cancer (sorted label order), one entry per column.
sns_palette = sns.color_palette()
unique_cancers = np.unique(labels_pc)
color_mapping = {name: sns_palette[position] for position, name in enumerate(unique_cancers)}
color_labels = list(map(color_mapping.__getitem__, labels_pc))

In [None]:
# The table holds -log(q) values; 2.3 is roughly -log(0.1), the q-value cut-off
# applied when the table was built. Everything below it is masked.
sign_q = 2.3
clinical_scores = clin_assoc.fillna(0)
clinical_mask = clin_assoc.fillna(0) < sign_q
g = sns.clustermap(
    clinical_scores,
    cmap="vlag",
    center=sign_q,
    vmin=0,
    vmax=7,
    mask=clinical_mask,
    col_cluster=False,
    row_cluster=False,
    col_colors=color_labels,
    figsize=(12, 2),
    linewidth=1,
    colors_ratio=0.15,
    cbar_kws={"label": "-log(p)"},
)
g.ax_heatmap.set_ylabel("")
g.fig.savefig("path/to/fig/clinical_link_tcga.svg", bbox_inches="tight")

### Write to Excel

For convenience, the CNV association table (Suppl. Table 5) is also written to the workbook here, although it is computed separately.

In [None]:
# (table, sheet name, caption) for every supplementary sheet written here.
supplementary_sheets = [
    (
        sheet1,
        'Suppl. Table 2',
        'Suppl. Table 2, Associations between CanSig meta-signatures and survival in TCGA. Univariate analysis corresponds to a Cox model with the score as parameter. Multivariate analysis corresponds to a Cox model with the score, the age, the stage and the tumor purity as input.',
    ),
    (
        sheet2,
        'Suppl. Table 3',
        'Suppl. Table 3, Associations between CanSig meta-signatures and known molecular subtypes in TCGA. The -log(p) significance of the FDR corrected p value of Kruskal Wallis test across groups is reported.',
    ),
    (
        sheet3,
        'Suppl. Table 4',
        # The Kruskal-Wallis test is run on stage and gender; age goes through
        # the correlation analysis.
        'Suppl. Table 4, Associations between CanSig meta-signatures and age, stage and gender in TCGA. For age, the Pearson correlation coefficient is reported, as well as the FDR corrected p value. For stage and gender, the Kruskal Wallis FDR-corrected p value is reported. The mean signature score in each group is reported',
    ),
]

# `write_string` is part of the xlsxwriter worksheet API, so the engine is
# pinned instead of relying on whichever writer pandas picks by default.
with pd.ExcelWriter('path/to/save/Suppl_Tables_2_5.xlsx', engine="xlsxwriter") as writer:
    for table, sheet_name, caption in supplementary_sheets:
        # The table starts at row 3 / column B, leaving cell A1 for the caption.
        # NOTE(review): na_rep writes the literal text "np.nan" into empty
        # cells - confirm this is the intended placeholder.
        table.to_excel(writer, sheet_name=sheet_name, na_rep="np.nan", startcol=1, startrow=2)
        worksheet = writer.sheets[sheet_name]
        worksheet.write_string(0, 0, caption)

In [None]:
# Load the CNV association table and keep the `metasig5*` columns of the rows
# with metasig5_qvalues < 0.05 ...
cnv_assoc = pd.read_csv("path/to/diff/cnv/metasig5/escc/cansig", index_col=0)
is_significant = cnv_assoc.metasig5_qvalues < 0.05
metasig5_columns = cnv_assoc.columns.str.startswith("metasig5")
cnv_assoc = cnv_assoc.loc[is_significant, metasig5_columns]

# ... then keep the rows where `metasig5_perc_gains` exceeds
# `metasig5_rest_gains` by more than 0.25.
gain_excess = cnv_assoc.metasig5_perc_gains - cnv_assoc.metasig5_rest_gains
cnv_assoc = cnv_assoc[gain_excess > 0.25]

In [None]:
# Append the CNV table as an extra sheet to the existing workbook (append mode
# needs the openpyxl engine) and put its caption in cell A1.
with pd.ExcelWriter('path/to/save/Suppl_Tables_2_5.xlsx', mode="a", engine="openpyxl") as writer:
    cnv_assoc.to_excel(
        writer,
        sheet_name='Suppl. Table 5',
        na_rep="np.nan",
        startcol=1,
        startrow=2,
    )
    worksheet = writer.sheets['Suppl. Table 5']
    worksheet.cell(
        row=1,
        column=1,
        value='Suppl. Table 5, Gains significantly associated with meta-signature 5 in ESCC.',
    )