In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import defaultdict
from gtfparse import read_gtf
from lifelines import CoxPHFitter
from scipy.stats import kruskal
from sklearn.preprocessing import StandardScaler
from statannot import add_stat_annotation
from statsmodels.stats.multitest import multipletests

In [None]:
def pretty_ax(ax):
    """Apply the notebook's minimal axis styling to ``ax`` in place.

    The top and right spines are hidden, the bottom and left spines are drawn
    with a 1.5 line width, and the tick configuration below is applied to
    both axes. Nothing is returned.
    """
    # Remove the box around the plot, keeping only the two axis lines.
    for hidden_side in ("right", "top"):
        ax.spines[hidden_side].set_visible(False)

    # Bottom tick marks stay on, top and left tick marks are switched off,
    # and tick labels are kept on both the bottom and the left.
    tick_options = {
        "axis": 'both',
        "which": 'both',
        "bottom": True,
        "top": False,
        "left": False,
        "labelbottom": True,
        "labelleft": True,
    }
    ax.tick_params(**tick_options)

    for kept_side in ("bottom", "left"):
        ax.spines[kept_side].set_linewidth(1.5)

In [None]:
def get_survival_sig(sig, std_fpkm, clinmat, corrected = False):
    """Fit a Cox proportional-hazards model on a signature score.

    The score of each sample is the mean of the columns of ``std_fpkm`` whose
    names appear in ``sig``. It is joined column-wise to ``clinmat`` and used
    as a covariate with ``OS.time`` as duration and ``OS`` as event.

    Parameters
    ----------
    sig : iterable
        Gene names making up the signature; names missing from
        ``std_fpkm.columns`` are ignored.
    std_fpkm : pd.DataFrame
        Expression matrix with samples as rows and genes as columns.
    clinmat : pd.DataFrame
        Clinical table indexed like ``std_fpkm``. Must contain ``OS`` and
        ``OS.time``; with ``corrected=True`` it must also contain ``Purity``
        and ``age`` (``stage`` is used only when present).
    corrected : bool, default False
        When True, ``Purity``, ``stage`` (if available) and ``age`` are added
        as covariates next to the score.

    Returns
    -------
    tuple
        ``(cph.summary, cph)``: the fitted model's summary table and the
        fitted ``CoxPHFitter`` itself.
    """
    scores = std_fpkm[std_fpkm.columns.intersection(sig)].mean(axis=1)
    scores.name = "score"
    # Use the table handed in by the caller; the previous version silently
    # read the notebook-level `clin` variable and ignored this argument.
    augclin = pd.concat([clinmat, scores], axis=1)

    # Covariate order is kept as score, Purity, stage, age so the summary
    # rows come out in the same order as before.
    covariates = ["score"]
    if corrected:
        covariates.append("Purity")
        if "stage" in augclin.columns:
            covariates.append("stage")
        covariates.append("age")

    # Rows with a missing value in any modelled column are dropped before fitting.
    model_input = augclin[["OS", "OS.time"] + covariates].dropna().astype(float)

    cph = CoxPHFitter()
    cph.fit(model_input, duration_col="OS.time", event_col="OS")
    return cph.summary, cph

In [None]:
def get_genes_region(gene_annotation: pd.DataFrame, chrom_gain_sign: pd.DataFrame) -> dict:
    """Collect the genes lying entirely inside each region of one chromosome.

    Parameters
    ----------
    gene_annotation : pd.DataFrame
        Annotation table with ``seqname``, ``start``, ``end`` and
        ``gene_name`` columns.
    chrom_gain_sign : pd.DataFrame
        Three-column table (chromosome, start, end), one row per region.
        The chromosome is read from the first row only, so all rows are
        expected to belong to the same chromosome.

    Returns
    -------
    dict
        Maps ``"<chromosome>:<start>-<end>"`` to a NumPy array with the names
        of the genes whose ``start`` and ``end`` both fall within the region.
        An empty ``chrom_gain_sign`` yields an empty dict.
    """
    region_genes = {}
    if chrom_gain_sign.shape[0] == 0:
        # Nothing to look up; the old code raised on `iloc[0]` here.
        return region_genes

    chromosome = chrom_gain_sign.iloc[0, 0]
    chromdf = gene_annotation[gene_annotation["seqname"] == chromosome]

    for _, region_start, region_end in chrom_gain_sign.itertuples(index=False, name=None):
        # Coordinates may arrive as strings (they are parsed from region names).
        region_start = int(region_start)
        region_end = int(region_end)

        # A gene counts only when it is fully contained in the region.
        inside = (chromdf["start"] >= region_start) & (chromdf["end"] <= region_end)
        name = f"{chromosome}:{region_start}-{region_end}"
        # `.to_numpy()` replaces the deprecated `Series.ravel()`.
        region_genes[name] = chromdf.loc[inside, "gene_name"].to_numpy()
    return region_genes

In [None]:
fpkm_dir = "/path/to/fpkm/tcga/data"

# Load the ESCA expression table, then keep only the first occurrence of any
# repeated row label and of any repeated column label.
fpkm = (
    pd.read_csv(os.path.join(fpkm_dir, "ESCA_gene_fpkm.csv"), index_col=0)
    .loc[lambda table: ~table.index.duplicated(keep="first")]
    .loc[:, lambda table: ~table.columns.duplicated(keep="first")]
)

# Pre-processed clinical matrix, indexed by its first column.
clin_dir = "/path/to/preprocessed/clinical/data/"
clin = pd.read_csv(os.path.join(clin_dir, "ESCA_clin_mat.csv"), index_col=0)

In [None]:
# Purity table covering several cancer types; only the ESCA rows are used.
purity = pd.read_csv("path/to/ESTIMATE/purity", index_col=0)
cancer_purity = purity.loc[purity["cancer_type"] == "ESCA", "TumorPurity"]

In [None]:
# Remove trailing "A", then "B", then "C" characters from the purity sample ids
# (presumably the TCGA vial letter, so the ids line up with `clin` - confirm).
cancer_purity.index = (
    cancer_purity.index.str.rstrip("A").str.rstrip("B").str.rstrip("C")
)

# Stripping can make ids collide; keep the first entry for each id.
cancer_purity = cancer_purity[~cancer_purity.index.duplicated(keep="first")]

# Attach purity to the clinical table, drop rows with any missing value and
# expose the purity column under the name "Purity".
clin = (
    pd.concat([clin, cancer_purity], axis=1)
    .dropna()
    .rename(columns={"CPE": "Purity", "TumorPurity": "Purity"})
)
clin = clin[~clin.index.duplicated(keep="first")]

In [None]:
# Collapse the stage labels to a numeric stage 1-4; "Stage X" and
# "[Discrepancy]" become missing values.
stage_mapping = {"Stage X": np.nan, "Stage IV": 4, "Stage IVA": 4, "Stage IIB": 2, "Stage IA": 1, "Stage IIIA": 3, "Stage IIA": 2, "Stage IIC": 2, 
                 "Stage I": 1, "Stage IIIC": 3, "Stage IB": 1, "Stage IIIB": 3, 'Stage III': 3, "Stage II": 2, "I/II NOS": 1,
                 "[Discrepancy]": np.nan}

# Flip the sign of age, then z-score it.
# NOTE(review): the sign flip suggests age is stored as a negative quantity
# (e.g. days relative to birth) - confirm against the clinical file.
clin["age"] = -clin["age"]
clin["age"] = (clin["age"] - clin["age"].mean()) / clin["age"].std()

# The former trailing `.dropna()` was dropped: assigning back into the column
# realigns on the index, so the missing stages reappeared as NaN anyway.
# Samples with NaN stage stay in `clin`; models that use `stage` exclude them
# through their own `dropna()`.
clin["stage"] = clin["stage"].replace(stage_mapping)

In [None]:
# Gencode id -> gene name table, indexed by the gencode id used in the
# expression matrix.
gencode = pd.read_csv("path/to/gencode/mapping").set_index('gencode_id_gex')

# Build the id -> name dictionary for the ids present in both tables.
mapping_gen = gencode.loc[fpkm.columns.intersection(gencode.index), "gene_name"].to_dict()

# Keep only the mapped expression columns and relabel them with gene names.
fpkm = (
    fpkm.loc[:, fpkm.columns.intersection(gencode.index)]
    .rename(columns=mapping_gen)
)

In [None]:
# Standardise every expression column, keeping the original row and column
# labels on the result.
ss = StandardScaler()
std_fpkm = pd.DataFrame(
    data=ss.fit(fpkm).transform(fpkm),
    index=fpkm.index,
    columns=fpkm.columns,
)

In [None]:
# Directory scanned below for signature files; each file becomes one entry of `all_sigs`.
signature_dir = "path/to/esophag/metasignatures/from/cansig"

In [None]:
# Read every signature CSV in `signature_dir` into `all_sigs`, keyed by the
# file name without its ".csv" suffix.
all_sigs = {}
for s in sorted(os.listdir(signature_dir)):
    # Skip anything that is not a CSV (e.g. `.ipynb_checkpoints`, `.DS_Store`),
    # which would otherwise make `read_csv` fail.
    if not s.endswith(".csv"):
        continue
    sig = s[: -len(".csv")]
    sig_table = pd.read_csv(os.path.join(signature_dir, s), index_col=0)
    # Keep the values of the first 50 rows, flattened to 1-D
    # (presumably the 50 top-ranked genes of the signature - confirm).
    all_sigs[sig] = sig_table.iloc[:50].values.ravel()

In [None]:
# Cox models adjusted for purity, stage and age: one fit per signature, with
# the summary rows relabelled so every row name carries the signature name.
corrected_summaries = []
for s in sorted(all_sigs):
    sig_cph = get_survival_sig(all_sigs[s], std_fpkm, clin, corrected=True)[0]
    row_labels = {
        "score": s,
        "Purity": f"purity_{s}",
        "age": f"age_{s}",
        "stage": f"stage_{s}",
    }
    sig_cph = sig_cph.rename(index=row_labels)
    corrected_summaries.append(sig_cph)
corrected_df_new = pd.concat(corrected_summaries)

In [None]:
# Display the stacked summaries of the covariate-adjusted Cox models.
corrected_df_new

In [None]:
# Cox models with the signature score as the only covariate: one fit per
# signature, with the summary rows relabelled by signature name.
uncorrected_summaries = []
for s in sorted(all_sigs):
    sig_cph = get_survival_sig(all_sigs[s], std_fpkm, clin, corrected=False)[0]
    row_labels = {
        "score": s,
        "Purity": f"purity_{s}",
        "age": f"age_{s}",
        "stage": f"stage_{s}",
    }
    sig_cph = sig_cph.rename(index=row_labels)
    uncorrected_summaries.append(sig_cph)
uncorrected_df_new = pd.concat(uncorrected_summaries)

In [None]:
# Display the stacked summaries of the score-only Cox models.
uncorrected_df_new

In [None]:
# Per-sample score of the "metasig5" signature: mean standardised expression
# over the signature genes present in the expression matrix.
metasig5_genes = std_fpkm.columns.intersection(all_sigs["metasig5"])
scores = std_fpkm[metasig5_genes].mean(axis=1).rename("Signature score")

# Clinical table with the score appended as an extra column.
augclin = pd.concat([clin, scores], axis=1)

In [None]:
# Spearman rank correlation between every pair of columns of `augclin`
# (the clinical covariates plus the metasig5 "Signature score").
# NOTE(review): assumes all columns of `augclin` are numeric at this point - confirm.
augclin.corr(method="spearman")

In [None]:
# Turn the numeric stage into plot labels, pooling stages 3 and 4.
stage_labels = {1: "I", 2: 'II', 3: "III/IV", 4: "III/IV"}
augclin["stage"] = augclin["stage"].replace(stage_labels)

In [None]:
# Violin plot of the signature score per stage group, annotated with
# Mann-Whitney tests of stage I against the two other groups.

# Set the font size before the axes exist so it also applies to the tick
# labels on the first run of this cell.
plt.rcParams.update({'font.size': 15})
stage_order = ["I", "II", "III/IV"]

fig, ax = plt.subplots(1,1)
sns.violinplot(data=augclin, x="stage", y="Signature score", order=stage_order, ax=ax)
pretty_ax(ax)
add_stat_annotation(data=augclin, x="stage", y="Signature score", order=stage_order, ax=ax,
                    box_pairs=[("I","II"),("I","III/IV")],
                    test="Mann-Whitney")
ax.set_xlabel("Stage")

# Select the score column before aggregating so the median is not computed
# on unrelated (possibly non-numeric) clinical columns.
medians = augclin.groupby(by="stage")["Signature score"].median().round(2)

# Write each group's median just to the right of its violin's centre line.
for xpos, stage_label in enumerate(stage_order):
    ax.text(xpos + 0.05, medians[stage_label], f"{medians[stage_label]}",
            fontdict={"size": 11}, c="black")

# `tight_layout` is not a savefig option; use the same tight bounding box as
# the other figure saved in this notebook.
fig.savefig("path/to/fig/signature_stage.svg", bbox_inches="tight")

In [None]:
# Benjamini-Hochberg correction of the p-values in `allps`.
# NOTE(review): `allps` is not assigned in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel - define it or remove the cell.
multipletests(allps, method="fdr_bh")

In [None]:
# Thresholded CNV calls, transposed after loading; later cells look genes up
# in the columns of `esca_cnv`.
esca_cnv = pd.read_csv(
    "/path/to/thresholded/cnv/TCGA",
    sep="\t",
    index_col=0,
).transpose()

# Gene annotation parsed from the Gencode GTF.
# NOTE(review): later cells index this with pandas syntax - confirm the
# installed gtfparse version returns a pandas DataFrame.
gene_annotation = read_gtf('/path/to/gencode/v40/gtf')

In [None]:
# Significant gained regions; the index holds names of the form
# "<chromosome>:<start>-<end>".
sign_gains = pd.read_csv("/path/to/saved/significant/gains/escc/cansig", index_col=0)

# Split every region name into chromosome and position, then the position
# into start and end (all kept as strings).
name_parts = sign_gains.index.str.split(":")
chroms = name_parts.str[0]
poss = name_parts.str[1]
pos_parts = poss.str.split("-")
starts = pos_parts.str[0]
ends = pos_parts.str[1]

# One row per region with its three components as columns.
df_sign_gain = pd.DataFrame(
    np.column_stack([chroms, starts, ends]),
    columns=["chromosome", "start", "end"],
)

In [None]:
# Keep only the annotation rows whose feature type is "gene".
gene_annotation = gene_annotation.loc[gene_annotation["feature"] == "gene"]

In [None]:
# Chromosomes that carry at least one significant gained region.
unique_chroms = df_sign_gain["chromosome"].unique()

In [None]:
# For every significant gained region, compare the signature score of patients
# with the region gained against patients with the region neutral.
p_values = {}
cnv_gene_scores = {}
for chromosome in unique_chroms:
    chrom_gain_sign = df_sign_gain[df_sign_gain["chromosome"]==chromosome]
    region_genes = get_genes_region(gene_annotation=gene_annotation, chrom_gain_sign=chrom_gain_sign)

    for region_name, genes_in_region in region_genes.items():
        # Genes of the region that have CNV calls.
        selcnvgenes = esca_cnv.columns.intersection(genes_in_region)
        region_cnv = esca_cnv[selcnvgenes]

        # Per patient: number of region genes called gained / lost.
        selgenesgain = (region_cnv > 0).astype(int).sum(axis=1)
        selgenesloss = (region_cnv < 0).astype(int).sum(axis=1)

        # Patients with any loss in the region are left out entirely.
        patinclude = selgenesloss[selgenesloss == 0].index
        selgenesgain = selgenesgain.loc[patinclude]

        # The region counts as gained when more than half of its genes are gained.
        patgains = (selgenesgain > (len(selcnvgenes) / 2)).astype(int)
        patgains.name = "region_gained"

        cnv_gene_score = pd.concat([scores, patgains], axis=1).dropna()
        cnv_gene_scores[region_name] = cnv_gene_score

        # `.to_numpy()` replaces the deprecated `Series.ravel()`.
        region_status = cnv_gene_score["region_gained"]
        score_neutral = cnv_gene_score.loc[region_status == 0, "Signature score"].to_numpy()
        score_gained = cnv_gene_score.loc[region_status == 1, "Signature score"].to_numpy()

        # A region with an empty group cannot be tested. Recording its NaN
        # p-value can propagate through the Benjamini-Hochberg step and wipe
        # out every q-value, so such regions are skipped.
        if len(score_neutral) == 0 or len(score_gained) == 0:
            continue
        p_values[region_name] = kruskal(score_neutral, score_gained)[1]

In [None]:
# Benjamini-Hochberg adjusted p-values, in the insertion order of `p_values`.
_, q_values, _, _ = multipletests(list(p_values.values()), method="fdr_bh")

In [None]:
# Names of the regions whose adjusted p-value is below 0.05.
region_names = np.array(list(p_values))
sign_regions = region_names[q_values < 0.05]

In [None]:
# Split each significant region name at ":"; the first piece is the chromosome.
list_regions = np.char.split(sign_regions, sep=":")

In [None]:
# Group the significant region names by chromosome.
big_regions = defaultdict(list)
for name_pieces, region in zip(list_regions, sign_regions):
    big_regions[name_pieces[0]].append(region)

In [None]:
# Merge the significant regions of each chromosome into one per-patient call:
# a chromosome counts as gained for a patient when at least half of its
# significant regions are gained in that patient.
big_region_scores = {}
for chrom, chrom_regions in big_regions.items():
    region_tables = [cnv_gene_scores[reg] for reg in chrom_regions]

    # Number of gained regions per patient; a patient missing from a region's
    # table contributes nothing for that region.
    gained_count = pd.concat(
        [table["region_gained"] for table in region_tables], axis=1
    ).sum(axis=1)
    chrom_gained = (gained_count >= (len(chrom_regions) / 2)).astype(int)

    # Collect the signature score from every region table. The previous code
    # used only the last region visited (leaked loop variable), which left
    # NaN scores for patients absent from that one table.
    signature = pd.concat([table["Signature score"] for table in region_tables])
    signature = signature[~signature.index.duplicated(keep="first")]

    chrom_table = pd.concat([chrom_gained, signature], axis=1)
    chrom_table.columns = ["Region gained", "Signature score"]
    big_region_scores[chrom] = chrom_table

In [None]:
# One violin plot per chromosome: signature score of patients with the
# chromosome's significant regions gained versus neutral.
plt.rcParams.update({'font.size': 15})
status_labels = {0: "Neutral", 1: "Gain/Amplif."}
status_order = ["Neutral", "Gain/Amplif."]

for region_name in big_region_scores:
    # Rows without a score are never drawn, so drop them before counting.
    cnv_gene_plot = big_region_scores[region_name].dropna(subset=["Signature score"]).copy()
    # Relabel only the status column; a DataFrame-wide replace would also
    # rewrite any score that happens to equal 0 or 1.
    cnv_gene_plot["Region gained"] = cnv_gene_plot["Region gained"].map(status_labels)

    vc = cnv_gene_plot["Region gained"].value_counts()
    n_gained = vc.loc["Gain/Amplif."]
    n_neutral = vc.loc["Neutral"]
    print(cnv_gene_plot.groupby(by="Region gained").describe())

    fig, ax = plt.subplots(1,1)
    sns.violinplot(data=cnv_gene_plot, x="Region gained", y="Signature score", order=status_order, ax=ax)
    pretty_ax(ax)
    add_stat_annotation(data=cnv_gene_plot, x="Region gained", y="Signature score", order=status_order,
                        ax=ax,
                        box_pairs=[("Neutral","Gain/Amplif.")],
                        test="Mann-Whitney")
    ax.set_xlabel(f"CNV status {region_name}")
    ax.set_xticklabels([f"Neutral n={n_neutral}",f"Gain/Amplif. n={n_gained}"])

    # Median score per group as plain floats (calling float() on a
    # one-element Series is deprecated).
    medians = cnv_gene_plot.groupby(by="Region gained")["Signature score"].median().round(2)
    print(medians)
    for xpos, status in enumerate(status_order):
        group_median = float(medians[status])
        ax.text(xpos + 0.05, group_median, f"{group_median}", fontdict={"size": 15}, c="w")

    fig.savefig(f"path/to/fig/cnv_signature_{region_name}.svg",bbox_inches="tight")