In [None]:
import pandas as pd
import numpy as np
import scanpy as sc

import matplotlib.pyplot as plt
import seaborn as sns

from statannotations.Annotator import Annotator

from tqdm.notebook import tqdm

from scipy.stats import fisher_exact
import pathlib as pl
import os
from typing import List, Tuple

import pandas as pd
import scanpy as sc
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pathlib as pl
from statannotations.Annotator import Annotator

# Breast Wu 10X

In [None]:
# Load the Breast (Wu, 10X) metacell object.
metacells = sc.read_h5ad("/add/path/here/Breast_Wu_10X/metacells.h5ad")
# Binary flag: 1 when "HighMT_fraction_of_1" exceeds 0.3, otherwise 0.
metacells.obs["HighMT"] = (metacells.obs["HighMT_fraction_of_1"] > 0.3).astype(int)
# Work on a copy of the UMI layer: the normalisation steps below modify X in place,
# and X would otherwise share its array with layers["total_umis"].
metacells.X = metacells.layers["total_umis"].copy()
# Scale every metacell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(metacells, target_sum=10000)
sc.pp.log1p(metacells)

Score transcriptional states only in malignant cells

In [None]:
# Keep only metacells flagged as malignant; copy so the subset is independent of `metacells`.
is_malignant = metacells.obs["Malignant"] == 1
mal_meta = metacells[is_malignant].copy()

Load gene markers of the transcriptional states from Wu et al.

In [None]:
# Headerless, tab-separated marker table: column 0 is used as the gene name,
# column 1 as the integer gene-module id (1-7).
breast_states = pd.read_csv(
    "/add/path/here/auxiliary_data/breast_states_markers.txt",
    sep='\t',
    header=None,
)

# One gene list per gene module, in module order 1..7.
gm1, gm2, gm3, gm4, gm5, gm6, gm7 = (
    breast_states.loc[breast_states[1] == module_id, 0].tolist()
    for module_id in range(1, 8)
)

In [None]:
# Score each gene module on the malignant metacells; every score is stored under its
# own name (GM1..GM7) on `mal_meta`.
breast_signatures = [
    ("GM1", gm1),
    ("GM2", gm2),
    ("GM3", gm3),
    ("GM4", gm4),
    ("GM5", gm5),
    ("GM6", gm6),
    ("GM7", gm7),
]
for score_name, genes in breast_signatures:
    sc.tl.score_genes(mal_meta, gene_list=genes, score_name=score_name)

In [None]:
# Export the annotation columns plus the seven module scores for the plotting section.
brca_columns = ['cleaned_celltype', 'Malignant', 'HighMT',
                'GM1', 'GM2', 'GM3', 'GM4', "GM5", "GM6", "GM7"]
brca_scores = mal_meta.obs[brca_columns]
brca_scores.to_csv("/add/path/here/brca_states.csv")

# SCLC Chan

In [None]:
# Load the SCLC (Chan, 10X) metacell object.
metacells = sc.read_h5ad("/add/path/here/SCLC_Chan_10X/metacells.h5ad")
# Binary flag: 1 when "HighMT_fraction_of_1" exceeds 0.3, otherwise 0.
metacells.obs["HighMT"] = (metacells.obs["HighMT_fraction_of_1"] > 0.3).astype(int)
# Work on a copy of the UMI layer: the normalisation steps below modify X in place,
# and X would otherwise share its array with layers["total_umis"].
metacells.X = metacells.layers["total_umis"].copy()

# Scale every metacell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(metacells, target_sum=10000)
sc.pp.log1p(metacells)

Score transcriptional states in malignant cells

In [None]:
# Keep only metacells flagged as malignant; copy so the subset is independent of `metacells`.
is_malignant = metacells.obs["Malignant"] == 1
mal_meta = metacells[is_malignant].copy()

Provide gene signatures of the states from Zhang et al., doi: 10.21037/tlcr.2018.02.02

In [None]:
# Neuroendocrine (NE) signature genes.
ne = [
    "BEX1", "ASCL1", "INSM1", "CHGA", "TAGLN3",
    "KIF5C", "CRMP1", "SCG3", "SYT4", "RTN1",
    "MYT1", "SYP", "KIF1A", "TMSB15A", "SYN1",
    "SYT11", "RUNDC3A", "TFF3", "CHGB", "FAM57B",
    "SH3GL2", "BSN", "SEZ6", "TMSB15B", "CELF3",
]

# Non-neuroendocrine (non-NE) signature genes.
no_ne = [
    "RAB27B", "TGFBR2", "SLC16A5", "S100A10", "ITGB4",
    "YAP1", "LGALS3", "EPHA2", "S100A16", "PLAU",
    "ABCC3", "ARHGDIB", "CYR61", "PTGES", "CCND1",
    "IFITM2", "IFITM3", "AHNAK", "CAV2", "TACSTD2",
    "TGFBI", "EMP1", "CAV1", "ANXA1", "MYOF",
]


In [None]:
# Score the NE and non-NE signatures on the malignant metacells.
for score_name, genes in (("NE", ne), ("no_NE", no_ne)):
    sc.tl.score_genes(mal_meta, gene_list=genes, score_name=score_name)

In [None]:
# Export the annotation columns plus the two signature scores for the plotting section.
sclc_columns = ['cleaned_celltype', 'Malignant', 'HighMT', 'NE', 'no_NE']
sclc_scores = mal_meta.obs[sclc_columns]
sclc_scores.to_csv("/add/path/here/sclc_states.csv")

# Metastatic pancreas Raghavan 10X

In [None]:
# Load the metastatic pancreas (Raghavan, 10X) metacell object.
metacells = sc.read_h5ad("/add/path/here/Pancreas_Raghavan_10X/metacells.h5ad")
# Binary flag: 1 when "HighMT_fraction_of_1" exceeds 0.3, otherwise 0.
metacells.obs["HighMT"] = (metacells.obs["HighMT_fraction_of_1"] > 0.3).astype(int)
# Work on a copy of the UMI layer: the normalisation steps below modify X in place,
# and X would otherwise share its array with layers["total_umis"].
metacells.X = metacells.layers["total_umis"].copy()

# Scale every metacell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(metacells, target_sum=10000)
sc.pp.log1p(metacells)

Score transcriptional states in malignant cells

In [None]:
# Keep only metacells flagged as malignant; copy so the subset is independent of `metacells`.
is_malignant = metacells.obs["Malignant"] == 1
mal_meta = metacells[is_malignant].copy()

Use marker genes from Zhang et al., https://doi.org/10.1038/s41467-023-40727-7

In [None]:
# Tab-separated marker table; the "cluster" and "gene" columns are read below.
states = pd.read_csv("/add/path/here/auxiliary_data/PDAC_states_markers.txt", sep='\t')


def _cluster_genes(cluster_name):
    """Return the non-missing entries of the "gene" column for one "cluster" value."""
    in_cluster = states["cluster"] == cluster_name
    return states.loc[in_cluster, "gene"].dropna().tolist()


tff1 = _cluster_genes("Ductal cell TFF1")
rps3 = _cluster_genes("Ductal cell RPS3")
ceacam6 = _cluster_genes("Ductal cell CEACAM6")
malat1 = _cluster_genes("Ductal cell MALAT1")
mki67 = _cluster_genes("Ductal cell MKI67")
ceacam5 = _cluster_genes("Ductal cell CEACAM5")


In [None]:
# Score each ductal-cell state signature on the malignant metacells.
pdac_signatures = [
    ("TFF1", tff1),
    ("RPS3", rps3),
    ("CEACAM6", ceacam6),
    ("MALAT1", malat1),
    ("MKI67", mki67),
    ("CEACAM5", ceacam5),
]
for score_name, genes in pdac_signatures:
    sc.tl.score_genes(mal_meta, gene_list=genes, score_name=score_name)

In [None]:
# Export the annotation columns plus the six state scores for the plotting section.
metpan_columns = ['cleaned_celltype', 'Malignant', 'HighMT',
                  'TFF1', "RPS3", "CEACAM6", "MALAT1", "MKI67", "CEACAM5"]
metpan_scores = mal_meta.obs[metpan_columns]
metpan_scores.to_csv("/add/path/here/metpan_states.csv")

# Pancreas Steele 10X

In [None]:
# Load the pancreas (Steele, 10X) metacell object.
metacells = sc.read_h5ad("/add/path/here/Pancreas_Steele_10X/metacells.h5ad")
# Binary flag: 1 when "HighMT_fraction_of_1" exceeds 0.3, otherwise 0.
metacells.obs["HighMT"] = (metacells.obs["HighMT_fraction_of_1"] > 0.3).astype(int)
# Work on a copy of the UMI layer: the normalisation steps below modify X in place,
# and X would otherwise share its array with layers["total_umis"].
metacells.X = metacells.layers["total_umis"].copy()

# Scale every metacell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(metacells, target_sum=10000)
sc.pp.log1p(metacells)

Score transcriptional states in malignant cells

In [None]:
# Keep only metacells flagged as malignant; copy so the subset is independent of `metacells`.
is_malignant = metacells.obs["Malignant"] == 1
mal_meta = metacells[is_malignant].copy()

Use marker genes from Zhang et al., https://doi.org/10.1038/s41467-023-40727-7

In [None]:
# Tab-separated marker table; the "cluster" and "gene" columns are read below.
states = pd.read_csv("/add/path/here/auxiliary_data/PDAC_states_markers.txt", sep='\t')

# One gene list per ductal-cell state, missing gene entries removed.
pdac_clusters = (
    "Ductal cell TFF1",
    "Ductal cell RPS3",
    "Ductal cell CEACAM6",
    "Ductal cell MALAT1",
    "Ductal cell MKI67",
    "Ductal cell CEACAM5",
)
tff1, rps3, ceacam6, malat1, mki67, ceacam5 = (
    states.loc[states["cluster"] == cluster_name, "gene"].dropna().tolist()
    for cluster_name in pdac_clusters
)

In [None]:
# Score each ductal-cell state signature on the malignant metacells.
pdac_signatures = [
    ("TFF1", tff1),
    ("RPS3", rps3),
    ("CEACAM6", ceacam6),
    ("MALAT1", malat1),
    ("MKI67", mki67),
    ("CEACAM5", ceacam5),
]
for score_name, genes in pdac_signatures:
    sc.tl.score_genes(mal_meta, gene_list=genes, score_name=score_name)

In [None]:
# Export the annotation columns plus the six state scores for the plotting section.
pancreas_columns = ['cleaned_celltype', 'Malignant', 'HighMT',
                    'TFF1', "RPS3", "CEACAM6", "MALAT1", "MKI67", "CEACAM5"]
pancreas_scores = mal_meta.obs[pancreas_columns]
pancreas_scores.to_csv("/add/path/here/pancreas_states.csv")


# RCC Bi 10X

In [None]:
# Load the RCC (Bi, 10X) metacell object.
metacells = sc.read_h5ad("/add/path/here/RCC_Bi_10X/metacells.h5ad")
# Binary flag: 1 when "HighMT_fraction_of_1" exceeds 0.3, otherwise 0.
metacells.obs["HighMT"] = (metacells.obs["HighMT_fraction_of_1"] > 0.3).astype(int)
# Work on a copy of the UMI layer: the normalisation steps below modify X in place,
# and X would otherwise share its array with layers["total_umis"].
metacells.X = metacells.layers["total_umis"].copy()

# Scale every metacell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(metacells, target_sum=10000)
sc.pp.log1p(metacells)

Score transcriptional states in malignant cells

In [None]:
# Keep only metacells flagged as malignant; copy so the subset is independent of `metacells`.
is_malignant = metacells.obs["Malignant"] == 1
mal_meta = metacells[is_malignant].copy()

Use marker genes of the states from Bi et al.

In [None]:
# Tab-separated marker table with one column per state ("TP1", "TP2").
rcc_states = pd.read_csv(
    "/add/path/here/auxiliary_data/RCC_states_markers.txt",
    sep='\t',
)

# Columns may contain missing entries (presumably because the lists differ in length);
# drop them before scoring.
tp1, tp2 = (
    rcc_states[state_column].dropna().tolist()
    for state_column in ("TP1", "TP2")
)

In [None]:
# Score the TP1 and TP2 signatures on the malignant metacells.
for score_name, genes in (("TP1", tp1), ("TP2", tp2)):
    sc.tl.score_genes(mal_meta, gene_list=genes, score_name=score_name)

In [None]:
# Export the annotation columns plus the two state scores for the plotting section.
rcc_columns = ['cleaned_celltype', 'Malignant', 'HighMT', 'TP1', "TP2"]
rcc_scores = mal_meta.obs[rcc_columns]
rcc_scores.to_csv("/add/path/here/rcc_states.csv")

# LUAD Bischoff 10X

In [None]:
# Load the LUAD (Bischoff, 10X) metacell object.
metacells = sc.read_h5ad("/add/path/here/LUAD_Bischoff_10X/metacells.h5ad")
# Binary flag: 1 when "HighMT_fraction_of_1" exceeds 0.3, otherwise 0.
metacells.obs["HighMT"] = (metacells.obs["HighMT_fraction_of_1"] > 0.3).astype(int)
# Work on a copy of the UMI layer: the normalisation steps below modify X in place,
# and X would otherwise share its array with layers["total_umis"].
metacells.X = metacells.layers["total_umis"].copy()

# Scale every metacell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(metacells, target_sum=10000)
sc.pp.log1p(metacells)

Score transcriptional states in malignant cells

In [None]:
# Keep only metacells flagged as malignant; copy so the subset is independent of `metacells`.
is_malignant = metacells.obs["Malignant"] == 1
mal_meta = metacells[is_malignant].copy()

Use marker genes of the transcriptional states from Kim et al., https://doi.org/10.1038/s41467-020-16164-1

In [None]:
# Comma-separated marker table; the first column is used as the row index.
states = pd.read_csv(
    "/add/path/here/auxiliary_data/LUAD_states_markers.txt",
    sep=',',
    index_col=0,
)

In [None]:
# One gene list per state column, with missing entries removed.
ts1, ts2, ts3 = (
    states[state_column].dropna().tolist()
    for state_column in ("ts1", "ts2", "ts3")
)

In [None]:
# Score the three state signatures on the malignant metacells.
for score_name, genes in (("tS1", ts1), ("tS2", ts2), ("tS3", ts3)):
    sc.tl.score_genes(mal_meta, gene_list=genes, score_name=score_name)

In [None]:
# Export the annotation columns plus the three state scores for the plotting section.
# The file is written next to the other *_states.csv outputs, which is where the
# plotting section reads it from ("/add/path/here/luad_states.csv"); it was previously
# written into auxiliary_data/, so the later read could not find this output.
luad_columns = ['cleaned_celltype', 'Malignant', 'HighMT',
                'tS1', "tS2", "tS3"]
mal_meta.obs[luad_columns].to_csv("/add/path/here/luad_states.csv")

# UvealMelanoma Durante 10X

In [None]:
# Load the uveal melanoma (Durante, 10X) metacell object.
metacells = sc.read_h5ad("/add/path/here/UvealMelanoma_Durante_10X/metacells.h5ad")
# Binary flag: 1 when "HighMT_fraction_of_1" exceeds 0.3, otherwise 0.
metacells.obs["HighMT"] = (metacells.obs["HighMT_fraction_of_1"] > 0.3).astype(int)
# Work on a copy of the UMI layer: the normalisation steps below modify X in place,
# and X would otherwise share its array with layers["total_umis"].
metacells.X = metacells.layers["total_umis"].copy()

# Scale every metacell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(metacells, target_sum=10000)
sc.pp.log1p(metacells)

Score transcriptional states in malignant cells

In [None]:
# Keep only metacells flagged as malignant; copy so the subset is independent of `metacells`.
is_malignant = metacells.obs["Malignant"] == 1
mal_meta = metacells[is_malignant].copy()

Load gene markers of the transcriptional states from Durante et al.

In [None]:
# Tab-separated marker table with one column per state; no column is used as the index.
states = pd.read_csv(
    "/add/path/here/auxiliary_data/uveal_states_markers.txt",
    sep='\t',
    index_col=None,
)

In [None]:
# One gene list per state column, with missing entries removed.
uveal_state_columns = (
    "Class_1A_Primary",
    "Class_1B+PRAME+_Met",
    "Class_2_PRAME-_Primary",
    "Class_2_PRAME+_Primary",
)
(
    class_1a_primary,
    class_1b_pramePos_met,
    class_2_prameNeg_primary,
    class_2_pramePos_primary,
) = (states[state_column].dropna().tolist() for state_column in uveal_state_columns)

In [None]:
# Score each uveal melanoma state signature on the malignant metacells.
uveal_signatures = [
    ("1A_primary", class_1a_primary),
    ("1B_PRAMEpos_metastatic", class_1b_pramePos_met),
    ("2_PRAMEneg_primary", class_2_prameNeg_primary),
    ("2_PRAMEpos_primary", class_2_pramePos_primary),
]
for score_name, genes in uveal_signatures:
    sc.tl.score_genes(mal_meta, gene_list=genes, score_name=score_name)

In [None]:
# Export the annotation columns plus the four state scores for the plotting section.
uveal_columns = ['cleaned_celltype', 'Malignant', 'HighMT',
                 '1A_primary', "1B_PRAMEpos_metastatic", "2_PRAMEneg_primary", "2_PRAMEpos_primary"]
uveal_scores = mal_meta.obs[uveal_columns]
uveal_scores.to_csv("/add/path/here/uveal_states.csv")


# Plot scores from all studies

In [None]:
def _read_states(path):
    """Read one exported score table, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# Reload the per-study score tables written by the sections above.
sclc = _read_states("/add/path/here/sclc_states.csv")
uveal = _read_states("/add/path/here/uveal_states.csv")
brca = _read_states("/add/path/here/brca_states.csv")
metpan = _read_states("/add/path/here/metpan_states.csv")
luad = _read_states("/add/path/here/luad_states.csv")
pancreas = _read_states("/add/path/here/pancreas_states.csv")
rcc = _read_states("/add/path/here/rcc_states.csv")

In [None]:
# The metastatic-pancreas table uses the same state names as the primary-pancreas table,
# so its score columns (everything after the first three annotation columns) get a
# "Met_" prefix to keep the two studies apart after concatenation.
# Columns that already carry the prefix are left alone, so re-running this cell does
# not produce "Met_Met_*" names.
metpan_annotation_cols = set(metpan.columns[:3])
metpan = metpan.rename(
    columns=lambda col: col
    if col in metpan_annotation_cols or col.startswith("Met_")
    else "Met_" + col
)

# Stack all studies row-wise; score columns a study does not have are filled with NaN.
# (Named `all_states` rather than `all` so the builtin `all` is not shadowed.)
all_states = pd.concat([sclc, uveal, brca, metpan, luad, pancreas, rcc], axis=0)

# Keep HighMT plus the score columns; drop the two remaining annotation columns by name.
sub = all_states.drop(columns=["cleaned_celltype", "Malignant"])
# Long format: one row per (metacell, state) pair. dropna() removes the NaN rows
# created for states that do not belong to a metacell's study.
sub2 = pd.melt(sub, id_vars=["HighMT"]).dropna()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

# x-axis order: the score columns of each study (everything after the first three
# annotation columns), grouped study by study.
order = (
    rcc.columns[3:].tolist()
    + sclc.columns[3:].tolist()
    + brca.columns[3:].tolist()
    + pancreas.columns[3:].tolist()
    + metpan.columns[3:].tolist()
    + uveal.columns[3:].tolist()
    + luad.columns[3:].tolist()
)
# One comparison per state: HighMT == 0 versus HighMT == 1.
pairs = [((x, 0), (x, 1)) for x in order]
hue_order = [0, 1]

# Draw on the axes created above (passed explicitly instead of relying on the current axes).
# NOTE(review): `boxprops` is kept as in the original call; confirm it has an effect on
# violin plots with the installed seaborn version.
ax = sns.violinplot(data=sub2, x="variable", y="value", boxprops={'alpha': 0.4}, dodge=True, hue="HighMT",
                    order=order, hue_order=hue_order, ax=ax)

annotator = Annotator(ax, pairs, data=sub2, x="variable", y="value", hue="HighMT",
                      order=order, hue_order=hue_order)

# Mann-Whitney test per state, shown as significance stars above the axes.
annotator.configure(test='Mann-Whitney', text_format='star', loc='outside', text_offset=1, show_test_name=False)
annotator.apply_and_annotate()

plt.xticks(rotation=45, ha="right")
fig.show()
# bbox_inches="tight" keeps the rotated tick labels and the annotations drawn outside
# the axes inside the saved PDF instead of clipping them at the figure edge.
fig.savefig("/add/path/here/all_states.pdf", format="pdf", bbox_inches="tight")


Plot a subset of the results for the main figure

In [None]:
# States shown in the main figure.
subset = ["TP1", "NE", "GM1", "GM5", "GM7", "TFF1", "MALAT1",
          "Met_TFF1", "Met_MALAT1", "1B_PRAMEpos_metastatic", "tS2"]

# Restrict the long-format scores to those states and keep the x-axis order of the full plot.
sub3 = sub2[sub2.variable.isin(subset)].copy()
order2 = [x for x in order if x in subset]

fig, ax = plt.subplots(1, 1, figsize=(15, 2.5))
# One comparison per state: HighMT == 0 versus HighMT == 1.
pairs = [((x, 0), (x, 1)) for x in order2]
hue_order = [0, 1]

# Draw on the axes created above (passed explicitly instead of relying on the current axes).
# NOTE(review): `boxprops` is kept as in the original call; confirm it has an effect on
# violin plots with the installed seaborn version.
ax = sns.violinplot(data=sub3, x="variable", y="value", boxprops={'alpha': 0.4}, dodge=True, hue="HighMT",
                    order=order2, hue_order=hue_order, ax=ax)

annotator = Annotator(ax, pairs, data=sub3, x="variable", y="value", hue="HighMT",
                      order=order2, hue_order=hue_order)

# Mann-Whitney test per state, annotated in the 'simple' text format above the axes.
annotator.configure(test='Mann-Whitney', text_format='simple', loc='outside', text_offset=1, show_test_name=False)
annotator.apply_and_annotate()

plt.xticks(rotation=45, ha="right")
fig.show()
# bbox_inches="tight" keeps the rotated tick labels and the annotations drawn outside
# the axes inside the saved PDF instead of clipping them at the figure edge.
fig.savefig("/add/path/here/highMT_states.pdf", format="pdf", bbox_inches="tight")
