In [None]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

from scipy.stats import mannwhitneyu, pearsonr

from statsmodels.stats.multitest import multipletests

In [None]:
from statannotations.Annotator import Annotator

In [None]:
def pretty_ax(ax):
    """Apply the notebook's minimal axes style to ``ax`` in place.

    Hides the top and right spines, sets the tick/label visibility flags
    listed below for both axes, and draws the bottom and left spines with a
    line width of 1.5. Returns nothing.
    """
    for hidden_side in ("right", "top"):
        ax.spines[hidden_side].set_visible(False)

    tick_style = {
        "axis": "both",
        "which": "both",
        "bottom": True,
        "top": False,
        "left": False,       # no tick marks on the left ...
        "labelbottom": True,
        "labelleft": True,   # ... but the left tick labels stay visible
    }
    ax.tick_params(**tick_style)

    for thick_side in ("bottom", "left"):
        ax.spines[thick_side].set_linewidth(1.5)

In [None]:
# Genes of the ABC transporter family, as given in the KEGG database.
kegg_ABC = [
    "ABCA2", "ABCC4", "ABCG8", "ABCA3", "ABCC5", "ABCC2", "ABCA1", "ABCC3", "ABCA6",
    "ABCC8", "ABCA7", "ABCC9", "ABCA4", "ABCC6", "ABCA5", "TAP2", "ABCA8", "TAP1",
    "ABCA9", "ABCA10", "ABCB10", "ABCA12",
    "ABCB11", "ABCC10", "ABCG1", "ABCG4", "ABCC1", "ABCG5", "ABCG2", "CFTR",
    "ABCB4", "ABCB1", "ABCD3", "ABCD4",
    "ABCB7", "ABCB8", "ABCB5", "ABCB6", "ABCB9", "ABCA13", "ABCC11", "ABCC12",
    "DEFB1", "ABCD1", "ABCD2",
]

Before running the cells below, download the required data files from the DepMap portal: https://depmap.org/portal/data_page/?tab=allData

In [None]:
# Expected-count matrix and the profile table used to relabel its rows.
expected_counts = pd.read_csv("/add/path/here/OmicsExpressionGenesExpectedCountProfile.csv",index_col=0)
profile_mapping = pd.read_csv("/add/path/here/OmicsProfiles.csv",index_col=0)

# Relabel rows with the ModelID column of the profile table and keep only
# the first space-separated token of every column label.
expected_counts = expected_counts.rename(index=profile_mapping["ModelID"])
expected_counts.columns = expected_counts.columns.str.split(" ").str[0]

# Share of counts coming from columns whose label starts with "MT-".
# Note: despite the "pct" name this is a ratio (mt / total), not multiplied by 100.
is_mt_column = expected_counts.columns.str.startswith("MT-")
mt_counts = expected_counts.loc[:, is_mt_column].sum(axis=1)
full_counts = expected_counts.sum(axis=1)
pct_counts_mt = (mt_counts / full_counts).rename("pct_counts_mt")

# Several rows can share a label after the rename; keep the first occurrence of each.
pct_counts_mt = pct_counts_mt.loc[~pct_counts_mt.index.duplicated()]

In [None]:
# Expression matrix and model annotation table.
rna = pd.read_csv("/add/path/here/internal-23q2_v98-omicsexpressionproteincodinggenestpmlogp1.csv",index_col=0)
info = pd.read_csv("/add/path/here/internal-23q2_v98-model.csv",index_col=0)

# Primary disease per model; diseases with fewer than 5 models are pooled as "Other".
typ = info["OncotreePrimaryDisease"]
disease_sizes = typ.value_counts()
lowcounts = disease_sizes[disease_sizes < 5].index
mapping = dict.fromkeys(lowcounts, "Other")
typ = typ.replace(mapping)

# Keep only the first space-separated token of every column label.
rna.columns = rna.columns.str.split(" ").str[0]

In [None]:
# Fitted dose-response table, indexed by its SANGER_MODEL_ID column.
drug_response = pd.read_csv("/add/path/here/GDSC1_fitted_dose_response_27Oct23.csv")
drug_response = drug_response.set_index("SANGER_MODEL_ID")

In [None]:
# Lookup SangerModelID -> ModelID, built from the models that have a Sanger ID.
# The "ModelID" column comes from resetting the index of `info`.
sanger_to_model = (
    info["SangerModelID"]
    .reset_index()
    .dropna()
    .set_index("SangerModelID")
)
mapping_ids = sanger_to_model["ModelID"].to_dict()

In [None]:
# Relabel the dose-response rows via mapping_ids; labels without an entry are left unchanged.
drug_response = drug_response.rename(mapping_ids, axis="index")

In [None]:
# Lookup DRUG_ID -> DRUG_NAME (the maximum name per ID, should an ID carry several).
drug_mapping = drug_response.groupby("DRUG_ID")["DRUG_NAME"].max().to_dict()

In [None]:
# Inspect the dose-response table after its index has been relabelled.
drug_response

In [None]:
# Coarse grouping of OncotreePrimaryDisease labels into high-level cancer categories.
ct_mapping = {
    "Non-Small Cell Lung Cancer": "Thoracic",
    "Non-Hodgkin Lymphoma": "Lymphoma",
    "Diffuse Glioma": "Brain",
    "Melanoma": "Skin",
    "Esophagogastric Adenocarcinoma": "GI",
    "Colorectal Adenocarcinoma": "GI",
    "Invasive Breast Carcinoma": "Reproductive",
    "Pancreatic Adenocarcinoma": "GI",
    "Head and Neck Squamous Cell Carcinoma": "Head&Neck",
    "Ovarian Epithelial Tumor": "Reproductive",
    "Lung Neuroendocrine Tumor": "Neuroendocrine",
    "Acute Myeloid Leukemia": "Blood",
    "Renal Cell Carcinoma": "Kidney",
    "Ewing Sarcoma": "Sarcoma",
    "Neuroblastoma": "Neuroendocrine",
    "Intraductal Papillary Neoplasm of the Bile Duct": "GI",
    "Pleural Mesothelioma": "Thoracic",
    "Endometrial Carcinoma": "Reproductive",
    "Bladder Urothelial Carcinoma": "Urinary",
    "B-Lymphoblastic Leukemia/Lymphoma": "Blood",
    "Esophageal Squamous Cell Carcinoma": "GI",
    "Embryonal Tumor": "Brain",
    "Hepatocellular Carcinoma": "GI",
    "Rhabdomyosarcoma": "Sarcoma",
    "Osteosarcoma": "Sarcoma",
    "T-Lymphoblastic Leukemia/Lymphoma": "Blood",
    "Myeloproliferative Neoplasms": "Blood",
    "Cervical Squamous Cell Carcinoma": "Reproductive",
    "Anaplastic Thyroid Cancer": "Thyroid",
    "Liposarcoma": "Sarcoma",
    "Ocular Melanoma": "Other",
    "Leiomyosarcoma": "Sarcoma",
    "Prostate Adenocarcinoma": "Reproductive",
    "Non-Seminomatous Germ Cell Tumor": "Other",
    "Well-Differentiated Thyroid Cancer": "Thyroid",
    "Hodgkin Lymphoma": "Lymphoma",
    "Nerve Sheath Tumor": "Other",
    "Synovial Sarcoma": "Sarcoma",
    "Chondrosarcoma": "Sarcoma",
    "Undifferentiated Pleomorphic Sarcoma/Malignant Fibrous Histiocytoma/High-Grade Spindle Cell Sarcoma": "Sarcoma",
    "Intracholecystic Papillary Neoplasm": "GI",
    "Cervical Adenocarcinoma": "Reproductive",
    "Merkel Cell Carcinoma": "Skin",
    "Cutaneous Squamous Cell Carcinoma": "Skin",
    "Breast Ductal Carcinoma In Situ": "Reproductive",
    "Rhabdoid Cancer": "Other",
}

# Diseases with fewer than 5 models are sent to "Other"; this overrides any
# explicit entry above for the same disease.
disease_counts = info["OncotreePrimaryDisease"].value_counts()
other = disease_counts[disease_counts < 5].index
ct_mapping.update(dict.fromkeys(other, "Other"))

highlevel_cancer = info["OncotreePrimaryDisease"].replace(ct_mapping)

In [None]:
from typing import Mapping, Optional, Tuple
def get_drug_resp(drug_id: int, drug_response: pd.DataFrame, 
                  highlevel_cancer: pd.Series, pct_counts_mt: pd.Series,
                  drug_names: Optional[Mapping[int, str]] = None,
                  min_cell_lines: int = 10) -> Tuple[pd.DataFrame,pd.DataFrame]:
    """Correlate LN_IC50 with the MT read fraction, per cancer type, for one drug.

    Parameters
    ----------
    drug_id
        Value of the ``DRUG_ID`` column selecting the drug.
    drug_response
        Dose-response table; must contain the ``DRUG_ID``, ``TCGA_DESC`` and
        ``LN_IC50`` columns and share its index labels with the two series.
    highlevel_cancer
        Per-model series joined onto the table. It is not used in the
        correlation itself, but rows without a value in it are dropped.
    pct_counts_mt
        Per-model series; it must be named ``"pct_counts_mt"`` because the
        joined column is looked up under that name.
    drug_names
        Mapping ``DRUG_ID -> DRUG_NAME`` used to label the output column.
        Defaults to the notebook-level ``drug_mapping`` when omitted.
    min_cell_lines
        Minimum number of complete rows a ``TCGA_DESC`` group needs to be
        analysed (default 10).

    Returns
    -------
    (rs, ps)
        Two frames indexed by ``TCGA_DESC`` with a single column named after
        the drug: Pearson r and the matching p-value. Both are empty
        (zero rows) when no group reaches ``min_cell_lines``.

    Raises
    ------
    KeyError
        If ``drug_id`` is missing from the name mapping.
    """
    if drug_names is None:
        # Backward-compatible default: the DRUG_ID -> DRUG_NAME dict built earlier in the notebook.
        drug_names = drug_mapping
    drug_name = drug_names[drug_id]

    drug_resp = drug_response[drug_response["DRUG_ID"]==drug_id]

    # Column-wise join on the index; dropna() removes every row that has a
    # missing value in ANY column of the joined table, not just the two
    # columns being correlated.
    drug_resp = pd.concat([drug_resp,highlevel_cancer.loc[highlevel_cancer.index.intersection(drug_resp.index)],
                                                             pct_counts_mt],axis=1).dropna()
    
    kept_ct = drug_resp["TCGA_DESC"].value_counts()
    kept_ct = kept_ct[kept_ct>=min_cell_lines].index    

    rs, ps = {},{}
    for ct in kept_ct:
        df = drug_resp[drug_resp["TCGA_DESC"]==ct]
        r,p = pearsonr(df["LN_IC50"],df["pct_counts_mt"])
        rs[ct] = [r]
        ps[ct] = [p]
    rs = pd.DataFrame(rs,index=[drug_name]).T
    ps = pd.DataFrame(ps,index=[drug_name]).T
    return rs, ps

In [None]:
# One TCGA_DESC label per index entry (first occurrence), joined with the MT fraction.
first_occurrence = ~drug_response.index.duplicated()
descr_cid = drug_response.loc[first_occurrence, ["TCGA_DESC"]]
descr_cid = pd.concat([descr_cid, pct_counts_mt],axis=1).dropna()

# Keep cancer types represented by more than 15 rows.
vc = descr_cid["TCGA_DESC"].value_counts()
vc = vc[vc > 15]
descr_cid = descr_cid.loc[descr_cid["TCGA_DESC"].isin(vc.index)]

# Cancer types ordered by increasing median MT fraction.
ct_order = (
    descr_cid
    .groupby("TCGA_DESC")
    .median()
    .sort_values("pct_counts_mt")
)

# General characteristics

In [None]:
# Box plot of the MT read fraction per cancer type, ordered by the medians in ct_order.
fig, ax = plt.subplots(figsize=(6, 1.5))
sns.boxplot(data=descr_cid, x="TCGA_DESC", y="pct_counts_mt", order=ct_order.index, ax=ax)
pretty_ax(ax)

# Re-apply the current ticks and labels, rotated for readability.
tick_positions = ax.get_xticks()
tick_labels = ax.get_xticklabels()
ax.set_xticks(tick_positions, tick_labels, rotation=45, ha='right')

fig.savefig("/add/path/here/figures/cell_line_pct_counts_mt.svg", dpi=200, bbox_inches="tight")

In [None]:
# Median MT read fraction per TCGA cancer type (alphabetical order of the group keys).
descr_cid.groupby("TCGA_DESC").median()

# Get correlations and comparison to random

In [None]:
# Cancer types whose median MT fraction exceeds 0.051, excluding the "UNCLASSIFIED" label.
high_mt_types = ct_order[ct_order > 0.051].dropna()
selected_ct = np.setdiff1d(high_mt_types.index, ["UNCLASSIFIED"])

In [None]:
# Restrict the screen to the selected cancer types once; the filter does not
# depend on the drug, so it is hoisted out of the loop.
selected_response = drug_response[drug_response["TCGA_DESC"].isin(selected_ct)]

# One (r, p) pair of frames per DRUG_ID found in the full table; drugs with no
# usable rows in the restricted table yield empty frames.
all_rs, all_ps = [],[]
for drug_id in tqdm(drug_response.DRUG_ID.unique()):
    rs, ps = get_drug_resp(drug_id, selected_response, highlevel_cancer, pct_counts_mt)
    all_rs.append(rs)
    all_ps.append(ps)

In [None]:
# Side-by-side table: rows are cancer types, columns are drug names.
all_rs = pd.concat(all_rs,axis=1)
all_ps = pd.concat(all_ps,axis=1)

# Several DRUG_IDs can share a DRUG_NAME, giving duplicated column labels;
# collapse them with the median. Grouping is done on the transposed frame
# because the `axis=1` argument of groupby is deprecated in recent pandas.
all_rs = all_rs.T.groupby(level=0).median().T
all_ps = all_ps.T.groupby(level=0).median().T

# Target / pathway annotation, one row per drug name (first occurrence kept).
drug_char = drug_response[["DRUG_NAME","PUTATIVE_TARGET","PATHWAY_NAME"]].set_index("DRUG_NAME")
drug_char = drug_char[~drug_char.index.duplicated()]

# Drugs with a correlation available in all but at most 5 cancer types.
n_missing = all_rs.isna().sum()
sel_drugs = n_missing[n_missing<6].index

In [None]:
# Null model: permute the MT fractions across the index labels.
# A true permutation is used (each value appears exactly once);
# Generator.choice would sample WITH replacement by default, which duplicates
# some values and drops others instead of shuffling them.
rng = np.random.default_rng(42)
shuffled_pct = pd.Series(
    rng.permutation(pct_counts_mt.values),
    index=pct_counts_mt.index,
    name="pct_counts_mt",  # get_drug_resp looks the column up under this name
)

In [None]:
# Same restriction to the selected cancer types as for the observed data,
# computed once instead of once per drug.
selected_response = drug_response[drug_response["TCGA_DESC"].isin(selected_ct)]

# Repeat the per-drug correlations against the shuffled MT fractions.
shuffled_rs, shuffled_ps = [],[]
for drug_id in tqdm(drug_response.DRUG_ID.unique()):
    rs, ps = get_drug_resp(drug_id, selected_response, highlevel_cancer, shuffled_pct)
    shuffled_rs.append(rs)
    shuffled_ps.append(ps)

In [None]:
# Side-by-side table: rows are cancer types, columns are drug names.
shuffled_rs = pd.concat(shuffled_rs,axis=1)
shuffled_ps = pd.concat(shuffled_ps,axis=1)

# Collapse duplicated drug-name columns with the median. Grouping is done on
# the transposed frame because the `axis=1` argument of groupby is deprecated
# in recent pandas.
shuffled_rs = shuffled_rs.T.groupby(level=0).median().T
shuffled_ps = shuffled_ps.T.groupby(level=0).median().T

# drug_char is not rebuilt here: the aggregation cell for the observed
# correlations already creates it from the same drug_response table.

In [None]:
from scipy.stats import kstest

# Per-drug median correlation across cancer types, observed vs. shuffled.
observed_median_r = all_rs.loc[:,sel_drugs].median(axis=0)
null_median_r = shuffled_rs.loc[:,sel_drugs].median(axis=0)

fig, ax = plt.subplots(figsize=(2, 1))
sns.kdeplot(data=observed_median_r, c="blue", ax=ax)
sns.kdeplot(data=null_median_r, c="red", ax=ax)
pretty_ax(ax)

# kstest with two samples compares the two empirical distributions.
_, p = kstest(observed_median_r, null_median_r)
ax.text(0.15, 5, f"p={p:.2e}")

In [None]:
# Same densities, zoomed on the right tail (x in [0.25, 0.4]).
fig, ax = plt.subplots(figsize=(2, 1))
for corr_table, line_colour in ((all_rs, "blue"), (shuffled_rs, "red")):
    sns.kdeplot(data=corr_table.loc[:,sel_drugs].median(axis=0), c=line_colour, ax=ax)
pretty_ax(ax)
ax.set_xlim([0.25,0.4])
ax.set_ylim([0,0.06])

In [None]:
# Same densities, zoomed on the left tail (x in [-0.35, -0.15]).
observed_left = all_rs.loc[:,sel_drugs].median(axis=0)
null_left = shuffled_rs.loc[:,sel_drugs].median(axis=0)

fig, ax = plt.subplots(figsize=(2, 1))
sns.kdeplot(data=observed_left, c="blue", ax=ax)
sns.kdeplot(data=null_left, c="red", ax=ax)
pretty_ax(ax)
ax.set_xlim([-0.35,-0.15])
ax.set_ylim([0,0.06])

In [None]:
#sign_res = (all_ps<0.05).sum()[(all_ps<0.05).sum()>=4].index
# 15 drugs with the highest median correlation across cancer types.
sign_res = all_rs.loc[:,sel_drugs].median(axis=0).sort_values(ascending=False).head(15).index

# NOTE(review): `cancer_order` is used below but is not defined in any earlier
# cell, so this cell fails on a fresh kernel. It is assumed to be the cancer
# types ordered by median MT fraction (ct_order), limited to those present in
# the correlation table -- confirm this matches the intended row order.
cancer_order = [ct for ct in ct_order.index if ct in all_rs.index]

# Significance markers: "" for p > 0.1, "*" for 0.01 <= p <= 0.1,
# "**" for 0.001 <= p < 0.01, "***" below 0.001. Missing p-values count as 1.
# (p == 0.01 previously matched no branch and was marked "***".)
annot_df = all_ps.loc[cancer_order,sign_res].fillna(1)
annot_df = annot_df.apply(
    lambda col: col.map(
        lambda x: "" if x>0.1 else ("*" if x>=0.01 else ("**" if x>=0.001 else "***"))
    )
)


fig, ax = plt.subplots(1,1,figsize=(8,3))
sns.heatmap(data=all_rs.loc[cancer_order,sign_res], annot=annot_df, fmt="", cmap="vlag",
            center=0, vmin=-0.6, vmax=0.6, ax=ax)
fig.savefig("/add/path/here/figures/drug_resistance_most_resistant.svg", 
            dpi=300, bbox_inches="tight")

In [None]:
# Persist the per-cancer-type correlation coefficients and their p-values.
all_rs.to_csv("/add/path/here/drug_resistance/correlation.csv")
all_ps.to_csv("/add/path/here/drug_resistance/pvalues.csv")

In [None]:
# Target and pathway annotation of the drugs currently held in sign_res.
drug_char.loc[sign_res]

In [None]:
from scipy.stats import fisher_exact

# Pathway counts among the selected drugs (SIGN) versus all remaining drugs (NSIGN).
conting_res = pd.concat(
    [drug_char.loc[sign_res].PATHWAY_NAME.value_counts(),
     drug_char.drop(sign_res).PATHWAY_NAME.value_counts()],
    axis=1,
).fillna(0)
conting_res.columns = ["SIGN","NSIGN"]

# Fisher's exact test per pathway on a 2x2 table: this pathway vs. the sum of
# all other pathways (columns) by SIGN / NSIGN (rows).
# The loop must run over conting_res; the name `conting` is never defined.
# The printed p-values are not corrected for multiple testing.
for ptw in conting_res.index:
    conting2 = pd.concat([conting_res.loc[ptw],conting_res.drop([ptw]).sum()],axis=1)
    _, p = fisher_exact(conting2)
    if p<0.1:
        print(ptw, p)

In [None]:
# 15 drugs with the lowest median correlation across cancer types.
sign_res = all_rs.loc[:,sel_drugs].median(axis=0).sort_values().head(15).index

# Significance markers: "" for p > 0.1, "*" for 0.01 <= p <= 0.1,
# "**" for 0.001 <= p < 0.01, "***" below 0.001. Missing p-values count as 1.
# (p == 0.01 previously matched no branch and was marked "***".)
# Rows follow cancer_order, the same row order as the most-resistant heatmap.
annot_df = all_ps.loc[cancer_order,sign_res].fillna(1)
annot_df = annot_df.apply(
    lambda col: col.map(
        lambda x: "" if x>0.1 else ("*" if x>=0.01 else ("**" if x>=0.001 else "***"))
    )
)


fig, ax = plt.subplots(1,1,figsize=(8,3))
sns.heatmap(data=all_rs.loc[cancer_order,sign_res], annot=annot_df, fmt="", cmap="vlag",
            center=0, vmin=-0.6, vmax=0.6, ax=ax)
fig.savefig("/add/path/here/drug_resistance_most_sensitive.svg", 
            dpi=300, bbox_inches="tight")

In [None]:
# Target and pathway annotation of the drugs currently held in sign_res.
drug_char.loc[sign_res]

In [None]:
from scipy.stats import fisher_exact

# Pathway counts among the selected drugs (SIGN) versus all remaining drugs (NSIGN).
conting_sens = pd.concat(
    [drug_char.loc[sign_res].PATHWAY_NAME.value_counts(),
     drug_char.drop(sign_res).PATHWAY_NAME.value_counts()],
    axis=1,
).fillna(0)
conting_sens.columns = ["SIGN","NSIGN"]

# Fisher's exact test per pathway on a 2x2 table: this pathway vs. the sum of
# all other pathways (columns) by SIGN / NSIGN (rows).
# The loop must run over conting_sens; the name `conting` is never defined.
# The printed p-values are not corrected for multiple testing.
for ptw in conting_sens.index:
    conting2 = pd.concat([conting_sens.loc[ptw],conting_sens.drop([ptw]).sum()],axis=1)
    _, p = fisher_exact(conting2)
    if p<0.1:
        print(ptw, p)

In [None]:
def _pathway_share(counts, label):
    """Return a tidy frame: each pathway's share of ``counts`` plus a constant category label."""
    share = counts / counts.sum()
    return (
        share.rename("% drugs")
        .rename_axis("Drug type")
        .reset_index()
        .assign(Cat=label)
    )


# "% drugs" holds fractions of each column total (not multiplied by 100).
df1 = _pathway_share(conting_res["SIGN"], "Most resistant drugs")
df2 = _pathway_share(conting_sens["SIGN"], "Most sensitive drugs")
# NOTE(review): the "All tested drugs" bars use the NSIGN column of conting_res,
# i.e. the drugs outside the most-resistant selection -- confirm the label.
df3 = _pathway_share(conting_res["NSIGN"], "All tested drugs")

# Pathways ordered by decreasing share in the "All tested drugs" group.
order = df3.sort_values("% drugs", ascending=False)["Drug type"]

fig, ax = plt.subplots(figsize=(6, 2))
category_order = ["All tested drugs",
                  "Most resistant drugs",
                  "Most sensitive drugs"]
sns.barplot(data=pd.concat([df1,df2,df3]), x="Drug type", y="% drugs",
            hue="Cat", hue_order=category_order, order=order, ax=ax)
pretty_ax(ax)
ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel("")
ax.legend(frameon=False, bbox_to_anchor=(1,1,0,0))
fig.savefig("/add/path/here/drug_resistance_dist.svg", 
            dpi=300, bbox_inches="tight")