In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
from scipy.stats import ttest_ind, ttest_1samp, ttest_rel
import matplotlib as mpl
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from joblib import Parallel, delayed
from tqdm.notebook import tqdm, trange
import pickle
import matplotlib.ticker as mtick
import math
from itertools import combinations
import random
from sklearn.linear_model import LinearRegression
import qgrid

In [3]:
plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams['font.size'] = 18
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['figure.figsize'] = (10.0, 7.0)
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
sns.set_palette("Dark2")

In [4]:
seed = 42

# load data

In [5]:
colour_list = pd.read_csv('../../data/colour_list.txt',header=None)[0].tolist()

In [6]:
protein_ruv = pd.read_csv(
    "../../data/protein/E0022_P06_Protein_Matrix_ProNorM_no_control_update.txt",
    sep='\t').set_index('Cell_line')

meta = pd.read_csv('../../data/E0022_P06_final_sample_map_no_control.txt',
                   sep='\t')

In [7]:
cell_lines_no_blood = meta[meta['Tissue_type']!='Haematopoietic and Lymphoid'].index.values

In [8]:
tissue_type_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Tissue_type']].to_dict()['Tissue_type']
cancer_type_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Cancer_type']].to_dict()['Cancer_type']
batch_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Batch']].to_dict()['Batch']
instrument_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Instrument']].to_dict()['Instrument']

In [9]:
tissue_count = meta.drop_duplicates(['Cell_line', 'Tissue_type']).groupby('Tissue_type').size()
major_tissues = tissue_count[tissue_count>30].index.values

In [10]:
cancer_colours = dict(
    zip(meta['Cancer_type'].unique(),
        colour_list[:meta['Cancer_type'].unique().size]))

tissue_colours = dict(
    zip(meta['Tissue_type'].unique(),
        colour_list[:meta['Tissue_type'].unique().size]))

instrument_colours={
    'M01':'#66c2a5', 
    'M02':'#fc8d62', 
    'M03':'#8da0cb', 
    'M04':'#e78ac3', 
    'M05':'#a6d854', 
    'M06':'#ffd92f'
}

batch_colours={
    'P01':'#7fc97f', 
    'P02':'#beaed4', 
    'P03':'#fdc086', 
    'P04':'#386cb0', 
    'P05':'#f0027f', 
    'P06':'#bf5b17'
}

In [11]:
meta = meta.reset_index()

In [12]:
meta_cell_lines = meta.drop_duplicates('Cell_line')

In [13]:
name_map = pd.read_csv("../../data/misc/HUMAN_9606_idmapping.gene_prot.dat",
                       sep='\t',
                       names=['ID', 'type', 'code'])
name_map = name_map.drop_duplicates(['ID', 'type'])
name_map = pd.pivot(name_map, index='ID', columns='type',
                    values='code').dropna()

protein2rna_map = dict(
    zip(name_map['UniProtKB-ID'].values, name_map['Gene_Name'].values))
rna2protein_map = {
    protein2rna_map[protein]: protein
    for protein in protein_ruv.columns if protein in protein2rna_map
}

In [14]:
rna_raw = pd.read_csv("../../data/rna/rnaseq_voom.csv", index_col=0).T

rna_raw.index.name = 'SIDM'

rna_raw = rna_raw.reset_index()
rna_sample = pd.merge(rna_raw,
                      meta[['SIDM',
                            'Cell_line']].drop_duplicates()).drop(['SIDM'],
                                                                  axis=1)

rna_sample = rna_sample.set_index(['Cell_line'])

In [15]:
drug_df = pd.read_csv('../../data/drug/drug_final_processed_eg_id.csv.gz', low_memory=False)

In [16]:
drug_df['max_screening_conc_ln'] = drug_df['max_screening_conc'].map(np.log)

In [17]:
max_conc_map = drug_df[[
    'drug_id', 'max_screening_conc_ln'
]].drop_duplicates().set_index('drug_id').to_dict()['max_screening_conc_ln']

In [18]:
lm_res = pd.read_csv("../../result_files/lm/lm_sklearn_degr_drug_annotated.csv.gz")

In [19]:
# lm_res_old = pd.read_csv("../../result_files/lm/lm_sklearn_degr_drug_annotated_old.csv.gz")

In [21]:
lm_res[(lm_res['fdr']<0.1)|(lm_res['nc_fdr']<0.001)].shape

(105507, 16)

In [23]:
lm_res = lm_res[((lm_res['fdr'] < 0.1) | (lm_res['nc_fdr'] < 0.001))
                & (lm_res['r2'] > 0.4) & (lm_res['skew'] < -1)]
lm_res['x_protein'] = lm_res['x_id'].map(rna2protein_map)

In [24]:
lm_assoc_list = list(zip(lm_res['y_id'], lm_res['x_id'], lm_res['x_protein']))

# calculate

In [36]:
protein_df = protein_ruv
rna_df = rna_sample

In [37]:
def calc_box(step=0.2, tissues=[]):
    res_df = []
    for drug_id, rna_target, protein_target in tqdm(lm_assoc_list):
        if protein_target not in protein_df.columns or rna_target not in rna_df.columns:
            continue
        ic50_map = drug_df[drug_df['drug_id'] == drug_id].set_index(
            'cell_line_name').to_dict()['ln_IC50']
        tmp_data_protein = protein_df[[protein_target]]
        tmp_data_rna = rna_df[[rna_target]]
        tmp_data = pd.merge(tmp_data_protein, tmp_data_rna, on='Cell_line')
        tmp_data.loc[:, 'IC50'] = tmp_data.index.map(ic50_map).values
        tmp_data.loc[:, 'tissue'] = tmp_data.index.map(tissue_type_map).values
        tmp_data = tmp_data.dropna().reset_index(drop=True)

        if len(tissues) == 0:
            tissues = tmp_data['tissue'].unique()
        for tissue in tissues:
            tmp_data_tissue = tmp_data[tmp_data['tissue'] ==
                                       tissue].reset_index(drop=True)
            if tmp_data_tissue[protein_target].unique().size < int(
                    1 / step) or tmp_data_tissue[rna_target].unique(
                    ).size < int(1 / step):
                continue
            tmp_data_tissue.loc[:, 'protein_q'] = pd.qcut(
                tmp_data_tissue[protein_target],
                np.arange(0, 1.0001, step),
                duplicates='drop')
            tmp_data_tissue.loc[:, 'rna_q'] = pd.qcut(
                tmp_data_tissue[rna_target],
                np.arange(0, 1.0001, step),
                duplicates='drop')

            medians = tmp_data_tissue[[
                'protein_q', 'IC50'
            ]].groupby('protein_q').median()['IC50'].values
            mono = (np.all(medians[1:] >= medians[:-1]-0.1)
                    or np.all(medians[1:] <= medians[:-1]+0.1))

            protein_last_q = tmp_data_tissue[[
                'protein_q', 'IC50'
            ]].groupby('protein_q').median()['IC50'].values[-1]

            rna_last_q = tmp_data_tissue[[
                'rna_q', 'IC50'
            ]].groupby('rna_q').median()['IC50'].values[-1]
            protein_first_q = tmp_data_tissue[[
                'protein_q', 'IC50'
            ]].groupby('protein_q').median()['IC50'].values[0]
            protein_first_nq = tmp_data_tissue[[
                'protein_q', 'IC50'
            ]].groupby('protein_q').median()['IC50'].values[:-1]
            rna_first_q = tmp_data_tissue[[
                'rna_q', 'IC50'
            ]].groupby('rna_q').median()['IC50'].values[0]
            rna_first_nq = tmp_data_tissue[[
                'rna_q', 'IC50'
            ]].groupby('rna_q').median()['IC50'].values[:-1]
            ic50_mean = tmp_data_tissue['IC50'].mean()
            max_conc = max_conc_map[drug_id]
            p_corr_protein, p_pval_protein = pearsonr(
                tmp_data_tissue['IC50'], tmp_data_tissue[protein_target])
            p_corr_rna, p_pval_rna = pearsonr(
                tmp_data_tissue['IC50'], tmp_data_tissue[rna_target])
            counts = tmp_data_tissue.shape[0]

            tmp_dict = {
                'drug_id': drug_id,
                'protein': protein_target,
                'tissue': tissue,
                'protein_first_q': protein_first_q - ic50_mean,
                'protein_last_q': protein_last_q - ic50_mean,
                'rna_first_q': rna_first_q - ic50_mean,
                'rna_last_q': rna_last_q - ic50_mean,
                'protein_first_nq_std': np.std(protein_first_nq),
                'rna_first_nq_std': np.std(rna_first_nq),
                'IC50_mean': ic50_mean,
                'max_conc': max_conc,
                'mono': mono,
                'p_corr_protein': p_corr_protein,
                'p_pval_protein': p_pval_protein,
                'p_corr_rna': p_corr_rna,
                'p_pval_rna': p_pval_rna,
                'counts': counts
            }
            res_df.append(tmp_dict)
    res_df = pd.DataFrame(res_df)
    return res_df

In [38]:
res_df = calc_box(step=0.2)

res_df['last_q_delta'] = np.abs(
    res_df['rna_last_q'] -
    res_df['protein_last_q'])

res_df.to_csv("../../result_files/box_plot_drug_tissues_5q_10pc_nc.csv", index=False)

HBox(children=(FloatProgress(value=0.0, max=8692.0), HTML(value='')))




# analysis

## all sample level

In [294]:
res_df = pd.merge(res_df,
                  lm_res[['y_id', 'x_protein', 'fdr', 'beta', 'skew', 'ppi', 'r2']],
                  left_on=['drug_id', 'protein'],
                  right_on=['y_id', 'x_protein'])

In [296]:
res_df['protein_last_q_minus_max_c'] = res_df['protein_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['rna_last_q_minus_max_c'] = res_df['rna_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['last_q_delta'] = res_df['protein_last_q_minus_max_c'] - res_df['rna_last_q_minus_max_c']

In [297]:
corr_dict_merged = pd.read_csv("../../data/protein_rna_correlations.csv")

target_proteins = corr_dict_merged[corr_dict_merged['corr_diff'].abs(
) < 0.1].sort_values('corr_avg').head(1000)['protein'].values
drug_meta = pd.read_csv("../../data/drug/drug_info.csv")
FDA_approved_id = drug_meta[drug_meta['FDA_Approved']=='Yes']['drug_id'].values

In [299]:
res_df[(res_df['last_q_delta'] < -0.5) & (res_df['protein'].isin(target_proteins))
       & (res_df['fdr'] < 0.1) & (res_df['beta'] < 0)
       & (res_df['skew'] < -1) & (~pd.isna(res_df['protein_first_nq_std'])) &
       (res_df['protein_first_nq_std'] < 1) &
       (res_df['protein_last_q_minus_max_c'] < 0)].sort_values(
           'protein_last_q_minus_max_c').head(50)

Unnamed: 0,drug_id,protein,tissue,protein_first_q,protein_last_q,rna_first_q,rna_last_q,protein_first_nq_std,rna_first_nq_std,IC50_mean,...,last_q_delta,y_id,x_protein,fdr,beta,skew,ppi,r2,protein_last_q_minus_max_c,rna_last_q_minus_max_c
9,1427;AZD5582;GDSC1,SSRD_HUMAN,All,0.361516,-0.292937,0.082657,0.305681,0.290107,0.211715,1.905607,...,-0.598619,1427;AZD5582;GDSC1,SSRD_HUMAN,2e-06,-0.687235,-1.084352,-,0.491294,-0.689916,-0.091297
134,1427;AZD5582;GDSC1,NU107_HUMAN,All,0.172533,-0.286115,0.184261,0.594989,0.17893,0.250032,1.905607,...,-0.881105,1427;AZD5582;GDSC1,NU107_HUMAN,0.002673,-0.89168,-1.084352,-,0.491294,-0.683094,0.198011
243,56;WH-4-023;GDSC1,NU153_HUMAN,All,1.50626,-1.239928,1.49761,0.691404,0.605191,0.624324,2.233004,...,-1.931332,56;WH-4-023;GDSC1,NU153_HUMAN,0.00651,-0.534229,-1.19572,-,0.497593,-0.640078,1.291254
819,1427;AZD5582;GDSC1,PININ_HUMAN,All,0.594989,-0.189145,0.48704,0.484406,0.219525,0.200446,1.905607,...,-0.673551,1427;AZD5582;GDSC1,PININ_HUMAN,0.022845,-0.728424,-1.084352,-,0.491294,-0.586123,0.087427
554,1427;AZD5582;GDSC1,U5S1_HUMAN,All,0.540358,-0.150783,0.050651,0.617828,0.21691,0.231041,1.905607,...,-0.76861,1427;AZD5582;GDSC1,U5S1_HUMAN,0.015326,-0.827293,-1.084352,-,0.491294,-0.547762,0.220849
242,56;WH-4-023;GDSC1,ZCHC8_HUMAN,All,0.52205,-0.927393,-0.770356,1.084439,0.457394,0.611787,2.172637,...,-2.011832,56;WH-4-023;GDSC1,ZCHC8_HUMAN,0.00651,-0.471293,-1.19572,-,0.497593,-0.38791,1.623922
2025,56;WH-4-023;GDSC1,NUP43_HUMAN,All,1.095905,-0.933639,0.230713,0.062176,0.504786,0.433101,2.178884,...,-0.995816,56;WH-4-023;GDSC1,NUP43_HUMAN,0.062852,-0.438227,-1.19572,-,0.497593,-0.38791,0.607906
1346,1909;Venetoclax;GDSC2,REQU_HUMAN,All,0.652684,-0.805769,-0.153585,0.023476,0.326409,0.195612,2.888981,...,-0.829246,1909;Venetoclax;GDSC2,REQU_HUMAN,0.041,-0.4066,-1.56284,3,0.740261,-0.219374,0.609872


## major tissue

In [369]:
res_df = pd.read_csv("../../result_files/box_plot_major_tissues_5q.csv")

In [370]:
res_df = pd.merge(res_df,
                  lm_res[['y_id', 'x_protein', 'fdr', 'beta', 'skew', 'ppi', 'r2']],
                  left_on=['drug_id', 'protein'],
                  right_on=['y_id', 'x_protein'])

In [371]:
res_df['protein_last_q_minus_max_c'] = res_df['protein_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['rna_last_q_minus_max_c'] = res_df['rna_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['last_q_delta'] = res_df['protein_last_q_minus_max_c'] - res_df['rna_last_q_minus_max_c']

In [372]:
corr_dict_merged = pd.read_csv("../../data/protein_rna_correlations.csv")

target_proteins = corr_dict_merged[corr_dict_merged['corr_diff'].abs(
) < 0.1].sort_values('corr_avg').head(1000)['protein'].values
drug_meta = pd.read_csv("../../data/drug/drug_info.csv")
FDA_approved_id = drug_meta[drug_meta['FDA_Approved']=='Yes']['drug_id'].values

In [373]:
res_df[(res_df['last_q_delta'] < -1) & (res_df['protein'].isin(target_proteins))
       & (res_df['fdr'] < 0.1) & (res_df['beta'] < 0)
       & (res_df['skew'] < 0) & (~pd.isna(res_df['protein_first_nq_std'])) &
       (res_df['mono'] == True) &
       (res_df['protein_last_q_minus_max_c'] < -2)].sort_values(
           'protein_last_q_minus_max_c').head(50)

Unnamed: 0,drug_id,protein,tissue,protein_first_q,protein_last_q,rna_first_q,rna_last_q,protein_first_nq_std,rna_first_nq_std,IC50_mean,...,last_q_delta,y_id,x_protein,fdr,beta,skew,ppi,r2,protein_last_q_minus_max_c,rna_last_q_minus_max_c
16012,1909;Venetoclax;GDSC2,REQU_HUMAN,Haematopoietic and Lymphoid,1.59567,-0.108374,0.810358,1.271863,0.459915,0.085363,0.004994,...,-1.380237,1909;Venetoclax;GDSC2,REQU_HUMAN,0.041,-0.4066,-1.56284,3,0.740261,-2.405966,-1.025729
27483,1909;Venetoclax;GDSC2,PRP31_HUMAN,Haematopoietic and Lymphoid,1.569503,-0.052101,0.377492,1.397314,0.365719,0.249423,-0.013506,...,-1.449415,1909;Venetoclax;GDSC2,PRP31_HUMAN,0.070741,-0.476276,-1.56284,3,0.740261,-2.368192,-0.918777


In [374]:
res_df[(res_df['last_q_delta'] < -2) & (res_df['protein'].isin(target_proteins))
       & (res_df['fdr'] < 0.1) & (res_df['beta'] < 0)
       & (res_df['skew'] < 0) & (~pd.isna(res_df['protein_first_nq_std'])) &
       (res_df['protein_first_nq_std'] < 1) &
       (res_df['protein_last_q_minus_max_c'] < -2)].sort_values(
           'protein_last_q_minus_max_c').head(50)

Unnamed: 0,drug_id,protein,tissue,protein_first_q,protein_last_q,rna_first_q,rna_last_q,protein_first_nq_std,rna_first_nq_std,IC50_mean,...,last_q_delta,y_id,x_protein,fdr,beta,skew,ppi,r2,protein_last_q_minus_max_c,rna_last_q_minus_max_c
24072,56;WH-4-023;GDSC1,NUP43_HUMAN,Kidney,-0.822703,-1.790369,-0.327392,0.321688,0.817078,1.106761,-1.411137,...,-2.112057,56;WH-4-023;GDSC1,NUP43_HUMAN,0.062852,-0.438227,-1.19572,-,0.497593,-4.834661,-2.722604
31968,1373;Dabrafenib;GDSC2,WDR61_HUMAN,Skin,0.198714,-3.115418,0.198714,-0.455355,0.521906,1.460067,1.118118,...,-2.660063,1373;Dabrafenib;GDSC2,WDR61_HUMAN,0.083944,-0.419553,-1.457743,3,0.565177,-4.299885,-1.639822
7946,1427;AZD5582;GDSC1,SMU1_HUMAN,Head and Neck,0.939817,-2.602008,0.573541,0.097327,0.287441,0.232159,0.813651,...,-2.699335,1427;AZD5582;GDSC1,SMU1_HUMAN,0.019117,-0.652085,-1.084352,-,0.491294,-4.090942,-1.391607
33580,1909;Venetoclax;GDSC2,PRKDC_HUMAN,Haematopoietic and Lymphoid,1.167053,-1.647553,1.167053,0.607057,0.475299,0.409638,-0.013506,...,-2.25461,1909;Venetoclax;GDSC2,PRKDC_HUMAN,0.087424,-0.364282,-1.56284,2,0.740261,-3.963644,-1.709034
5460,1427;AZD5582;GDSC1,MOGS_HUMAN,Head and Neck,1.18825,-2.339378,1.072576,0.405418,0.327793,1.19372,0.813651,...,-2.744796,1427;AZD5582;GDSC1,MOGS_HUMAN,0.012751,-0.342868,-1.084352,-,0.491294,-3.828312,-1.083516
2914,56;WH-4-023;GDSC1,NU153_HUMAN,Kidney,-0.48413,-1.262999,-2.38277,1.683433,0.975041,1.615141,-0.818737,...,-2.946432,56;WH-4-023;GDSC1,NU153_HUMAN,0.00651,-0.534229,-1.19572,-,0.497593,-3.71489,-0.768459
33557,1427;AZD5582;GDSC1,ZN326_HUMAN,Head and Neck,1.029336,-2.153934,1.064971,0.427741,0.061616,2.247581,0.813651,...,-2.581675,1427;AZD5582;GDSC1,ZN326_HUMAN,0.08742,-0.294768,-1.084352,-,0.491294,-3.642868,-1.061193
34526,1909;Venetoclax;GDSC2,TSN_HUMAN,Haematopoietic and Lymphoid,1.382253,-0.79856,0.961588,1.789528,0.582747,0.774942,-0.013506,...,-2.588087,1909;Venetoclax;GDSC2,TSN_HUMAN,0.089691,-0.289914,-1.56284,3,0.740261,-3.114651,-0.526564
22348,1909;Venetoclax;GDSC2,ARP3_HUMAN,Haematopoietic and Lymphoid,1.345169,-0.769688,0.982561,1.421581,0.397865,1.019035,-0.013506,...,-2.191269,1909;Venetoclax;GDSC2,ARP3_HUMAN,0.058225,-0.303654,-1.56284,3,0.740261,-3.085779,-0.89451
2903,56;WH-4-023;GDSC1,ZCHC8_HUMAN,Esophagus,-0.36926,-1.588171,-2.058427,1.957772,0.914774,1.424013,1.166396,...,-3.545944,56;WH-4-023;GDSC1,ZCHC8_HUMAN,0.00651,-0.471293,-1.19572,-,0.497593,-2.054929,1.491014


## tissue level

In [25]:
drug_crispr = pd.read_csv("../../result_files/drug_crispr_associations_10pcs_nc.csv")

In [26]:
res_df = pd.read_csv("../../result_files/box_plot_drug_tissues_5q_10pc_nc.csv")

### sensitive

In [27]:
res_df = pd.merge(res_df,
                  lm_res[['y_id', 'x_protein', 'x_id', 'fdr', 'target', 'beta', 'nc_fdr', 'nc_beta','skew', 'ppi', 'r2']],
                  left_on=['drug_id', 'protein'],
                  right_on=['y_id', 'x_protein'])

In [28]:
res_df['protein_last_q_minus_max_c'] = res_df['protein_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['rna_last_q_minus_max_c'] = res_df['rna_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['last_q_delta'] = res_df['protein_last_q_minus_max_c'] - res_df['rna_last_q_minus_max_c']
res_df['corr_diff'] = res_df['p_corr_protein'].abs() - res_df['p_corr_rna'].abs()

In [29]:
corr_dict_merged = pd.read_csv("../../data/protein_rna_correlations.csv")

target_proteins = corr_dict_merged[corr_dict_merged['corr_diff'].abs(
) < 0.2].sort_values('corr_avg').head(2000)['protein'].values

In [30]:
res_df.shape

(182543, 32)

In [47]:
drug_res_filtered = res_df[(res_df['protein'].isin(target_proteins))
                           & (res_df['beta'] < 0) &
                           (res_df['p_pval_protein'] < 0.1) &
                           (res_df['counts'] > 20)
                           & (res_df['skew'] < -0.5) & (res_df['mono'] == True) & (res_df['corr_diff'] > 0.15) &
                           (res_df['protein_last_q_minus_max_c'] <
                            0)].sort_values('corr_diff', ascending=False)[[
                                'drug_id', 'protein', 'x_id', 'target', 'beta', 'fdr','nc_fdr', 'nc_beta',
                                'ppi', 'r2', 'tissue', 'p_corr_protein',
                                'p_pval_protein', 'corr_diff', 'counts', 'skew'
                            ]]

In [48]:
pd.merge(drug_res_filtered, drug_crispr, left_on=['drug_id', 'x_id'], right_on=['y_id_drug', 'x_id'])

Unnamed: 0,drug_id,protein,x_id,target_x,beta,fdr,nc_fdr,nc_beta,ppi,r2,...,covs_crispr,pval_crispr,fdr_crispr,nc_beta_crispr,nc_lr_crispr,nc_pval_crispr,nc_fdr_crispr,r2_crispr,ppi_crispr,skew_crispr
0,1377;Afatinib;GDSC1,RT22_HUMAN,MRPS22,EGFR;ERBB2,-0.207507,0.77983,9.927609e-09,-0.811231,-,0.617294,...,21,0.244293,0.892241,-0.365466,34.810174,3.63469e-09,6.1e-05,0.59332,3,-1.779083
1,2022;PF-06747775;GDSC2,ICT1_HUMAN,MRPL58,EGFR,-0.271471,0.370333,2.408008e-07,-0.845908,-,0.462193,...,21,0.16247,0.803743,-0.381459,29.8452,4.679539e-08,8.7e-05,0.59332,-,-1.779083
2,1032;Afatinib;GDSC1,ICT1_HUMAN,MRPL58,EGFR;ERBB2,-0.251152,0.664989,1.212406e-11,-0.938139,-,0.551671,...,21,0.16247,0.803743,-0.381459,29.8452,4.679539e-08,8.7e-05,0.59332,-,-1.779083


In [45]:
drug_res_filtered.shape

(215, 16)

In [46]:
qgrid.show_grid(drug_res_filtered)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [72]:
drug_res_filtered.to_csv(
    "../../result_files/drug_tissue/drug_tissue_sensitive_associations_filtered_10pc_nc.csv",
    index=False)

### resistant

In [209]:
res_df = pd.merge(res_df,
                  lm_res[['y_id', 'x_protein', 'fdr', 'beta', 'skew', 'ppi', 'r2']],
                  left_on=['drug_id', 'protein'],
                  right_on=['y_id', 'x_protein'])

In [210]:
res_df['protein_last_q_minus_max_c'] = res_df['protein_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['protein_first_q_minus_max_c'] = res_df['protein_first_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['rna_last_q_minus_max_c'] = res_df['rna_last_q'] + res_df[
    'IC50_mean'] - res_df['max_conc']
res_df['last_q_delta'] = res_df['protein_last_q_minus_max_c'] - res_df['rna_last_q_minus_max_c']

In [212]:
corr_dict_merged = pd.read_csv("../../data/protein_rna_correlations.csv")

target_proteins = corr_dict_merged[corr_dict_merged['corr_diff'].abs(
) < 0.1].sort_values('corr_avg').head(1000)['protein'].values
drug_meta = pd.read_csv("../../data/drug/drug_info.csv")
FDA_approved_id = drug_meta[drug_meta['FDA_Approved']=='Yes']['drug_id'].values

In [214]:
res_df['product'] = res_df['protein_last_q_minus_max_c'] * res_df['protein_first_q_minus_max_c']

In [216]:
res_df[(res_df['last_q_delta'] > 2) & (res_df['protein'].isin(target_proteins))
       & (res_df['fdr'] < 0.1) & (res_df['beta'] > 0)
       & (res_df['skew'] < -1) & (~pd.isna(res_df['protein_first_4q_std'])) &
       (res_df['protein_first_4q_std'] < 1.5) &
       (res_df['protein_last_q_minus_max_c'] > 0) &
       (res_df['protein_first_q_minus_max_c'] < 0) &
       (res_df['tissue'] != 'Haematopoietic and Lymphoid')].sort_values(
           'product', ascending=True).head(50)

Unnamed: 0,drug_id,protein,tissue,protein_first_q,protein_last_q,rna_first_q,rna_last_q,protein_first_4q_std,rna_first_4q_std,IC50_mean,...,x_protein,fdr,beta,skew,ppi,r2,protein_last_q_minus_max_c,protein_first_q_minus_max_c,rna_last_q_minus_max_c,product
649,2354;MCL1_8070;GDSC2,SRP14_HUMAN,Peripheral Nervous System,-1.234139,2.423353,1.371667,-0.724653,1.270121,1.258893,2.168161,...,SRP14_HUMAN,0.039105,0.557233,-1.01447,3,0.707725,2.288928,-1.368563,-0.859077,-3.132543
1078,2354;MCL1_8070;GDSC2,DNJC9_HUMAN,Peripheral Nervous System,-1.234139,2.359702,-1.234139,-0.724653,0.78609,1.277398,2.168161,...,DNJC9_HUMAN,0.067165,0.333336,-1.01447,-,0.707725,2.225277,-1.368563,-0.859077,-3.045433
603,1427;AZD5582;GDSC1,CSN4_HUMAN,Liver,-0.944097,1.171563,-0.944097,-0.871474,0.083337,1.073395,2.507706,...,CSN4_HUMAN,0.037248,0.369139,-1.084352,-,0.491294,1.376685,-0.738976,-0.666352,-1.017337


# correlation analysis

In [27]:
res_tissue = pickle.load(open("../../data/correlation/res_tissue_level.pkl", "rb"))

In [30]:
res_tissue[(res_tissue['drug_id']=='1909;Venetoclax;GDSC2') & (res_tissue['gene']=='TSNAX')]

Unnamed: 0,drug_id,gene,tissue,pearson_correlation,pearson_pval,spearman_correlation,spearman_pval,count
6637,1909;Venetoclax;GDSC2,TSNAX,Lung,-0.302149,2.2e-05,-0.313771,1e-05,191
3212,1909;Venetoclax;GDSC2,TSNAX,Haematopoietic and Lymphoid,-0.195409,0.011635,-0.245228,0.00145,166
44312,1909;Venetoclax;GDSC2,TSNAX,Peripheral Nervous System,-0.420851,0.020565,-0.405562,0.026182,30
75137,1909;Venetoclax;GDSC2,TSNAX,Testis,0.999025,0.028121,1.0,0.0,3
16912,1909;Venetoclax;GDSC2,TSNAX,Bone,-0.327659,0.034154,-0.327099,0.034479,42
13487,1909;Venetoclax;GDSC2,TSNAX,Breast,-0.265103,0.060098,-0.154404,0.279327,51
61437,1909;Venetoclax;GDSC2,TSNAX,Soft Tissue,-0.365909,0.148619,-0.56775,0.017438,17
34037,1909;Venetoclax;GDSC2,TSNAX,Central Nervous System,0.17398,0.195557,0.211086,0.114986,57
10062,1909;Venetoclax;GDSC2,TSNAX,Stomach,-0.235414,0.227842,-0.169671,0.388048,28
64862,1909;Venetoclax;GDSC2,TSNAX,Biliary Tract,-0.581339,0.303928,-0.307794,0.614384,5
