In [1]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(precision=2)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

sns.set_style('ticks')

%matplotlib inline
import warnings
# Suppress every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')
import matplotlib as mpl
# Use 300 dpi both for figures rendered in the notebook and for figures written with savefig.
mpl.rcParams['figure.dpi']= 300
mpl.rc("savefig", dpi=300)

##### Read files and select drugs

In [2]:
# Model whose predictions are analysed; it is part of the input and output file names.
model_name = 'RWEN'

# Column of the drug-statistics table used as the reference dosage.
# Options: log2_median_ic50_3f_hn | log2_median_ic50_hn
ref_type = 'log2_median_ic50_hn'

# Set to True to shift the dosage, as the GDSC experiment (Syto60) is less sensitive.
dosage_shifted = False

In [3]:
# Per-drug statistics table; its first column is used as the index and cast to str.
drug_stat_path = '../preprocessed_data/GDSC/hn_drug_stat.csv'
drug_info_df = pd.read_csv(drug_stat_path, index_col=0)
drug_info_df.index = drug_info_df.index.astype(str)

# Lookup from drug ID (string) to the value of the 'Drug Name' column.
drug_id_name_dict = {
    drug_id: drug_name
    for drug_id, drug_name in zip(drug_info_df.index, drug_info_df['Drug Name'])
}
print(drug_info_df.shape)

(81, 27)


In [4]:
# Drug IDs of interest (given as integers).
tested_drug_list = [1032, 1007, 133, 201, 1010, 182, 301, 302]

# Sanity check: show every tested drug ID that is absent from the drug table
# (an empty list means all of them are present).
known_drug_ids = drug_info_df.index.astype(int)
[drug_id for drug_id in tested_drug_list if drug_id not in known_drug_ids]

[]

##### Read predicted IC50

In [5]:
# Name of the result sub-directory (under ../result/HN_model/) to read predictions from
# and to write outputs to.
norm_type = 'cell_TPM'

In [6]:
# Directory that receives every CSV written by this notebook.
out_dir = '../result/HN_model/{}/'.format(norm_type)

# Model predictions: first CSV column becomes the row index, remaining columns are drug IDs.
pred_path = '../result/HN_model/{}/{}_pred.csv'.format(norm_type, model_name)
cadrres_cell_df = pd.read_csv(pred_path, index_col=0)

In [7]:
# Preview the first rows of the prediction table (rows: cell IDs, columns: drug IDs).
cadrres_cell_df.head()

Unnamed: 0,1007,133,201,1010,182,301,302,1012
RHH2176,-6.395335,-6.514972,-12.46467,-6.430358,-3.722897,0.708929,-0.279589,-0.499479
RHH2177,-10.289315,-6.113236,-9.612803,0.756091,-4.753741,-3.594752,-0.380591,-0.011896
RHH2178,-10.09869,-5.891169,-12.298502,1.642715,-3.403522,1.604877,0.14318,-1.24775
RHH2179,-9.486539,-5.70574,-12.306209,1.522135,-5.242828,-1.395477,1.562695,-2.288641
RHH2180,-11.620982,-6.84166,-7.702711,3.060791,-4.710782,-1.831584,-5.050828,-1.279218


In [9]:
# Column labels (drug IDs) and row labels of the prediction table.
# NOTE: despite its name, `cluster_list` holds the row index of the per-cell table.
drug_list = cadrres_cell_df.columns
cluster_list = cadrres_cell_df.index
print(drug_list.size, cluster_list.size)

# Restrict and reorder the drug table so its rows line up with the prediction columns;
# .loc raises a KeyError if a predicted drug is missing from the drug table.
drug_info_df = drug_info_df.loc[drug_list]
cadrres_cell_df = cadrres_cell_df.loc[:, drug_list]

8 1116


In [10]:
if dosage_shifted:
    # Optional dosage shift. The predictions are compared against a log2-scale reference
    # (see `ref_type`), so subtracting 2 lowers every predicted value 4-fold
    # (the original comment read "Shift by 4 uM").
    # Fix: the shift previously targeted the undefined name `cadrres_cluster_df`, which
    # raised a NameError whenever `dosage_shifted` was True; the frame used downstream is
    # `cadrres_cell_df`.
    # NOTE: this reassigns `cadrres_cell_df`, so re-running the cell shifts it again.
    cadrres_cell_df = cadrres_cell_df - 2

##### Read cell cluster % in each patient

In [12]:
# freq_df = pd.read_excel('../preprocessed_data/HN_patient_specific/percent_patient_tpm_cluster.xlsx', index_col=[0, 1]).reset_index()
# freq_df = freq_df.pivot(index='patient_id', columns='cluster', values='percent').fillna(0) / 100

# patient_list = freq_df.index

# freq_df.head()

In [13]:
# Per-cell metadata, restricted to (and ordered like) the rows of the prediction table.
cell_info_path = '../preprocessed_data/HN_patient_specific/cell_info.csv'
cell_info_df = pd.read_csv(cell_info_path, index_col=0).loc[cadrres_cell_df.index]
cell_info_df.head()

Unnamed: 0,patient_id,cell_line_id,origin,batch
RHH2176,HN120,HN120P,Primary,RHH
RHH2177,HN120,HN120P,Primary,RHH
RHH2178,HN120,HN120P,Primary,RHH
RHH2179,HN120,HN120P,Primary,RHH
RHH2180,HN120,HN120P,Primary,RHH


In [14]:
# Distinct patient IDs, in order of first appearance in the cell metadata.
patient_list = cell_info_df.loc[:, 'patient_id'].unique()
patient_list

array(['HN120', 'HN137', 'HN148', 'HN159', 'HN160', 'HN182'], dtype=object)

##### List all pairs of patient and drug

In [15]:
# One reference value per drug, taken from the `ref_type` column. It is subtracted from
# every row by NumPy broadcasting, which relies on the drug table being in the same
# order as the prediction columns.
reference_values = drug_info_df[ref_type].values
pred_delta_df = pd.DataFrame(
    cadrres_cell_df.values - reference_values,
    index=cluster_list,
    columns=drug_list,
)

# Logistic transform in log2 space: delta == 0 gives 50, a more negative delta gives a
# lower value. "cv" presumably stands for cell viability (%) -- TODO confirm.
pred_cv_df = 100 / (1 + np.power(2, -pred_delta_df))

# Complement of the value above, i.e. the predicted kill percentage.
pred_kill_df = 100 - pred_cv_df


In [16]:
# Display the predicted kill percentage table (rows: cells, columns: drug IDs).
pred_kill_df

Unnamed: 0,1007,133,201,1010,182,301,302,1012
RHH2176,8.666278,92.867722,93.898495,97.080501,62.941906,88.168353,90.989682,69.828169
RHH2177,58.516692,90.788528,68.068353,18.586010,77.630330,99.325077,91.547421,62.273231
RHH2178,55.277508,89.417912,93.204274,10.990625,57.648288,80.018420,88.281220,79.540035
RHH2179,44.709356,88.138675,93.238032,11.835311,82.966591,96.973801,73.796178,88.887303
RHH2180,78.023937,94.229574,36.191544,4.416577,77.108980,97.745457,99.638682,79.892713
...,...,...,...,...,...,...,...,...
RHO713,37.870813,91.607442,17.584649,53.926024,51.019792,56.454351,98.595270,68.886323
RHO714,48.434243,82.894292,92.994384,52.192966,26.767225,92.893844,40.117381,81.908822
RHO715,77.328245,88.541155,61.259013,7.754993,65.461802,99.658395,95.397265,83.287039
RHO716,50.734027,84.971952,80.151082,33.738551,30.443759,81.050057,62.068290,83.559756


In [17]:
# Write the per-cell predicted kill percentages into the result directory.
# NOTE(review): unlike the patient-level export at the end of the notebook, this file name
# does not depend on `dosage_shifted` -- confirm that overwriting the same file is intended.
pred_kill_df.to_csv(out_dir + 'pred_drug_kill_{}_{}_cell.csv'.format(ref_type, model_name))

In [18]:
def _patient_mean_long(per_cell_df, cell_patient_df, value_name):
    """Average a cell-by-drug table per patient and return it in long format.

    per_cell_df     : DataFrame indexed by cell, one column per drug.
    cell_patient_df : DataFrame indexed by cell with a single 'patient_id' column.
    value_name      : name given to the value column of the result.

    Returns a DataFrame with columns ['patient', 'drug_id', value_name].
    """
    # Attach the patient of each cell (inner join on the cell index).
    annotated = pd.merge(per_cell_df, cell_patient_df, left_index=True, right_index=True)
    # Mean over all cells of a patient, then one row per (patient, drug) pair.
    long_df = annotated.groupby('patient_id').mean().stack().reset_index()
    long_df.columns = ['patient', 'drug_id', value_name]
    return long_df


cell_patient_df = cell_info_df[['patient_id']]
pred_pateint_delta_df = _patient_mean_long(pred_delta_df, cell_patient_df, 'delta')
pred_pateint_kill_df = _patient_mean_long(pred_kill_df, cell_patient_df, 'kill')

In [19]:
# One row per (patient, drug) pair carrying both the mean delta and the mean kill percentage.
pair_keys = ['patient', 'drug_id']
single_drug_pred_df = pred_pateint_delta_df.merge(pred_pateint_kill_df, on=pair_keys)
single_drug_pred_df.head()

Unnamed: 0,patient,drug_id,delta,kill
0,HN120,1007,-0.462341,56.818499
1,HN120,133,-3.107634,87.297331
2,HN120,201,-2.125949,76.854282
3,HN120,1010,1.480563,33.41007
4,HN120,182,-1.967096,75.642212


In [20]:
# Look up the name of every drug ID; an ID missing from the lookup raises a KeyError.
drug_names = list(map(drug_id_name_dict.__getitem__, single_drug_pred_df['drug_id'].values))
single_drug_pred_df.loc[:, 'drug_name'] = drug_names

##### Save results

In [21]:
# The output file name carries a `_shifted` suffix when the dosage shift is enabled.
out_name_template = (
    'pred_drug_kill_{}_{}_shifted.csv' if dosage_shifted else 'pred_drug_kill_{}_{}.csv'
)
single_drug_pred_df.to_csv(out_dir + out_name_template.format(ref_type, model_name), index=False)