In [1]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(precision=2)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

sns.set_style('ticks')

%matplotlib inline
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation and pandas warnings that may point at real problems.
warnings.filterwarnings('ignore')
import matplotlib as mpl
# Use 300 dpi both for figures rendered in the notebook and for saved figures.
mpl.rcParams['figure.dpi']= 300
mpl.rc("savefig", dpi=300)

##### Read files and select drugs

In [2]:
# Column of drug_info_df used as the reference dosage that predictions are compared to.
# Known columns: log2_median_ic50, log2_median_ic50_9f, log2_median_ic50_hn,
#                log2_median_ic50_3f_hn, log2_median_ic50_9f_hn, log2_max_conc
# Usual choices: log2_median_ic50_3f_hn | log2_median_ic50_hn
ref_type = 'log2_median_ic50_hn'

# Model whose prediction file is loaded. Alternatives:
# hn_drug_cw_dw10_100000_model | hn_drug_cw_dw1_100000_model | hn_drug_cw_dwsim10_100000_model
model_name = 'hn_drug_cw_dw10_100000_model'

# Set to True to shift the dosage, as the GDSC experiment (Syto60) is less sensitive
dosage_shifted = False

In [3]:
# Per-drug statistics table; the first CSV column becomes the index (drug IDs).
drug_info_df = pd.read_csv('../preprocessed_data/GDSC/hn_drug_stat.csv', index_col=0)
# Drug IDs are handled as strings from here on.
drug_info_df.index = drug_info_df.index.astype(str)

# Lookup: drug ID (str) -> drug name
drug_id_name_dict = drug_info_df['Drug Name'].to_dict()
print(drug_info_df.shape)

(81, 27)


In [4]:
# Drug IDs of interest (kept as two groups, as originally written).
tested_drug_list = [1032, 1007, 133, 201, 1010] + [182, 301, 302]

# Sanity check: show the tested drugs that are missing from drug_info_df (expected: []).
# The string index is converted to integers once, not once per tested drug.
available_drug_ids = set(drug_info_df.index.astype(int))
[d for d in tested_drug_list if d not in available_drug_ids]

[]

##### Read predicted IC50

In [5]:
# Name of the sub-folder under ../result/HN_model/ that predictions are read from
# and results are written to (presumably the expression normalization used — TODO confirm).
norm_type = 'cell_TPM'

In [6]:
# Folder for this normalization type: predictions are read from it and outputs go back into it.
out_dir = '../result/HN_model/{}/'.format(norm_type)
# Predicted values for the selected model; the first CSV column becomes the index.
cadrres_cell_df = pd.read_csv(out_dir + 'pred_gdsc_no_bias_{}.csv'.format(model_name), index_col=0)

In [7]:
# Preview the first rows of the loaded prediction table.
cadrres_cell_df.head()

Unnamed: 0,1001,1003,1004,1006,1007,1010,1012,1014,1015,1016,...,299,301,302,303,305,306,308,328,331,346
RHH2176,10.843385,-6.879123,-5.535641,-0.435868,-5.928303,1.524818,0.550205,1.791061,2.707693,-1.093647,...,0.886537,2.474108,1.464313,2.902817,3.398912,2.633714,-0.767147,-2.297105,1.418391,-4.079475
RHH2177,10.474557,-7.527169,-6.910859,-0.905919,-7.270812,0.719401,0.15581,0.784177,1.74861,-2.465134,...,1.664287,3.003126,1.630304,3.344067,3.843642,3.096594,-0.734055,-1.758856,1.831446,-3.674684
RHH2178,10.344092,-8.172785,-7.413486,-1.686217,-8.234032,0.032052,0.287347,-0.055609,1.185569,-2.81686,...,2.969341,3.556121,2.647798,4.025542,4.234615,3.234458,-0.44615,-1.30017,2.665244,-1.997744
RHH2179,10.523248,-7.165827,-6.176543,-0.431112,-6.661878,1.091142,0.583582,2.332135,3.083348,-1.690173,...,-0.297377,1.935227,0.005749,2.598242,3.179443,2.147339,-1.168008,-3.003178,0.731648,-4.746325
RHH2180,10.892141,-6.086613,-5.476581,0.044589,-6.668638,0.789689,0.84495,2.550936,3.141692,-1.812517,...,2.212161,3.617715,2.072394,3.981815,4.507488,3.369766,0.010497,0.070235,2.338329,-1.610985


In [8]:
# Drug IDs come from the drug table, row labels (cells) from the prediction table.
drug_list, cluster_list = drug_info_df.index, cadrres_cell_df.index
print(drug_list.size, cluster_list.size)

# Keep only the drugs in drug_list, in the same order, in both tables.
cadrres_cell_df = cadrres_cell_df.loc[:, drug_list]
drug_info_df = drug_info_df.loc[drug_list]

81 1116


In [9]:
if dosage_shifted:
    # Lower every predicted value by 2. The values are later used on a log2 scale
    # (see np.power(2, ...) below), so this is a 4-fold shift.
    # The original line referenced `cadrres_cluster_df`, which is never defined and
    # raised NameError; the table used by the rest of the notebook is cadrres_cell_df.
    # NOTE: this is not idempotent — re-running the cell shifts the values again.
    cadrres_cell_df = cadrres_cell_df - 2

##### Read cell cluster % in each patient

In [10]:
# NOTE(review): disabled code, kept for reference only. It loaded per-patient cluster
# percentages; patient_list is now derived from cell_info_df in a later cell.
# freq_df = pd.read_excel('../preprocessed_data/HN_patient_specific/percent_patient_tpm_cluster.xlsx', index_col=[0, 1]).reset_index()
# freq_df = freq_df.pivot(index='patient_id', columns='cluster', values='percent').fillna(0) / 100

# patient_list = freq_df.index

# freq_df.head()

In [11]:
# Cell annotations, restricted to (and ordered like) the rows of the prediction table.
cell_info_df = (
    pd.read_csv('../preprocessed_data/HN_patient_specific/cell_info.csv', index_col=0)
    .loc[cadrres_cell_df.index]
)
cell_info_df.head()

Unnamed: 0,patient_id,cell_line_id,origin,batch
RHH2176,HN120,HN120P,Primary,RHH
RHH2177,HN120,HN120P,Primary,RHH
RHH2178,HN120,HN120P,Primary,RHH
RHH2179,HN120,HN120P,Primary,RHH
RHH2180,HN120,HN120P,Primary,RHH


In [12]:
# Distinct patient IDs, in order of first appearance in cell_info_df.
patient_list = cell_info_df.loc[:, 'patient_id'].unique()
patient_list

array(['HN120', 'HN137', 'HN148', 'HN159', 'HN160', 'HN182'], dtype=object)

##### List all pairs of patient and drug

In [13]:
# Difference between each predicted value and the drug's reference dosage (ref_type column).
# The subtraction is positional (raw arrays), so it relies on cadrres_cell_df columns and
# drug_info_df rows both following drug_list order, as arranged in an earlier cell.
pred_delta_df = pd.DataFrame(
    cadrres_cell_df.values - drug_info_df[ref_type].values,
    index=cluster_list,
    columns=drug_list,
)

# Map the delta to a percentage with a logistic curve in base 2:
# cv = 100 / (1 + 2^-delta) (presumably cell viability — TODO confirm); kill is its complement.
pow2_neg_delta = np.power(2, -pred_delta_df)
pred_cv_df = 100 / (1 + pow2_neg_delta)
pred_kill_df = 100 - pred_cv_df

# rows = []
# for p in patient_list:
#     c_list = cell_info_df[cell_info_df['patient_id']==p].index.values

#     p_pred_delta_weighted = np.matmul(pred_delta_df.loc[c_list].values.T, freqs)
#     p_pred_delta_mat = pred_delta_df.loc[c_list].values
    
#     p_pred_kill_weighted = np.matmul(pred_kill_df.loc[c_list].values.T, freqs)
#     p_pred_kill_mat = pred_kill_df.loc[c_list].values

#     for d_i, d_id in enumerate(drug_list):
#         rows += [[p, d_id] + [] + [] + 
#                  ['|'.join(["{:.14}".format(f) for f in p_pred_delta_mat[:, d_i]])] + 
#                  ["{:.14}".format(p_pred_delta_weighted[d_i])] +
#                  ['|'.join(["{:.14}".format(f) for f in p_pred_kill_mat[:, d_i]])] + 
#                  ["{:.14}".format(p_pred_kill_weighted[d_i])]
#                 ]

In [14]:
# Show the predicted kill table (one row per cell, one column per drug); pandas truncates the display.
pred_kill_df

Drug ID,1001,1003,1004,1006,1007,1010,1012,1014,1015,1016,...,299,301,302,303,305,306,308,328,331,346
RHH2176,34.834361,37.965723,24.607262,34.784597,6.423581,11.815919,52.785694,33.173945,45.444520,36.329534,...,95.546368,68.674404,75.093315,76.803491,76.546903,60.828523,65.077005,70.974659,74.111578,79.535817
RHH2177,40.837758,48.954844,45.848571,42.489605,14.826852,18.973903,59.505834,49.939759,61.823409,59.617496,...,92.599915,60.306537,72.879929,70.917850,70.570972,52.978117,64.553926,62.739731,68.253969,74.591670
RHH2178,43.039377,60.005519,54.536336,55.925896,25.339381,27.383085,57.291212,64.099501,70.523027,65.324885,...,83.509854,50.873591,57.034678,60.325226,64.648760,50.592609,59.867037,55.060877,54.674392,47.865941
RHH2179,40.024930,42.744688,33.728215,34.709861,10.244698,15.324443,52.208751,25.438272,39.100017,46.316444,...,97.989581,76.105366,89.231365,80.350771,79.167061,68.508485,71.100745,79.956310,82.168266,86.053696
RHH2180,34.071193,26.108648,23.855711,27.656829,10.287865,18.236191,47.682697,22.670096,38.141393,48.430347,...,89.539174,49.806332,66.420851,61.048326,60.216509,48.248655,52.083905,32.152733,60.207613,41.253499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RHO713,27.370033,2.586817,8.484583,4.131592,3.233271,14.181521,26.828476,10.440088,14.493304,28.072442,...,84.720316,47.895021,66.861596,50.583998,56.954617,49.418161,30.020958,15.505470,44.766470,28.071581
RHO714,24.220024,6.903385,15.891031,11.129257,7.184816,12.177604,25.885932,15.168345,20.071440,31.852675,...,72.219594,42.884191,45.662666,44.259588,49.208348,40.401124,17.011399,8.830582,40.664800,11.635733
RHO715,34.599963,10.788091,20.674076,12.784234,10.334362,18.199795,40.649312,8.824563,16.815734,37.896669,...,84.658860,49.649983,65.775978,54.869469,57.423419,48.196305,43.092166,22.548129,53.211157,39.548974
RHO716,24.342612,3.670202,9.744860,7.665307,4.845182,12.498439,33.356578,12.401965,17.871554,22.017538,...,52.839648,29.382100,28.379530,39.887497,45.976720,33.761751,32.907347,9.360438,28.987178,21.214643


In [15]:
# Write the cell-level kill predictions next to the model output.
cell_kill_csv = out_dir + 'pred_drug_kill_{}_{}_cell.csv'.format(ref_type, model_name)
pred_kill_df.to_csv(cell_kill_csv)

In [14]:
def _patient_mean_long(cell_level_df, value_name):
    """Average a cell-by-drug table per patient and return it in long format.

    cell_level_df is joined on its index with the 'patient_id' column of
    cell_info_df, averaged within each patient, then stacked so that every
    row is one (patient, drug) pair.

    Returns a DataFrame with columns ['patient', 'drug_id', value_name].
    """
    with_patient = pd.merge(cell_level_df, cell_info_df[['patient_id']], left_index=True, right_index=True)
    per_patient = with_patient.groupby('patient_id').mean()
    long_df = per_patient.stack().reset_index()
    long_df.columns = ['patient', 'drug_id', value_name]
    return long_df


# Variable names (including the 'pateint' spelling) are kept because later cells use them.
pred_pateint_delta_df = _patient_mean_long(pred_delta_df, 'delta')
pred_pateint_kill_df = _patient_mean_long(pred_kill_df, 'kill')

In [15]:
# Combine the per-patient delta and kill values into one row per (patient, drug) pair.
pair_keys = ['patient', 'drug_id']
single_drug_pred_df = pred_pateint_delta_df.merge(pred_pateint_kill_df, on=pair_keys)
single_drug_pred_df.head()

Unnamed: 0,patient,drug_id,delta,kill
0,HN120,1001,0.941144,34.518773
1,HN120,1003,1.660548,26.558972
2,HN120,1004,1.438976,28.542245
3,HN120,1006,1.629793,26.066463
4,HN120,1007,3.234392,10.897336


In [16]:
# Add the drug name for every row; a drug ID missing from the lookup raises KeyError.
single_drug_pred_df['drug_name'] = [
    drug_id_name_dict[drug_id] for drug_id in single_drug_pred_df['drug_id']
]

##### Save results

In [17]:
# Shifted runs get a '_shifted' file name so they do not overwrite unshifted results.
result_name = 'pred_drug_kill_{}_{}_shifted.csv' if dosage_shifted else 'pred_drug_kill_{}_{}.csv'
single_drug_pred_df.to_csv(out_dir + result_name.format(ref_type, model_name), index=False)