In [103]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(precision=2)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

sns.set_style('ticks')

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300
mpl.rc("savefig", dpi=300)

##### Read files and select drugs

In [104]:
# Name of the column in drug_info_df that supplies the per-drug reference dosage
# subtracted from the predictions further down. Options listed by the original author:
# log2_median_ic50, log2_median_ic50_9f, log2_median_ic50_hn, log2_median_ic50_3f_hn,
# log2_median_ic50_9f_hn, log2_max_conc
ref_type = 'log2_median_ic50_3f_hn'
# Model identifier; it is interpolated into the prediction input path and the output CSV path.
model_name = 'hn_drug_cw_dw10_100000_model'

# Per-patient cluster filter: a cluster is kept only when its fraction of the patient's
# cells is >= this cutoff (0.05 == 5%); smaller clusters are not considered.
freq_cutoff = 0.05

# Shift the dosage, as the GDSC experiment (Syto60) is less sensitive.
# When True, the predicted values are shifted and the output file gets a "_shifted" suffix.
dosage_shifted = True

In [105]:
# Load the per-drug statistics table; the first CSV column becomes the index (drug ID).
drug_stat_path = '../preprocessed_data/GDSC/hn_drug_stat.csv'
drug_info_df = pd.read_csv(drug_stat_path, index_col=0)

# Use string drug IDs so they can be matched against the string column labels
# of the prediction table read from CSV.
drug_info_df.index = drug_info_df.index.astype(str)

# Lookup table: drug ID (str) -> value of the 'Drug Name' column.
drug_id_name_dict = {
    drug_id: drug_name
    for drug_id, drug_name in zip(drug_info_df.index, drug_info_df['Drug Name'])
}
print(drug_info_df.shape)

(81, 27)


In [106]:
# Drug IDs of interest (integer form). The original listed them as two concatenated
# groups: [1032, 1007, 133, 201, 1010] and [182, 301, 302].
tested_drug_list = [1032, 1007, 133, 201, 1010, 182, 301, 302]

# Sanity check: display any listed ID that is absent from the drug table.
# An empty list means every listed drug is present.
known_drug_ids = drug_info_df.index.astype(int)
[drug_id for drug_id in tested_drug_list if drug_id not in known_drug_ids]

[]

##### Read predicted IC50

In [107]:
# Load the per-cluster predictions for the configured model
# (rows are indexed by the first CSV column, columns are drug IDs).
pred_cluster_path = '../result/HN_model/pred_gdsc_no_bias_{}.csv'.format(model_name)
cadrres_cluster_df = pd.read_csv(pred_cluster_path, index_col=0)

In [108]:
# Preview the first rows of the loaded prediction table (one row per cluster, one column per drug ID).
cadrres_cluster_df.head()

Unnamed: 0,1001,1003,1004,1006,1007,1010,1012,1014,1015,1016,...,299,301,302,303,305,306,308,328,331,346
A1,10.15136,-6.161314,-6.672447,0.05957,-7.530996,-0.082353,0.236646,2.444571,3.236513,-2.192601,...,1.943472,3.044039,1.484686,3.643927,3.901863,2.835411,-0.667095,-0.980774,2.283297,-3.048499
A2,10.640683,-6.376103,-5.735676,0.219908,-6.031828,1.23975,0.304525,2.960092,3.282754,-2.074278,...,-0.232393,2.510217,0.034483,2.753298,3.536657,2.669542,-1.105378,-1.708509,0.885358,-4.39544
B1,10.554961,-6.007625,-5.037957,0.528611,-6.438364,0.225921,0.885083,1.829933,2.722695,-1.45385,...,2.05131,3.081251,1.793193,3.821824,4.21238,3.023686,-0.075776,-0.995107,2.306524,-2.431668
B2,10.941645,-4.527641,-4.322077,1.489009,-5.893981,0.618843,1.438822,2.413475,3.54676,-0.535506,...,2.990144,3.4141,2.559839,4.230664,4.469257,3.242545,0.488697,-0.210191,2.930045,-1.908269
C1,11.157908,-5.116025,-5.631796,0.350056,-7.220292,0.435647,1.478398,0.614453,2.320009,-1.291063,...,5.989048,5.039493,4.991006,5.529244,5.3982,4.212228,0.950321,1.220073,4.648175,0.321519


In [109]:
# Canonical orderings used by the rest of the notebook.
cluster_list = cadrres_cluster_df.index
drug_list = drug_info_df.index
print(len(drug_list), len(cluster_list))

# Put the prediction columns in exactly the same drug order as drug_info_df.
# The later subtraction works on raw .values arrays, so it relies on this
# positional alignment rather than on label matching.
cadrres_cluster_df = cadrres_cluster_df.loc[:, drug_list]
drug_info_df = drug_info_df.loc[drug_list]

81 20


In [110]:
# Optional dosage shift of the predicted values.
# NOTE: this reassigns cadrres_cluster_df from itself, so re-running this cell
# applies the shift again; re-run the loading cells first if it must be repeated.
if dosage_shifted:
    # Subtract 2 from every predicted value. If the values are log2-scaled (as the
    # `log2_*` reference column names suggest -- TODO confirm), this is a 4-fold
    # reduction rather than an absolute 4 uM offset.
    cadrres_cluster_df = cadrres_cluster_df - 2
# An alternative shift of 3 (instead of 2) was previously tried and is not in use.



##### Read cell cluster % in each patient

In [111]:
# Read the long-format table; its first two columns are loaded as the index and
# then turned back into ordinary columns for pivoting.
freq_path = '../preprocessed_data/HN_patient_specific/percent_patient_cluster.xlsx'
freq_long_df = pd.read_excel(freq_path, index_col=[0, 1]).reset_index()

# Wide table: one row per patient, one column per cluster. Missing patient/cluster
# pairs become 0, and the percentages are converted to fractions.
freq_wide_df = freq_long_df.pivot(index='patient_id', columns='cluster', values='percent')
freq_df = freq_wide_df.fillna(0) / 100

patient_list = freq_df.index

freq_df.head()

cluster,A1,A2,B1,B2,C1,C2,D1,D2,E1,E2,F1,F2,F3,G1,H1,H2,I1,I2,J1
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
HN120,0.022857,0.474286,0.0,0.0,0.0,0.011429,0.0,0.0,0.337143,0.154286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HN137,0.0,0.0,0.0,0.0,0.275862,0.235632,0.0,0.0,0.0,0.0,0.218391,0.166667,0.103448,0.0,0.0,0.0,0.0,0.0,0.0
HN148,0.0,0.0,0.296703,0.236264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.467033,0.0,0.0,0.0,0.0,0.0
HN159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322368,0.223684,0.0,0.0,0.453947
HN160,0.0,0.0,0.0,0.0,0.0,0.0,0.725275,0.274725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### List all pairs of patient and drug

In [112]:
# Per-cluster, per-drug difference between the predicted value and the drug's
# reference dosage (column `ref_type`). The subtraction is positional on raw arrays,
# so the columns of cadrres_cluster_df must already follow drug_list order.
pred_delta_df = pd.DataFrame(
    cadrres_cluster_df.values - drug_info_df[ref_type].values,
    index=cluster_list,
    columns=drug_list,
)
# Base-2 logistic transform of delta onto a 0-100 scale (exactly 50 when delta == 0);
# "cv" presumably stands for cell viability in percent -- TODO confirm.
pred_cv_df = 100 / (1 + np.power(2, -pred_delta_df))
# Complement of pred_cv_df on the same 0-100 scale.
pred_kill_df = 100 - pred_cv_df


def _fmt14(value):
    """Format a number with 14 significant digits (format spec '.14')."""
    return "{:.14}".format(value)


def _join_fmt14(values):
    """Format each value with _fmt14 and join the pieces with '|'."""
    return '|'.join([_fmt14(v) for v in values])


rows = []
for p in patient_list:
    # Keep only the clusters whose fraction in this patient reaches the cutoff.
    p_freq = freq_df.loc[p]
    p_freq_kept = p_freq[p_freq >= freq_cutoff]
    c_list = p_freq_kept.index.values
    freqs = p_freq_kept.values

    # NOTE: the kept fractions are deliberately NOT renormalised to sum to 1
    # (the original carried `freqs = freqs / np.sum(freqs)` as a disabled option).

    # Matrices are (kept clusters x drugs); the weighted vectors hold one value per drug.
    p_pred_delta_mat = pred_delta_df.loc[c_list].values
    p_pred_delta_weighted = np.matmul(p_pred_delta_mat.T, freqs)

    p_pred_kill_mat = pred_kill_df.loc[c_list].values
    p_pred_kill_weighted = np.matmul(p_pred_kill_mat.T, freqs)

    # These two strings are identical for every drug of the patient.
    p_cluster_str = '|'.join(c_list)
    p_freq_str = _join_fmt14(freqs)

    for d_i, d_id in enumerate(drug_list):
        rows.append([
            p,
            d_id,
            p_cluster_str,
            p_freq_str,
            _join_fmt14(p_pred_delta_mat[:, d_i]),
            _fmt14(p_pred_delta_weighted[d_i]),
            _join_fmt14(p_pred_kill_mat[:, d_i]),
            _fmt14(p_pred_kill_weighted[d_i]),
        ])

In [113]:
# One row per (patient, drug) pair. The 'cluster*' columns hold '|'-joined
# per-cluster values; 'delta' and 'kill' hold the frequency-weighted values.
pred_columns = [
    'patient', 'drug_id',
    'cluster', 'cluster_p',
    'cluster_delta', 'delta',
    'cluster_kill', 'kill',
]
single_drug_pred_df = pd.DataFrame(rows, columns=pred_columns)
# Re-select the columns to pin their order explicitly.
single_drug_pred_df = single_drug_pred_df[pred_columns]
single_drug_pred_df.head()

Unnamed: 0,patient,drug_id,cluster,cluster_p,cluster_delta,delta,cluster_kill,kill
0,HN120,1001,A2|E1|E2,0.47428571428571|0.33714285714286|0.1542857142...,-0.71413907698934|-0.57893312323327|-0.7078149...,-0.64309486065377,62.128441582274|59.899656305802|62.025245855686,59.230982922712
1,HN120,1003,A2|E1|E2,0.47428571428571|0.33714285714286|0.1542857142...,-0.20364956952201|-0.45564001456729|-0.9229866...,-0.39260751904773,53.523129937829|57.830658129053|65.470047761467,54.983542308649
2,HN120,1004,A2|E1|E2,0.47428571428571|0.33714285714286|0.1542857142...,0.00026898931738728|-0.23123715937791|-0.81618...,-0.20375849124346,49.99533877034|53.998478173381|63.77792178796,51.757298391101
3,HN120,1006,A2|E1|E2,0.47428571428571|0.33714285714286|0.1542857142...,0.14750215730029|-0.69020270204635|-0.88728162...,-0.29963362421687,47.446206602315|61.737283509392|64.908434098862,53.331786261231
4,HN120,1007,A2|E1|E2,0.47428571428571|0.33714285714286|0.1542857142...,2.3461323651774|1.6423130066609|0.96767554099491,1.8157296760548,16.434952355502|24.262074338124|33.833081010266,21.194609249904


In [114]:
# Attach the human-readable drug name for every row; a drug ID that is missing
# from drug_id_name_dict raises KeyError.
drug_names = [drug_id_name_dict[drug_id] for drug_id in single_drug_pred_df['drug_id'].values]
single_drug_pred_df.loc[:, 'drug_name'] = drug_names

##### Save results

In [115]:
# Pick the output path template: shifted runs get a "_shifted" suffix so they do
# not overwrite unshifted results.
out_path_template = (
    '../result/HN_model/pred_drug_kill_{}_{}_shifted.csv'
    if dosage_shifted
    else '../result/HN_model/pred_drug_kill_{}_{}.csv'
)
# Write the table without the positional row index.
single_drug_pred_df.to_csv(out_path_template.format(ref_type, model_name), index=False)