In [128]:
import os
import pandas as pd
import numpy as np
# Print numpy arrays with two decimal places.
np.set_printoptions(precision=2)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

sns.set_style('ticks')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas warnings that may point at real problems; consider narrowing the filter.
warnings.filterwarnings('ignore')
import matplotlib as mpl
# Use 300 dpi for figures shown in the notebook and for figures saved to disk.
mpl.rcParams['figure.dpi']= 300
mpl.rc("savefig", dpi=300)

##### Read files and select drugs

In [129]:
# Column of the drug table used as the per-drug reference value that the
# predictions are compared against.
# Candidate columns: log2_median_ic50, log2_median_ic50_9f, log2_median_ic50_hn,
# log2_median_ic50_3f_hn, log2_median_ic50_9f_hn, log2_max_conc
ref_type = 'log2_median_ic50_hn' # e.g. log2_median_ic50_3f_hn | log2_median_ic50_hn
# Selects the prediction file that is loaded and is embedded in the output file name.
model_name = 'hn_drug_cw_dw10_100000_model'

# For each patient, a cell cluster making up less than this fraction of the
# cells (0.05 = 5%) is not considered.
freq_cutoff = 0.05

# Shift the dosage, as the GDSC experiment (Syto60) is less sensitive.
# When True, 2 is subtracted from the predictions and the output file name
# gets a "_shifted" suffix.
dosage_shifted = False

In [130]:
# Load the per-drug statistics table. Drug IDs are turned into strings because
# they are later used as column labels to select from the prediction table.
drug_info_df = pd.read_csv('../preprocessed_data/GDSC/hn_drug_stat.csv', index_col=0)
drug_info_df.index = drug_info_df.index.astype(str)

# Lookup: drug ID (string) -> drug name.
drug_id_name_dict = drug_info_df['Drug Name'].to_dict()
print(drug_info_df.shape)

(81, 27)


In [131]:
# Drugs of interest; the expression below lists any of them that are missing
# from the drug table (an empty list means all are present).
tested_drug_list = [1032, 1007, 133, 201, 1010, 182, 301, 302]
known_drug_ids = drug_info_df.index.astype(int)
[drug_id for drug_id in tested_drug_list if drug_id not in known_drug_ids]

[]

##### Read predicted IC50

In [132]:
# Sub-directory of ../result/HN_model/ that holds the predictions to load.
# Results produced later in this notebook are written to the same directory.
# Available variants: 'TMM' | 'TMM_p95' | 'mat_norm' | 'mat_norm_p95' | 'mat_norm_log2_p95'
result_subdir = 'TMM'
out_dir = '../result/HN_model/{}/'.format(result_subdir)

# The input path is built from out_dir so that the file being read and the
# directory being written to cannot drift apart when the variant is switched.
cadrres_cluster_df = pd.read_csv(out_dir + 'pred_gdsc_no_bias_{}.csv'.format(model_name), index_col=0)

In [133]:
# Preview the loaded predictions: one row per cell cluster, one column per drug ID.
cadrres_cluster_df.head()

Unnamed: 0,1001,1003,1004,1006,1007,1010,1012,1014,1015,1016,...,299,301,302,303,305,306,308,328,331,346
A1,10.397889,-4.905275,-6.028838,1.040087,-6.963326,-0.062473,0.773485,2.688762,3.57076,-1.854418,...,0.582115,2.475985,0.196772,3.214172,3.718802,2.502996,-0.620006,-1.339431,1.635212,-3.8679
A2,9.908769,-6.886588,-7.1698,-0.348555,-7.5962,-0.0491,-0.044619,2.137408,2.964216,-2.318362,...,0.445048,2.08817,0.323746,2.815603,3.178974,2.229404,-1.208334,-2.491812,1.334162,-4.68135
B1,11.076265,-4.596297,-5.164717,1.649882,-6.223027,1.107286,2.094663,3.253375,4.34154,-1.436525,...,1.886014,2.87201,1.282556,4.197876,4.439383,2.970257,1.95838,0.793735,2.351196,-0.521394
B2,10.695237,-5.603614,-6.282297,0.846739,-7.06338,0.881319,1.676247,3.06877,4.197349,-1.874359,...,1.474709,2.500055,0.846012,3.873864,4.010081,2.570091,1.266259,-0.272646,2.024308,-1.428443
C1,10.679884,-5.869343,-4.79176,0.728225,-6.323232,0.428557,1.082646,2.12606,2.961553,-1.240544,...,2.010631,3.021958,1.746378,3.838943,4.306642,3.087817,0.179254,-0.85816,2.201932,-2.376438


In [134]:
# Drugs are taken from the drug table, clusters from the prediction table.
drug_list, cluster_list = drug_info_df.index, cadrres_cluster_df.index
print(drug_list.size, cluster_list.size)

# Put both tables in the same drug order; the element-wise arithmetic further
# down works on raw arrays and therefore depends on this ordering.
drug_info_df = drug_info_df.loc[drug_list]
cadrres_cluster_df = cadrres_cluster_df.loc[:, drug_list]

81 24


In [135]:
# Optional dosage shift (original note: "Shift by 4 uM").
# NOTE(review): the predictions are compared against log2 reference columns, so
# subtracting 2 looks like a 4-fold reduction rather than an absolute 4 uM shift;
# confirm the intended wording.
if dosage_shifted:
    cadrres_cluster_df = cadrres_cluster_df.sub(2)

##### Read cell cluster % in each patient

In [136]:
# Read the long-format table and reshape it into a patient x cluster matrix.
# Missing patient/cluster combinations count as 0, and the percentages are
# converted to fractions.
freq_long_df = pd.read_excel(
    '../preprocessed_data/HN_patient_specific/percent_patient_cluster.xlsx',
    index_col=[0, 1],
).reset_index()
freq_df = (
    freq_long_df
    .pivot(index='patient_id', columns='cluster', values='percent')
    .fillna(0)
    .div(100)
)

patient_list = freq_df.index

freq_df.head()

cluster,A1,A2,B1,B2,C1,C2,D1,D2,E1,E2,...,F3,G1,G2,H1,I1,I2,J1,J2,K1,L
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HN120,0.010989,0.005495,0.0,0.0,0.0,0.0,0.313187,0.175824,0.0,0.0,...,0.0,0.340659,0.120879,0.0,0.0,0.0,0.0,0.0,0.0,0.032967
HN137,0.0,0.0,0.0,0.0,0.0,0.0,0.005682,0.0,0.340909,0.085227,...,0.096591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364
HN148,0.0,0.0,0.0,0.0,0.313514,0.205405,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.459459,0.0,0.0,0.0,0.0,0.0,0.021622
HN159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.317365,0.185629,0.0,0.0,0.48503,0.011976
HN160,0.0,0.0,0.422222,0.414815,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162963


##### List all pairs of patient and drug

In [137]:
# Difference between each cluster's predicted value and the per-drug reference
# value (column `ref_type`). The subtraction is positional, so it relies on the
# prediction columns and the drug table rows both being in `drug_list` order.
reference_values = drug_info_df[ref_type].values
pred_delta_df = pd.DataFrame(
    cadrres_cluster_df.values - reference_values,
    columns=drug_list,
    index=cluster_list,
)
# Logistic curve in log2 space: a delta of 0 maps to 50, larger deltas approach 100.
pred_cv_df = 100 / (1 + np.power(2, -pred_delta_df))
pred_kill_df = 100 - pred_cv_df


def _fmt14(number):
    """Render a number with the "{:.14}" format used throughout the output table."""
    return "{:.14}".format(number)


def _pipe_join(numbers):
    """Format every number with `_fmt14` and join the results with '|'."""
    return '|'.join([_fmt14(number) for number in numbers])


rows = []
for patient in patient_list:
    # Keep only the clusters that reach the frequency cutoff for this patient.
    patient_freqs = freq_df.loc[patient]
    kept = patient_freqs[patient_freqs >= freq_cutoff]
    kept_clusters = kept.index.values
    kept_freqs = kept.values

    # The kept frequencies are intentionally NOT rescaled to sum to 1
    # (disabled alternative: kept_freqs = kept_freqs / np.sum(kept_freqs)).

    delta_mat = pred_delta_df.loc[kept_clusters].values
    kill_mat = pred_kill_df.loc[kept_clusters].values

    # Frequency-weighted sum over the kept clusters: one value per drug.
    delta_weighted = np.matmul(delta_mat.T, kept_freqs)
    kill_weighted = np.matmul(kill_mat.T, kept_freqs)

    # These two fields are the same for every drug of the patient.
    cluster_field = '|'.join(kept_clusters)
    freq_field = _pipe_join(kept_freqs)

    for drug_pos, drug_id in enumerate(drug_list):
        rows.append([
            patient,
            drug_id,
            cluster_field,
            freq_field,
            _pipe_join(delta_mat[:, drug_pos]),
            _fmt14(delta_weighted[drug_pos]),
            _pipe_join(kill_mat[:, drug_pos]),
            _fmt14(kill_weighted[drug_pos]),
        ])

In [138]:
# One row per (patient, drug) pair. Per-cluster values are stored as '|'-joined
# strings; 'delta' and 'kill' are the frequency-weighted values.
pred_columns = ['patient', 'drug_id', 'cluster', 'cluster_p', 'cluster_delta', 'delta', 'cluster_kill', 'kill']
single_drug_pred_df = pd.DataFrame(rows, columns=pred_columns)[pred_columns]
single_drug_pred_df.head()

Unnamed: 0,patient,drug_id,cluster,cluster_p,cluster_delta,delta,cluster_kill,kill
0,HN120,1001,D1|D2|G1|G2,0.31318681318681|0.17582417582418|0.3406593406...,0.92135645703131|0.80215122911062|0.7541206387...,0.77841853197482,34.555502207301|36.447143381563|37.22172227549...,34.397431405507
1,HN120,1003,D1|D2|G1|G2,0.31318681318681|0.17582417582418|0.3406593406...,1.0331982865939|1.1136227466817|1.545528432576...,1.1982570866689,32.82394722215|31.606727585492|25.51599915092|...,28.089133172559
2,HN120,1004,D1|D2|G1|G2,0.31318681318681|0.17582417582418|0.3406593406...,1.2259174280292|1.0336259642601|1.569763346432...,1.2791293553305,29.948714769591|32.817411024087|25.19805576509...,26.926190154155
3,HN120,1006,D1|D2|G1|G2,0.31318681318681|0.17582417582418|0.3406593406...,0.80494890047428|0.93639930344007|1.9116774819...,1.2413988913924,36.402237164711|34.320082354212|20.99753745300...,27.852111150813
4,HN120,1007,D1|D2|G1|G2,0.31318681318681|0.17582417582418|0.3406593406...,3.0211910963965|2.6922616957096|4.045923540528...,3.2526266582898,10.966865579688|13.398973100733|5.708576760594...,8.564892969846


In [139]:
# Attach the drug name for every row; an unknown drug ID raises KeyError.
single_drug_pred_df['drug_name'] = [
    drug_id_name_dict[drug_id] for drug_id in single_drug_pred_df['drug_id']
]

##### Save results

In [140]:
# Write the table next to the loaded predictions; the file name records the
# reference column, the model and whether the dosage shift was applied.
file_name_template = 'pred_drug_kill_{}_{}_shifted.csv' if dosage_shifted else 'pred_drug_kill_{}_{}.csv'
single_drug_pred_df.to_csv(out_dir + file_name_template.format(ref_type, model_name), index=False)