In [1]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(precision=2)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

sns.set_style('ticks')

%matplotlib inline
# Ignore all warnings from here on so cell outputs stay uncluttered.
import warnings
warnings.filterwarnings('ignore')

# Use 300 dpi both for displayed figures and for figures written with savefig.
import matplotlib as mpl
mpl.rcParams.update({'figure.dpi': 300, 'savefig.dpi': 300})

##### Read files and select drugs

In [2]:
# Column of the drug table used as the per-drug reference value that each
# prediction is compared against.
ref_type = 'log2_median_ic50_hn' # options: log2_median_ic50_3f_hn | log2_median_ic50_hn
# Model name; selects the '<model_name>_pred.csv' input file and is embedded
# in the name of the saved output file.
model_name = 'RWEN'

# For each patient, a cell cluster with a frequency below 5% is not considered.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
freq_cutoff = 0.05

# When True, 2 is subtracted from every prediction before scoring and the output
# file name gets a '_shifted' suffix. Original rationale: the GDSC experiment
# (Syto60) is less sensitive.
dosage_shifted = False

In [3]:
# Load the per-drug statistics table; the first CSV column becomes the index (drug IDs).
drug_info_df = pd.read_csv('../preprocessed_data/GDSC/hn_drug_stat.csv', index_col=0)
# Store drug IDs as strings so they can be looked up with string labels later on.
drug_info_df.index = drug_info_df.index.astype(str)

# Lookup table: drug ID (str) -> drug name.
drug_id_name_dict = drug_info_df['Drug Name'].to_dict()
print(drug_info_df.shape)

(81, 27)


In [4]:
# Drug IDs of interest (the "tested" drugs), kept as two concatenated groups.
tested_drug_list = [1032, 1007, 133, 201, 1010] + [182, 301, 302]

# Sanity check: display every tested drug that is missing from the drug table
# (an empty list means all of them are present). The table index holds string
# IDs, so compare as strings instead of re-casting the whole index to int for
# each element of the comprehension.
[d for d in tested_drug_list if str(d) not in drug_info_df.index]

[]

##### Read predicted IC50

In [5]:
# Sub-directory of ../result/HN_model/ from which predictions are read and
# into which the results of this notebook are written.
norm_type = 'patient_TPM'

In [6]:
# Directory for the selected normalization; predictions are read from here and
# the results are saved here as well.
out_dir = '../result/HN_model/{}/'.format(norm_type)

# Prediction table for the selected model; the first CSV column becomes the index.
cadrres_patient_df = pd.read_csv(out_dir + '{}_pred.csv'.format(model_name), index_col=0)

In [7]:
# Preview the first rows of the prediction table (rows: patients, columns: drug IDs).
cadrres_patient_df.head()

Unnamed: 0_level_0,1007,133,201,1010,182,301,302,1012
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HN120,-9.944659,-5.930867,-12.221588,0.43445,-4.659462,0.542657,-2.792519,-0.559059
HN137,-9.89665,-5.913446,-12.231972,-0.295715,-4.489943,-1.00528,-2.966661,-0.648973
HN148,-9.928479,-5.837932,-12.183435,0.321664,-3.592695,1.00194,-1.49138,-0.82031
HN159,-8.255876,-5.912368,-10.056112,0.803703,-3.580523,1.03916,-2.348676,-0.867052
HN160,-8.660189,-6.011195,-10.082678,1.718468,-2.087583,3.158456,1.940603,0.10816


In [8]:
# DataFrame.axes is [row index, column index]: patients on rows, drug IDs on columns.
patient_list, drug_list = cadrres_patient_df.axes
print(len(drug_list), len(patient_list))

# Re-select the prediction columns by drug_list (same columns, same order) ...
cadrres_patient_df = cadrres_patient_df[drug_list]
# ... and restrict/reorder the drug table to exactly those drugs, so that row i
# of drug_info_df corresponds to column i of cadrres_patient_df.
drug_info_df = drug_info_df.loc[drug_list]

8 6


In [9]:
if dosage_shifted:
    # Lower every prediction by 2. If the values are log2(IC50), as the
    # 'log2_...' reference column name suggests (TODO confirm), this is a
    # 4-fold lower concentration rather than an absolute offset of 4 uM.
    # NOTE: re-running this cell applies the shift again.
    cadrres_patient_df = cadrres_patient_df.sub(2)

##### List all pairs of patient and drug

In [10]:
# Difference between each prediction and the per-drug reference value.
# The reference is selected by drug ID and subtracted with label alignment, so
# the result no longer depends on the row order of drug_info_df (the previous
# raw-array subtraction would silently mis-pair drugs if the order differed).
pred_delta_df = cadrres_patient_df.sub(drug_info_df.loc[drug_list, ref_type], axis='columns')

# Logistic transform of the difference with base 2: 50 when prediction equals
# the reference, approaching 100 as the prediction rises above it.
# (pred_cv_df is presumably percent cell viability; confirm with the author.)
pred_cv_df = 100 / (1 + (np.power(2, -pred_delta_df)))
# Complement of the value above, on the same 0-100 scale.
pred_kill_df = 100 - pred_cv_df

pred_kill_df.head()

Unnamed: 0_level_0,1007,133,201,1010,182,301,302,1012
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HN120,52.625667,89.675468,92.858719,22.197506,76.475047,89.318748,98.294662,70.691052
HN137,51.79538,89.563133,92.906302,32.123949,74.295821,96.070874,98.485631,71.965477
HN148,52.345992,89.063629,92.681351,23.576926,60.813421,85.880138,95.900024,74.297931
HN159,25.626897,89.556142,74.349112,18.091801,60.612177,85.564395,97.694523,74.911745
HN160,31.319958,90.179706,74.698711,10.48738,35.347757,57.702864,68.426803,60.299148


In [11]:
# Reshape the patient x drug matrix into long format: one row per
# (patient, drug_id) pair with its predicted kill value.
single_drug_pred_df = (
    pred_kill_df
    .stack()
    .rename_axis(['patient', 'drug_id'])
    .reset_index(name='kill')
)

# Attach the drug name; an unknown drug ID raises KeyError.
single_drug_pred_df['drug_name'] = [
    drug_id_name_dict[drug_id] for drug_id in single_drug_pred_df['drug_id']
]
single_drug_pred_df.head()

Unnamed: 0,patient,drug_id,kill,drug_name
0,HN120,1007,52.625667,Docetaxel
1,HN120,133,89.675468,Doxorubicin
2,HN120,201,92.858719,Epothilone B
3,HN120,1010,22.197506,Gefitinib
4,HN120,182,76.475047,Obatoclax Mesylate


##### Save results

In [12]:
# Write the long-format table next to the predictions; the file name carries a
# '_shifted' suffix when the dosage shift was enabled.
single_drug_pred_df.to_csv(
    out_dir + (
        'pred_drug_kill_{}_{}_shifted.csv' if dosage_shifted
        else 'pred_drug_kill_{}_{}.csv'
    ).format(ref_type, model_name),
    index=False,
)