### Predicting drug response using the GDSC model

This example shows how to process scRNA-seq-based gene expression data and predict drug response from it using the GDSC model.

1. Read normalized bulk gene expression file
2. Calculate kernel features
3. Predict drug response using the pre-trained GDSC model

In [1]:
import sys, os, pickle
import pandas as pd
import importlib

import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy import stats

import matplotlib as mpl
# Use 300 dpi both for figures rendered in the notebook and for saved figures.
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300

# Add the parent directory to the import path; the `cadrres` import that
# follows presumably relies on this (confirm against the repository layout).
scriptpath = '..'
sys.path.append(os.path.abspath(scriptpath))

from cadrres import pp, model, evaluation, utility

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Read gene expression file and calculate kernel features

##### Indicate input files and output directory

In [2]:
##########
# Input/output selection. Exactly one (cluster_norm_exp_fname, output_dir)
# pair should be active at a time; the commented-out pairs are alternatives
# that can be swapped in.
#   cluster_norm_exp_fname : CSV of normalized expression, read with
#                            pd.read_csv later in this notebook.
#   output_dir             : directory that receives the kernel-feature and
#                            prediction CSV files written later in this notebook.
# NOTE(review): the TPM / TMM / mat_norm / p95 labels below come from the file
# names only -- confirm what each preprocessing variant denotes.

cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_cluster_tpm.csv'
output_dir = '../result/HN_model/TPM/'

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_patient_tpm.csv'
# output_dir = '../result/HN_model/patient_TPM/'

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_tpm.csv'
# output_dir = '../result/HN_model/cell_TPM/'

##########

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_cluster_tmm.csv'
# output_dir = '../result/HN_model/TMM/'

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_patient_tmm.csv'
# output_dir = '../result/HN_model/patient_TMM/'

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_cluster_tmm_p95.csv'
# output_dir = '../result/HN_model/TMM_p95/'

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_cluster_mat_norm.csv'
# output_dir = '../result/HN_model/mat_norm/'

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_cluster_mat_norm_p95.csv'
# output_dir = '../result/HN_model/mat_norm_p95/'

# cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/log2_fc_cluster_mat_norm_log2_p95.csv'
# output_dir = '../result/HN_model/mat_norm_log2_p95/'

In [3]:
# Base name of the pre-trained model. It selects which
# '../result/HN_model/<model_name>_param_dict.pickle.backup' and
# '..._output_dict.pickle.backup' files are loaded, and it is embedded in the
# names of the prediction CSV files. The commented lines are alternative models.
model_name = 'hn_drug_cw_dw10_100000'
# model_name = 'hn_drug_cw_dwsim10_100000'
# model_name = 'hn_drug_cw_dw1_100000'

In [4]:
# Load the normalized expression table (first CSV column becomes the index)
# and transpose it; in the displayed result rows are genes and columns are
# clusters.
cluster_norm_exp_df = pd.read_csv(
    cluster_norm_exp_fname,
    index_col=0,
).transpose()
cluster_norm_exp_df.head()

cluster,A1,A2,B1,B2,C1,C2,D1,D2,E1,E2,...,G1,G2,H1,I1,I2,J1,J2,K1,L,M
AAAS,0.336828,0.733255,-0.100211,0.040937,-0.750019,0.441902,0.974277,0.645204,0.244784,-0.114589,...,0.019484,0.136722,0.92525,-0.500852,-0.559264,-0.233178,-2.153421,-0.672916,-1.381729,0.455242
AAMP,0.473194,0.80953,-0.538478,-0.886829,-1.262913,0.205791,0.197723,-0.08311,0.637455,-0.549449,...,-0.297165,-0.56551,0.710484,-0.937867,-0.961804,-0.854976,-1.168757,-1.749972,-1.927377,0.724136
AARS,-0.390326,-0.834464,-1.148807,-0.74156,-0.632305,0.353565,1.521685,1.146765,0.46482,-1.386324,...,0.001899,0.299582,0.860722,-0.698699,-1.308104,-1.788221,-1.315991,-0.890714,-0.525582,0.086332
AARS2,-0.147163,0.771953,-0.073675,0.496356,-0.296694,-0.514839,-0.117732,-0.63243,-0.53764,-1.055336,...,0.332132,-0.194644,-0.07895,-0.259561,0.01318,-0.182999,1.191579,-0.661154,-1.055336,0.897337
AASDHPPT,0.646827,0.538206,-0.89287,-0.365436,1.541076,0.76027,-0.951734,-0.889768,-1.156785,-0.562848,...,-0.609948,-0.631709,0.407265,-0.445766,-0.539681,-0.372573,-0.907907,-0.100082,-1.22845,0.632283


##### Read GDSC gene expression

In [5]:
# Read the tab-separated GDSC expression table, using its first column as the index.
gdsc_log2_exp_df = pd.read_csv(
    '../data/GDSC/GDSC_exp.tsv',
    sep='\t',
    index_col=0,
)

# The normalization helper returns two frames: the normalized expression and
# the mean expression (semantics defined in cadrres.pp.gexp).
gdsc_norm_exp_df, gdsc_mean_exp_df = pp.gexp.normalize_log2_mean_fc(
    gdsc_log2_exp_df
)
gdsc_norm_exp_df.head()

Unnamed: 0_level_0,906826,687983,910927,1240138,1240139,906792,910688,1240135,1290812,907045,...,753584,907044,998184,908145,1659787,1298157,1480372,1298533,930299,905954.1
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,0.941884,0.858532,2.022198,1.107003,1.039128,0.384394,-3.404941,0.271467,-0.747093,-3.234189,...,0.415498,-3.453636,-3.651247,1.683084,0.242039,1.751489,1.732782,1.399116,-3.577806,0.462988
TNMD,0.108897,-0.077972,-0.212181,-0.037766,0.102051,0.033988,-0.027485,0.019062,-0.168814,0.434495,...,-0.056842,-0.110552,0.120717,-0.003137,-0.233059,-0.216412,0.024201,-0.33452,0.014779,-0.021403
DPM1,0.343465,1.771254,-0.155355,-0.152617,0.382752,-0.262101,0.228297,0.169844,0.263669,1.534067,...,0.450398,0.406864,0.275874,0.418742,0.382388,1.427654,0.52169,0.756662,-0.162186,0.752131
SCYL3,-0.632019,-0.179926,-0.290583,-0.183111,0.094688,0.02409,1.721355,-0.53178,-0.3987,1.31407,...,-0.549978,0.3772,0.101711,-0.388691,-0.299252,0.179037,-0.696423,0.196524,0.020015,-0.146319
C1orf112,-0.313637,0.038166,-0.457698,-0.135904,0.146055,0.120737,-0.682451,-0.42587,-0.341484,-0.12309,...,0.032515,0.253426,0.112265,-0.497331,0.120512,0.690414,0.55287,-0.622959,-0.464122,-0.258524


##### Read an essential gene list

In [6]:
ess_gene_list = utility.get_gene_list('../data/essential_genes.txt')

# Keep only the essential genes that occur in the cluster expression index,
# preserving the order of the essential gene list.
measured_genes = cluster_norm_exp_df.index
selected_gene_list = list(
    filter(lambda gene: gene in measured_genes, ess_gene_list)
)
len(selected_gene_list)

1724

##### Calculate kernel features

In [7]:
# Compute kernel features for the clusters against the GDSC expression data,
# using the selected gene list. Per the helper's own log output, only genes
# common to the inputs are used.
cluster_kernel_df = pp.gexp.calculate_kernel_feature(
    cluster_norm_exp_df,
    gdsc_norm_exp_df,
    selected_gene_list,
)

Calculating kernel features based on 1543 common genes
(1724, 24) (17737, 1018)


In [8]:
# Create the output directory before the first write: DataFrame.to_csv does
# not create missing parent directories, so a fresh checkout would fail here.
os.makedirs(output_dir, exist_ok=True)

# Build the path with os.path.join (as the prediction outputs do) so the
# result is correct whether or not output_dir ends with a path separator.
cluster_kernel_df.to_csv(os.path.join(output_dir, "cluster_kernel_df.csv"))

### Predicting drug response using pre-trained GDSC model

In [9]:
# Re-import the local cadrres modules so that edits made to their source files
# are picked up without restarting the kernel. Reloading inside a loop (rather
# than ending the cell with a bare reload call) keeps the cell from echoing a
# module repr that contains an absolute, machine-specific path.
for cadrres_module in (pp, model, evaluation, utility):
    importlib.reload(cadrres_module)

<module 'cadrres.utility' from '/mnt/volume1/Dropbox/Research/2019_drug_response_heterogeneity/CaDRReS_depository/cadrres/utility.py'>

In [10]:
# Load the pre-trained model parameters and the training output for the
# selected model. Context managers close the file handles deterministically
# instead of leaving them to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../result/HN_model/{}_param_dict.pickle.backup'.format(model_name), 'rb') as param_file:
    model_dict = pickle.load(param_file)

with open('../result/HN_model/{}_output_dict.pickle.backup'.format(model_name), 'rb') as output_file:
    model_training_output_dict = pickle.load(output_file)

Make a prediction

In [11]:
pred_df, P_df = model.predict_from_model(model_dict, cluster_kernel_df)

# Both output files carry the same tag, 'gdsc_no_bias_<model_name>', in their names.
output_tag = 'gdsc_no_bias_{}'.format(model_name)
pred_path = os.path.join(output_dir, 'pred_{}_model2.csv'.format(output_tag))
P_path = os.path.join(output_dir, 'P_{}_model2.csv'.format(output_tag))

pred_df.to_csv(pred_path)
P_df.to_csv(P_path)

In [12]:
# Display the prediction table: one row per cluster, one column per drug ID.
pred_df

Drug ID,1001,1003,1004,1006,1007,1010,1012,1014,1015,1016,...,299,301,302,303,305,306,308,328,331,346
A1,10.398524,-4.984888,-6.042325,0.982309,-6.936394,-0.012527,0.738146,2.654245,3.533897,-1.852266,...,0.56549,2.47667,0.206201,3.186974,3.689912,2.494804,-0.697373,-1.430192,1.614384,-3.958066
A2,9.961491,-6.814161,-7.065584,-0.291026,-7.484753,0.024981,0.003014,2.133796,2.968269,-2.287011,...,0.527419,2.163049,0.405491,2.869054,3.224147,2.27466,-1.156717,-2.388423,1.395083,-4.555484
B1,11.021523,-4.749574,-5.249366,1.578844,-6.232563,1.137422,1.965151,3.35653,4.349608,-1.453384,...,1.288136,2.588025,0.801405,3.884401,4.243734,2.818478,1.648026,0.313913,1.946526,-1.245332
B2,10.646173,-5.557965,-6.364599,0.922382,-7.028233,0.893498,1.615433,3.193993,4.282648,-1.901858,...,0.927751,2.224915,0.364318,3.613059,3.819349,2.404219,1.087065,-0.610679,1.703348,-1.995978
C1,10.631363,-6.009131,-4.887686,0.626853,-6.36942,0.41776,0.994489,2.138591,2.940815,-1.298257,...,1.860736,2.955682,1.631775,3.751298,4.231162,3.029577,0.037301,-1.023775,2.103131,-2.568724
C2,10.802532,-4.784965,-4.889357,1.072813,-6.403945,0.107169,1.301386,1.485034,2.833325,-0.87917,...,2.746068,3.1922,2.389507,4.055828,4.336418,3.140493,0.473114,-0.498388,2.831969,-2.106202
D1,10.838702,-6.600623,-5.922908,-0.555175,-6.742192,0.846281,0.529191,1.273207,2.231173,-1.557154,...,1.390929,3.063579,1.598017,3.242089,3.852827,2.945125,-1.235541,-2.036298,1.753422,-3.970967
D2,10.734875,-6.516898,-6.146479,-0.432917,-7.109557,0.680811,0.601391,1.549414,2.504649,-1.556721,...,0.934664,2.82717,0.951483,3.079927,3.731007,2.690864,-1.728753,-2.726807,1.447842,-4.794363
E1,10.570892,-6.884317,-6.937818,-0.966837,-7.755313,0.385784,0.409016,0.796756,2.113789,-1.815461,...,3.191395,3.512492,2.959777,3.920862,4.100335,3.211842,-0.560287,-1.230228,2.807909,-2.703222
E2,11.363586,-5.502124,-5.455824,0.253171,-7.403641,0.854778,1.797459,1.257604,2.717944,-1.100992,...,5.772834,4.938277,4.751861,5.533648,5.620082,4.31401,1.184698,1.215393,4.24097,0.358671
