### Predicting drug response using the GDSC model

This example shows how to process scRNA-seq data and predict drug response using the pre-trained GDSC model.

1. Read the normalized gene expression file (log2 fold-change generated in B01) of the cell clusters
2. Calculate kernel features
3. Predict drug response using the pre-trained GDSC model

In [46]:
import sys, os, pickle
import pandas as pd
import importlib

import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy import stats

import matplotlib as mpl
# Render and save figures at 300 dpi.
mpl.rcParams.update({'figure.dpi': 300, 'savefig.dpi': 300})

# Put the parent directory on the import path so the `cadrres` package
# imported right after this can be found.
scriptpath = '..'
sys.path.append(os.path.abspath(scriptpath))

from cadrres import pp, model, evaluation, utility

### Read gene expression file and calculate kernel features

##### Indicate input files and output directory

In [47]:
# Available normalization variants of the cluster-level log2 fold-change matrix.
# Key: name of the output sub-directory under ../result/HN_model/
# Value: input file name under ../preprocessed_data/HN_patient_specific/
NORM_VARIANTS = {
    'TMM': 'log2_fc_cluster_tmm.csv',
    'TMM_p95': 'log2_fc_cluster_tmm_p95.csv',
    'mat_norm': 'log2_fc_cluster_mat_norm.csv',
    'mat_norm_p95': 'log2_fc_cluster_mat_norm_p95.csv',
    'mat_norm_log2_p95': 'log2_fc_cluster_mat_norm_log2_p95.csv',
}

# Change this single value to switch variant; the input file and the output
# directory are derived together so they cannot get out of sync.
norm_variant = 'TMM'

cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/' + NORM_VARIANTS[norm_variant]
output_dir = '../result/HN_model/{}/'.format(norm_variant)

In [48]:
# Identifier of the pre-trained model. It is substituted into the names of the
# pickle files loaded from ../result/HN_model/ and into the names of the
# prediction files written to the output directory.
model_name = 'hn_drug_cw_dw10_100000'

In [49]:
# Load the cluster-level expression table (first column is the row index) and
# transpose it, so the rows and columns of the file are swapped.
cluster_norm_exp_df = pd.read_csv(
    cluster_norm_exp_fname,
    index_col=0,
).transpose()
cluster_norm_exp_df.head()

cluster,A1,A2,B1,B2,C1,C2,D1,D2,E1,E2,...,G1,G2,H1,I1,I2,J1,J2,K1,L,M
AAAS,0.436395,0.957639,-0.465562,-0.156186,-0.789281,0.512087,0.956244,0.69738,0.054413,-0.419837,...,0.13916,0.159178,0.961019,-0.462397,-0.59487,-0.450762,-2.402877,-0.638281,-0.798495,0.653658
AAMP,0.62522,1.014652,-0.947105,-1.045014,-1.290047,0.294273,0.195267,-0.041096,0.449954,-0.86207,...,-0.203133,-0.564619,0.765725,-0.874619,-1.027867,-1.081713,-1.453521,-1.654376,-1.481006,0.914446
AARS,-0.263372,-0.618539,-1.354482,-0.934238,-0.757683,0.385947,1.486916,1.15623,0.276002,-1.721281,...,0.100568,0.279268,0.902204,-0.649027,-1.349182,-2.038202,-1.630981,-0.953501,0.114207,0.279083
AARS2,-0.065423,0.980821,-0.277443,0.190518,-0.28835,-0.492771,-0.15127,-0.643326,-0.630778,-1.104033,...,0.418013,-0.1461,-0.031738,-0.229396,-0.036583,-0.308567,1.13805,-0.713902,-1.104033,1.141099
AASDHPPT,0.807717,0.782389,-1.372569,-0.719371,1.411915,0.863444,-0.919821,-0.858347,-1.280509,-0.839123,...,-0.477491,-0.512637,0.474007,-0.354342,-0.54883,-0.613024,-1.147354,-0.147671,-1.50949,0.857701


##### Read GDSC gene expression

In [50]:
# Load the tab-separated GDSC expression table (first column is the row index).
gdsc_log2_exp_df = pd.read_csv(
    '../data/GDSC/GDSC_exp.tsv',
    sep='\t',
    index_col=0,
)

# The helper returns two frames: the normalized expression and the mean expression.
(gdsc_norm_exp_df,
 gdsc_mean_exp_df) = pp.gexp.normalize_log2_mean_fc(gdsc_log2_exp_df)
gdsc_norm_exp_df.head()

Unnamed: 0_level_0,906826,687983,910927,1240138,1240139,906792,910688,1240135,1290812,907045,...,753584,907044,998184,908145,1659787,1298157,1480372,1298533,930299,905954.1
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,0.941884,0.858532,2.022198,1.107003,1.039128,0.384394,-3.404941,0.271467,-0.747093,-3.234189,...,0.415498,-3.453636,-3.651247,1.683084,0.242039,1.751489,1.732782,1.399116,-3.577806,0.462988
TNMD,0.108897,-0.077972,-0.212181,-0.037766,0.102051,0.033988,-0.027485,0.019062,-0.168814,0.434495,...,-0.056842,-0.110552,0.120717,-0.003137,-0.233059,-0.216412,0.024201,-0.33452,0.014779,-0.021403
DPM1,0.343465,1.771254,-0.155355,-0.152617,0.382752,-0.262101,0.228297,0.169844,0.263669,1.534067,...,0.450398,0.406864,0.275874,0.418742,0.382388,1.427654,0.52169,0.756662,-0.162186,0.752131
SCYL3,-0.632019,-0.179926,-0.290583,-0.183111,0.094688,0.02409,1.721355,-0.53178,-0.3987,1.31407,...,-0.549978,0.3772,0.101711,-0.388691,-0.299252,0.179037,-0.696423,0.196524,0.020015,-0.146319
C1orf112,-0.313637,0.038166,-0.457698,-0.135904,0.146055,0.120737,-0.682451,-0.42587,-0.341484,-0.12309,...,0.032515,0.253426,0.112265,-0.497331,0.120512,0.690414,0.55287,-0.622959,-0.464122,-0.258524


##### Read an essential gene list

In [51]:
ess_gene_list = utility.get_gene_list('../data/essential_genes.txt')

# Keep only the essential genes that are present in the cluster expression
# index, preserving the order of the essential gene list.
cluster_gene_index = cluster_norm_exp_df.index
selected_gene_list = [gene for gene in ess_gene_list if gene in cluster_gene_index]
len(selected_gene_list)

1724

##### Calculate kernel features

In [52]:
# Kernel features of the cluster profiles against the GDSC profiles, using the
# selected genes. The helper prints how many genes the computation is based on.
cluster_kernel_df = pp.gexp.calculate_kernel_feature(
    cluster_norm_exp_df,
    gdsc_norm_exp_df,
    selected_gene_list,
)

Calculating kernel features based on 1543 common genes
(1724, 24) (17737, 1018)


In [53]:
# Create the output directory if it is missing so the export does not fail on a
# fresh checkout, and build the path with os.path.join so it is correct whether
# or not output_dir ends with a path separator.
os.makedirs(output_dir, exist_ok=True)
cluster_kernel_df.to_csv(os.path.join(output_dir, "cluster_kernel_df.csv"))

### Predicting drug response using pre-trained GDSC model

In [54]:
# Pick up any edits made to the cadrres modules since they were imported.
# A loop is used instead of a trailing expression so the cell does not echo a
# module repr, which contains the absolute local path of the package.
for cadrres_module in (pp, model, evaluation, utility):
    importlib.reload(cadrres_module)

<module 'cadrres.utility' from 'C:\\Users\\suphavilaic\\Dropbox\\Research\\2019_drug_response_heterogeneity\\CaDRReS_depository\\cadrres\\utility.py'>

In [55]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# model files that come from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open('../result/HN_model/{}_param_dict.pickle'.format(model_name), 'rb') as param_file:
    model_dict = pickle.load(param_file)

with open('../result/HN_model/{}_output_dict.pickle'.format(model_name), 'rb') as output_file:
    model_training_output_dict = pickle.load(output_file)

Make a prediction

In [56]:
# Run the loaded model on the cluster kernel features; it returns two frames.
pred_df, P_df = model.predict_from_model(model_dict, cluster_kernel_df)

# Both output files carry the same "gdsc_no_bias_<model_name>" tag in their names.
output_tag = 'gdsc_no_bias_{}'.format(model_name)
pred_path = os.path.join(output_dir, 'pred_{}_model.csv'.format(output_tag))
P_path = os.path.join(output_dir, 'P_{}_model.csv'.format(output_tag))

pred_df.to_csv(pred_path)
P_df.to_csv(P_path)