### Predicting drug response using the GDSC model

This example shows how to process and predict drug response using the GDSC model based on scRNA-seq data. 

1. Read normalized bulk gene expression file
2. Calculate kernel features
3. Predict drug response using the pre-trained GDSC model

In [67]:
# Standard-library and data-handling imports used throughout the notebook.
import sys, os, pickle
import pandas as pd
import importlib

# Plotting and statistics helpers.
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy import stats

# Render and save figures at 300 dpi.
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300
mpl.rc("savefig", dpi=300)

# Put the parent directory on the module search path. This must run before the
# cadrres import below (presumably the package lives in the parent directory —
# confirm against the repository layout).
scriptpath = '..'
sys.path.append(os.path.abspath(scriptpath))

# Project modules; pp, model and utility are called in later cells.
from cadrres import pp, model, evaluation, utility

### Read gene expression file and calculate kernel features

##### Indicate input files and output directory

In [75]:
# Select which normalized expression matrix to process.
#
# Every supported preprocessing variant is listed once in the table below,
# keyed by the name of its result directory. Change `exp_variant` to switch
# inputs instead of commenting/uncommenting path pairs.
exp_variant = 'patient_TPM'

# variant name -> input file name under ../preprocessed_data/HN_patient_specific/
exp_fname_by_variant = {
    'TMM': 'log2_fc_cluster_tmm.csv',
    'patient_TMM': 'log2_fc_patient_tmm.csv',
    'TMM_p95': 'log2_fc_cluster_tmm_p95.csv',
    'TPM': 'log2_fc_cluster_tpm.csv',
    'patient_TPM': 'log2_fc_patient_tpm.csv',
    'mat_norm': 'log2_fc_cluster_mat_norm.csv',
    'mat_norm_p95': 'log2_fc_cluster_mat_norm_p95.csv',
    'mat_norm_log2_p95': 'log2_fc_cluster_mat_norm_log2_p95.csv',
}

# Fail early, with the list of valid choices, if the selector is mistyped.
if exp_variant not in exp_fname_by_variant:
    raise KeyError('Unknown exp_variant {!r}; choose one of {}'.format(
        exp_variant, sorted(exp_fname_by_variant)))

# Input file read by the next cells, and the directory that results are written to.
cluster_norm_exp_fname = '../preprocessed_data/HN_patient_specific/' + exp_fname_by_variant[exp_variant]
output_dir = '../result/HN_model/{}/'.format(exp_variant)

In [76]:
# Name of the trained model to use. It is substituted into the pickle file
# names under ../result/HN_model/ and into the prediction output file names.
# The commented-out lines are alternative model names.
model_name = 'hn_drug_cw_dw10_100000'
# model_name = 'hn_drug_cw_dwsim10_100000'
# model_name = 'hn_drug_cw_dw1_100000'

In [77]:
# Load the selected expression matrix, using the first CSV column as the index,
# then transpose it. In the recorded output the transposed frame has gene
# symbols as rows and patient IDs as columns.
cluster_norm_exp_df = pd.read_csv(
    cluster_norm_exp_fname,
    index_col=0,
).transpose()

# Preview the first rows.
cluster_norm_exp_df.head()

patient_id,HN120,HN137,HN148,HN159,HN160,HN182
AAAS,0.418215,0.185904,0.475748,-0.62883,-0.261312,-0.61147
AAMP,0.110669,0.871637,0.431081,-1.045853,-0.686607,-0.651498
AARS,1.015166,0.026307,0.552121,-0.539379,-1.037365,-1.648649
AARS2,-0.018434,0.170624,-0.214077,-0.35667,0.12049,0.210369
AASDHPPT,-0.758945,0.3385,0.982756,-0.259402,-0.745159,-0.441521


##### Read GDSC gene expression

In [78]:
# Load the GDSC expression table (tab-separated, first column as the index).
# NOTE(review): the recorded output shows a column named '905954.1', which is
# how pandas renames a repeated header — confirm whether GDSC_exp.tsv contains
# a duplicated cell line column.
gdsc_log2_exp_df = pd.read_csv(
    '../data/GDSC/GDSC_exp.tsv',
    index_col=0,
    sep='\t',
)

# The helper returns two frames: the normalized expression and the mean
# expression (names taken from the variables they are bound to).
gdsc_norm_exp_df, gdsc_mean_exp_df = pp.gexp.normalize_log2_mean_fc(gdsc_log2_exp_df)

# Preview the normalized values.
gdsc_norm_exp_df.head()

Unnamed: 0_level_0,906826,687983,910927,1240138,1240139,906792,910688,1240135,1290812,907045,...,753584,907044,998184,908145,1659787,1298157,1480372,1298533,930299,905954.1
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,0.941884,0.858532,2.022198,1.107003,1.039128,0.384394,-3.404941,0.271467,-0.747093,-3.234189,...,0.415498,-3.453636,-3.651247,1.683084,0.242039,1.751489,1.732782,1.399116,-3.577806,0.462988
TNMD,0.108897,-0.077972,-0.212181,-0.037766,0.102051,0.033988,-0.027485,0.019062,-0.168814,0.434495,...,-0.056842,-0.110552,0.120717,-0.003137,-0.233059,-0.216412,0.024201,-0.33452,0.014779,-0.021403
DPM1,0.343465,1.771254,-0.155355,-0.152617,0.382752,-0.262101,0.228297,0.169844,0.263669,1.534067,...,0.450398,0.406864,0.275874,0.418742,0.382388,1.427654,0.52169,0.756662,-0.162186,0.752131
SCYL3,-0.632019,-0.179926,-0.290583,-0.183111,0.094688,0.02409,1.721355,-0.53178,-0.3987,1.31407,...,-0.549978,0.3772,0.101711,-0.388691,-0.299252,0.179037,-0.696423,0.196524,0.020015,-0.146319
C1orf112,-0.313637,0.038166,-0.457698,-0.135904,0.146055,0.120737,-0.682451,-0.42587,-0.341484,-0.12309,...,0.032515,0.253426,0.112265,-0.497331,0.120512,0.690414,0.55287,-0.622959,-0.464122,-0.258524


##### Read an essential gene list

In [79]:
# Read the essential gene list from disk.
ess_gene_list = utility.get_gene_list('../data/essential_genes.txt')

# Keep, in their original order, only the essential genes that are present in
# the index of the input expression matrix.
selected_gene_list = list(
    filter(lambda gene: gene in cluster_norm_exp_df.index, ess_gene_list)
)

# Show how many genes survive the filter.
len(selected_gene_list)

1724

##### Calculate kernel features

In [80]:
# Compute kernel features for the input samples against the GDSC expression
# profiles, restricted to the selected essential genes.
# NOTE(review): the recorded output reports 1543 common genes, fewer than the
# 1724 genes selected in the previous cell — presumably the function also
# intersects with the genes available in GDSC; confirm in pp.gexp.
cluster_kernel_df = pp.gexp.calculate_kernel_feature(cluster_norm_exp_df, gdsc_norm_exp_df, selected_gene_list)

Calculating kernel features based on 1543 common genes
(1724, 6) (17737, 1018)


In [81]:
# Save the kernel features next to the other results for this input variant.
# Create the output directory first so the write does not fail on a fresh
# checkout, and build the path with os.path.join so it does not depend on
# output_dir ending with a path separator.
os.makedirs(output_dir, exist_ok=True)
cluster_kernel_df.to_csv(os.path.join(output_dir, "cluster_kernel_df.csv"))

### Predicting drug response using pre-trained GDSC model

In [82]:
# Re-import the project modules so that edits made to their source files since
# the kernel started are picked up without restarting the kernel.
# The value of the last call (the reloaded utility module) is what the cell displays.
# NOTE(review): reloading `pp` may not reload its submodules (e.g. pp.gexp) —
# confirm, or consider the %autoreload extension instead.
importlib.reload(pp)
importlib.reload(model)
importlib.reload(evaluation)
importlib.reload(utility)

<module 'cadrres.utility' from '/mnt/volume1/Dropbox/Research/2019_drug_response_heterogeneity/CaDRReS_depository/cadrres/utility.py'>

In [83]:
# Load the trained model parameters and the training output for `model_name`.
# Context managers close each file handle as soon as it has been read
# (the bare open(...) calls left them open until garbage collection).
# SECURITY: pickle.load can execute arbitrary code; only load pickle files
# that were produced by a trusted run of this project.
with open('../result/HN_model/{}_param_dict.pickle'.format(model_name), 'rb') as param_file:
    model_dict = pickle.load(param_file)

with open('../result/HN_model/{}_output_dict.pickle'.format(model_name), 'rb') as output_file:
    model_training_output_dict = pickle.load(output_file)

Make a prediction

In [84]:
# Run the loaded model on the kernel features; the call returns two frames.
pred_df, P_df = model.predict_from_model(model_dict, cluster_kernel_df)

# Write both frames to the output directory, creating it if necessary.
# The file names are 'pred_gdsc_no_bias_<model_name>_model.csv' and
# 'P_gdsc_no_bias_<model_name>_model.csv' — the same names the previous
# chained .format(...).format(...) calls produced, built in a single step.
os.makedirs(output_dir, exist_ok=True)
pred_df.to_csv(os.path.join(output_dir, 'pred_gdsc_no_bias_{}_model.csv'.format(model_name)))
P_df.to_csv(os.path.join(output_dir, 'P_gdsc_no_bias_{}_model.csv'.format(model_name)))