# Cross-validation for indication-specific mode

3. CaDRReS + no bp + ciu + du (du = sample weight based on cancer type)

## Read gene expression file and calculate kernel features


In [1]:
# Imports and global display/plot configuration for the whole notebook.
import sys, os, pickle
import pandas as pd
import numpy as np
np.set_printoptions(precision=2)  # print numpy arrays with two decimal places
from collections import Counter
import importlib

import matplotlib as mpl
# Render inline figures and saved figures at 300 dpi.
mpl.rcParams['figure.dpi']= 300
mpl.rc("savefig", dpi=300)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Append the parent directory to sys.path before importing `cadrres` below
# (presumably the package lives one level above this notebook - TODO confirm).
scriptpath = '..'
sys.path.append(os.path.abspath(scriptpath))

from cadrres import pp, model, evaluation, utility

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Display the TensorFlow version for the record; `tf` itself is not referenced
# by any other cell visible in this notebook.
import tensorflow as tf
tf.__version__

'1.14.0'

### Read cell line info

In [3]:
def _read_csv_with_str_index(path):
    """Load a CSV using its first column as the index, with index labels cast to str."""
    frame = pd.read_csv(path, index_col=0)
    frame.index = frame.index.astype(str)
    return frame


# Per-cell-line annotation table (provides the 'TCGA_CLASS' column used below).
gdsc_sample_df = _read_csv_with_str_index('../data/GDSC/GDSC_tissue_info.csv')

# Observed drug-response table; its row labels define the initial sample list.
gdsc_obs_df = _read_csv_with_str_index('../data/GDSC/gdsc_all_abs_ic50_bayesian_sigmoid_only9dosages.csv')

gdsc_sample_list = gdsc_obs_df.index.astype(str)

In [4]:
# Number of cell lines per TCGA class, largest first, ignoring the
# 'UNCLASSIFIED' bucket.
min_samples_per_indication = 35
samples_per_class = gdsc_sample_df.groupby(['TCGA_CLASS']).size()
indication_count_df = samples_per_class.sort_values(ascending=False).drop('UNCLASSIFIED')

# Keep the classes that have at least `min_samples_per_indication` cell lines.
is_large_enough = indication_count_df >= min_samples_per_indication
selected_indications = indication_count_df[is_large_enough].index

### Read drug info

In [5]:
# Per-drug statistics table, indexed by drug ID cast to str (the same
# treatment as the sample IDs) so that label lookups use one consistent type.
gdsc_drug_df = pd.read_csv('../preprocessed_data/GDSC/drug_stat.csv', index_col=0)
gdsc_drug_df.index = gdsc_drug_df.index.astype(str)

# Drug IDs; used further down to select the columns of the response matrix.
gdsc_drug_list = gdsc_drug_df.index
gdsc_drug_df.shape

(226, 27)

### Read gene expression and normalization

In [6]:
# Tab-separated expression matrix; the first column becomes the index and the
# column labels are matched against sample IDs in the next step.
# NOTE(review): presumably genes x cell lines with log2-scaled values, going by
# the variable name - confirm against the data file.
gdsc_log2_exp_df = pd.read_csv('../data/GDSC/GDSC_exp.tsv', sep='\t', index_col=0)

## Samples with both expression and response data

In [7]:
# Keep only the samples that also appear as a column of the expression matrix,
# preserving their order in the response table.
samples_with_expression = set(gdsc_log2_exp_df.columns)
gdsc_sample_list = np.array(
    [sample_id for sample_id in gdsc_sample_list if sample_id in samples_with_expression]
)
len(gdsc_sample_list)

985

In [8]:
# Restrict (and reorder) the annotation table to the samples kept in
# gdsc_sample_list. This overwrites gdsc_sample_df, so the unfiltered table is
# only recoverable by re-running the cell that reads it.
gdsc_sample_df = gdsc_sample_df.loc[gdsc_sample_list]

In [9]:
# Map each selected indication to the IDs of its cell lines ...
gdsc_sample_dict = {
    indication: gdsc_sample_df[gdsc_sample_df['TCGA_CLASS'] == indication].index
    for indication in selected_indications
}
# ... and to the corresponding rows of the response matrix.
gdsc_obs_df_dict = {
    indication: gdsc_obs_df.loc[samples]
    for indication, samples in gdsc_sample_dict.items()
}

# Report how many cell lines each indication has.
for i in selected_indications:
    print (i, gdsc_sample_dict[i].shape)

SCLC (60,)
LUAD (63,)
SKCM (52,)
BRCA (49,)
COREAD (48,)
HNSC (42,)
GBM (35,)
ESCA (35,)


In [10]:
# Align the response matrix: rows follow gdsc_sample_list (samples that have
# expression data) and columns follow gdsc_drug_list (drugs in the drug table).
gdsc_obs_df = gdsc_obs_df.loc[gdsc_sample_list, gdsc_drug_list]
# gdsc_drug_list is gdsc_drug_df's own index, so this selection keeps the drug
# table in the same order as the response-matrix columns.
gdsc_drug_df = gdsc_drug_df.loc[gdsc_drug_list]

## Load precomputed kernel features

Based on all 985 GDSC samples with gene expression profiles

In [11]:
# Load the kernel feature matrix from a precomputed CSV (nothing is calculated here).
# NOTE(review): kernel_feature_df is not referenced again in the visible cells;
# the training loop reads X_train / X_test from the cv_data files instead.
kernel_feature_df = pd.read_csv('../preprocessed_data/GDSC/kernel_features.csv', index_col=0)
kernel_feature_df.head()

Unnamed: 0,1240121,1240122,1240123,1240124,1240125,1240127,1240128,1240129,1240130,1240131,...,949175,949176,949177,949178,949179,971773,971774,971777,998184,998189
1240121,1.0,0.200762,-0.097257,0.079455,-0.080807,-0.107964,-0.058302,0.079915,0.063199,0.035671,...,-0.129215,-0.179337,-0.0953,-0.112817,-0.186527,-0.088457,-0.143004,-0.189747,-0.25959,-0.054617
1240122,0.200762,1.0,0.193214,-0.049567,-0.180749,0.187601,0.042315,0.17116,-0.049354,-0.061332,...,-0.008915,0.042224,0.080204,0.052032,-0.091817,0.007112,0.046598,0.099549,-0.010853,-0.037156
1240123,-0.097257,0.193214,1.0,0.165331,-0.07925,0.187299,-0.092017,-0.022646,0.057621,-0.160944,...,0.082774,0.044117,0.080209,0.092592,0.025462,-0.139113,0.126919,0.068192,0.09884,0.289321
1240124,0.079455,-0.049567,0.165331,1.0,0.213386,-0.134121,-0.063749,-0.069065,0.121054,0.002488,...,-0.271613,-0.177998,-0.222713,-0.215761,-0.092428,-0.068879,-0.154265,-0.092515,-0.003545,0.374791
1240125,-0.080807,-0.180749,-0.07925,0.213386,1.0,-0.048241,-0.110779,0.039282,0.270578,0.084228,...,-0.176282,-0.160407,-0.128728,-0.236164,0.028765,0.0336,-0.054744,0.132978,-0.040489,0.017542


## Cross validation (5-fold)

## Train and predict the validation set

- train_model for 'cadrres', 'cadrres-wo-sample-bias'
- train_model_logistic_weight (with d_u and c_iu; no sample bias implementation)
    - cadrres-wo-sample-bias-weight

### Select indication specific drugs

In [12]:
# Thresholds a drug must meet, within one indication, to be kept for that indication.
min_num = 20  # minimum number of the indication's cell lines with a non-missing response
min_sensitive_percent = 0.3  # minimum fraction (0-1, despite the name) of the indication's cell lines that are sensitive

In [13]:
# One-row array holding each drug's 'log2_max_conc', in drug-table order; it is
# compared against every sample row of the response matrix.
log2_max_conc_row = gdsc_drug_df['log2_max_conc'].values.reshape(1, -1)

# True where the observed response is at or below the drug's log2 max concentration.
sensitive_df = gdsc_obs_df <= log2_max_conc_row
sensitive_df.head()

Drug ID,1,1001,1003,1004,1005,1006,1007,1008,1009,1010,...,64,71,83,86,87,88,89,9,91,94
1240121,False,True,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1240122,False,True,True,True,False,True,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1240123,False,False,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1240124,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1240125,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
indication_drug_dict = {}
indication_sample_dict = {}

for i in selected_indications[0:6]:

    # Cell lines of this indication; stored as integers for later lookups.
    i_sample_list = gdsc_sample_df[gdsc_sample_df['TCGA_CLASS']==i].index
    indication_sample_dict[i] = list(i_sample_list.astype(int))

    # Required number of sensitive cell lines, rounded up.
    min_num_sensitive = int(np.ceil(min_sensitive_percent * len(i_sample_list)))

    # Per drug: number of cell lines with a measurement / flagged as sensitive.
    n_measured = gdsc_obs_df.loc[i_sample_list].notnull().sum()
    n_sensitive = sensitive_df.loc[i_sample_list].sum()

    # Keep the drugs that satisfy both thresholds.
    passes_thresholds = (n_measured >= min_num) & (n_sensitive >= min_num_sensitive)
    indication_drug_dict[i] = list(passes_thresholds[passes_thresholds].index)

    print (i, min_num_sensitive, len(indication_drug_dict[i]))

SCLC 18 85
LUAD 19 73
SKCM 16 85
BRCA 15 69
COREAD 15 79
HNSC 13 81


### Train indication-specific models
'cadrres-wo-sample-bias-weight-indication'

- Indication specific weight: 1-10
- Indication specific drugs

In [15]:
# Run configuration for the training loop.
# output_dir needs its trailing slash: file paths are built by string concatenation.
output_dir = '../result/cv_pred/'
# Selects which cadrres training function the loop calls.
model_spec_name = 'cadrres-wo-sample-bias-weight'
# Weight multiplier applied to the cell lines of the indication being trained
# (1 leaves all weights equal); the original note lists the values 1, 5 and 10.
indication_specific_degree = 1 # 1 5 10

In [16]:
# 5-fold cross-validation: for every fold and every indication selected above,
# train one model restricted to the indication-specific drugs and pickle both
# the fitted parameters and the training/prediction outputs.

# Make sure the output directory exists before any (long) training run starts.
os.makedirs(output_dir, exist_ok=True)

for k in range(1, 5+1):

    print ("Fold #", k)

    # The fold data does not depend on the indication, so read it once per fold.
    X_train = pd.read_csv('../preprocessed_data/GDSC/cv_data/{}_5f_{}.csv'.format('X_train', k), index_col=0)
    X_test = pd.read_csv('../preprocessed_data/GDSC/cv_data/{}_5f_{}.csv'.format('X_test', k), index_col=0)
    Y_train_all = pd.read_csv('../preprocessed_data/GDSC/cv_data/{}_5f_{}.csv'.format('Y_train', k), index_col=0)
    Y_test_all = pd.read_csv('../preprocessed_data/GDSC/cv_data/{}_5f_{}.csv'.format('Y_test', k), index_col=0)

    # Must iterate over the same indications for which indication_drug_dict /
    # indication_sample_dict were filled in.
    for i in selected_indications[0:6]:

        print (i)

        # select drugs
        Y_test = Y_test_all[indication_drug_dict[i]]
        Y_train = Y_train_all[indication_drug_dict[i]]

        #########################

        ##### Prepare x0 for calculating logistic sample weight (o_i) #####

        sample_weights_logistic_x0_df = model.get_sample_weights_logistic_x0(gdsc_drug_df, 'log2_max_conc', X_train.index)

        ##### Prepare indication weight #####
        # Every (sample, drug) entry starts at 1; rows belonging to the current
        # indication are multiplied by indication_specific_degree (a degree of 1
        # therefore leaves all weights equal).

        indication_weight_df = pd.DataFrame(np.ones(Y_train.shape), index=Y_train.index, columns=Y_train.columns)
        i_sample_list = [cl for cl in indication_sample_dict[i] if cl in X_train.index]
        indication_weight_df.loc[i_sample_list, :] = indication_weight_df.loc[i_sample_list, :] * indication_specific_degree

        #########################

        # The values 10, 0.0, 100000, 0.01 are forwarded positionally.
        # NOTE(review): presumably latent dimensions, regularisation strength,
        # maximum iterations and learning rate - confirm against cadrres.model.
        if model_spec_name in ['cadrres', 'cadrres-wo-sample-bias']:
            cadrres_model_dict, cadrres_output_dict = model.train_model(Y_train, X_train, Y_test, X_test, 10, 0.0, 100000, 0.01, model_spec_name=model_spec_name, save_interval=5000, output_dir=output_dir)
        elif model_spec_name in ['cadrres-wo-sample-bias-weight']:
            cadrres_model_dict, cadrres_output_dict = model.train_model_logistic_weight(Y_train, X_train, Y_test, X_test, sample_weights_logistic_x0_df, indication_weight_df, 10, 0.0, 100000, 0.01, model_spec_name=model_spec_name, save_interval=5000, output_dir=output_dir)
        else:
            # Without this guard an unknown name would either raise a NameError
            # below or silently re-save the model from the previous iteration.
            raise ValueError('Unsupported model_spec_name: {!r}'.format(model_spec_name))

        #########################

        ##### Save model and data #####
        # Context managers guarantee the files are flushed and closed.
        param_path = output_dir + '{}_{}_{}_5f_{}_param_dict.pickle'.format(model_spec_name, i, indication_specific_degree, k)
        with open(param_path, 'wb') as param_file:
            pickle.dump(cadrres_model_dict, param_file)

        output_path = output_dir + '{}_{}_{}_5f_{}_output_dict.pickle'.format(model_spec_name, i, indication_specific_degree, k)
        with open(output_path, 'wb') as output_file:
            pickle.dump(cadrres_output_dict, output_file)


Fold # 1
SCLC
Getting data ...
Initializing the model ...



Train: 56681 out of 66725
Starting model training ...

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


TF session started ...

Starting 1st iteration ...
MSE train at step 0: 24.741 (0.01m)
MSE train at step 5000: 7.613 (0.38m)
MSE train at step 10000: 6.429 (0.76m)
MSE train at step 15000: 6.066 (1.14m)
MSE train at step 20000: 5.838 (1.52m)
MSE train at step 25000: 5.665 (1.90m)
MSE train at step 30000: 5.501 (2.27m)
MSE train at step 35000: 5.349 (2.65m)
MSE train at step 40000: 5.228 (3.03m)
MSE train at step 45000: 5.113 (3.40m)
MSE train at step 50000: 4.990 (3.78m)
MSE train at step 55000: 4.874 (4.15m)
MSE train at step 60000: 4.776 (4.52m)
MSE train at step 65000: 4.681 (4.90m)
MSE train at step 70000: 4.592 (5.27m)
MSE train at step 75000: 4.483 (5.65m)
MSE train at step 80000: 4.402 (6.02m)
MSE train at step 85000: 4.326 (6.39m)
MSE train at step 90000: 4.256 (6.76m)


MSE train at step 30000: 5.364 (2.25m)
MSE train at step 35000: 5.228 (2.61m)
MSE train at step 40000: 5.107 (2.97m)
MSE train at step 45000: 4.991 (3.34m)
MSE train at step 50000: 4.883 (3.72m)
MSE train at step 55000: 4.785 (4.09m)
MSE train at step 60000: 4.693 (4.46m)
MSE train at step 65000: 4.602 (4.85m)
MSE train at step 70000: 4.521 (5.23m)
MSE train at step 75000: 4.440 (5.60m)
MSE train at step 80000: 4.361 (5.98m)
MSE train at step 85000: 4.288 (6.35m)
MSE train at step 90000: 4.215 (6.73m)
MSE train at step 95000: 4.145 (7.10m)
Saving model parameters and predictions ...
DONE
LUAD
Getting data ...
Initializing the model ...
Train: 51572 out of 57378
Starting model training ...
TF session started ...
Starting 1st iteration ...
MSE train at step 0: 22.921 (0.00m)
MSE train at step 5000: 7.035 (0.37m)
MSE train at step 10000: 6.000 (0.74m)
MSE train at step 15000: 5.662 (1.10m)
MSE train at step 20000: 5.457 (1.47m)
MSE train at step 25000: 5.296 (1.83m)
MSE train at step 3000

MSE train at step 80000: 4.142 (5.86m)
MSE train at step 85000: 4.088 (6.22m)
MSE train at step 90000: 4.033 (6.58m)
MSE train at step 95000: 3.981 (6.94m)
Saving model parameters and predictions ...
DONE
BRCA
Getting data ...
Initializing the model ...
Train: 49144 out of 54372
Starting model training ...
TF session started ...
Starting 1st iteration ...
MSE train at step 0: 23.502 (0.00m)
MSE train at step 5000: 7.006 (0.37m)
MSE train at step 10000: 6.086 (0.74m)
MSE train at step 15000: 5.754 (1.11m)
MSE train at step 20000: 5.551 (1.47m)
MSE train at step 25000: 5.372 (1.84m)
MSE train at step 30000: 5.226 (2.21m)
MSE train at step 35000: 5.090 (2.57m)
MSE train at step 40000: 4.959 (2.95m)
MSE train at step 45000: 4.850 (3.31m)
MSE train at step 50000: 4.748 (3.67m)
MSE train at step 55000: 4.657 (4.04m)
MSE train at step 60000: 4.567 (4.41m)
MSE train at step 65000: 4.476 (4.78m)
MSE train at step 70000: 4.390 (5.14m)
MSE train at step 75000: 4.315 (5.51m)
MSE train at step 8000

Starting 1st iteration ...
MSE train at step 0: 22.972 (0.00m)
MSE train at step 5000: 7.268 (0.38m)
MSE train at step 10000: 6.128 (0.75m)
MSE train at step 15000: 5.767 (1.13m)
MSE train at step 20000: 5.540 (1.50m)
MSE train at step 25000: 5.373 (1.89m)
MSE train at step 30000: 5.224 (2.27m)
MSE train at step 35000: 5.098 (2.64m)
MSE train at step 40000: 4.983 (3.00m)
MSE train at step 45000: 4.886 (3.37m)
MSE train at step 50000: 4.782 (3.74m)
MSE train at step 55000: 4.690 (4.11m)
MSE train at step 60000: 4.603 (4.48m)
MSE train at step 65000: 4.530 (4.86m)
MSE train at step 70000: 4.464 (5.24m)
MSE train at step 75000: 4.400 (5.61m)
MSE train at step 80000: 4.334 (5.98m)
MSE train at step 85000: 4.280 (6.34m)
MSE train at step 90000: 4.227 (6.72m)
MSE train at step 95000: 4.178 (7.09m)
Saving model parameters and predictions ...
DONE
Fold # 5
SCLC
Getting data ...
Initializing the model ...
Train: 56927 out of 67235
Starting model training ...
TF session started ...
Starting 1st 

In [None]:
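# Example: reload the fold-1 results for indication `i`, using the same file name pattern as the training loop.
# cadrres_model_dict = pickle.load(open(output_dir + '{}_{}_{}_5f_{}_param_dict.pickle'.format(model_spec_name, i, indication_specific_degree, 1), 'rb'))
# cadrres_output_dict = pickle.load(open(output_dir + '{}_{}_{}_5f_{}_output_dict.pickle'.format(model_spec_name, i, indication_specific_degree, 1), 'rb'))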

In [None]:
# y = cadrres_output_dict['pred_test_df'].values.flatten()
# x = cadrres_output_dict['obs_test_df'].values.flatten()
# plt.scatter(x[~np.isnan(x)], y[~np.isnan(x)])