In [1]:
import sys, os, pickle
import pandas as pd
import numpy as np
np.set_printoptions(precision=2)
from collections import Counter
import importlib

import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300
mpl.rc("savefig", dpi=300)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

scriptpath = '..'
sys.path.append(os.path.abspath(scriptpath))

from cadrres import pp, model, evaluation, utility

In [2]:
# Analysis settings: where the cross-validation outputs live, how many folds
# to read, and which indication (TCGA class) to evaluate.
output_dir = '../result/cv_pred/'
n_fold = 1
indication = 'HNSC'

# One model spec name per model variant, suffixed with the indication.
model_spec_name_list = [
    "{}_{}".format(model_variant, indication)
    for model_variant in ['cadrres', 'cadrres-wo-sample-bias', 'cadrres-wo-sample-bias-weight']
]


In [3]:
# Per-drug statistics table; the first CSV column becomes the index.
drug_stat_path = '../preprocessed_data/GDSC/hn_drug_stat.csv'
gdsc_drug_df = pd.read_csv(drug_stat_path, index_col=0)

# Drug IDs are handled as strings everywhere below (dict keys, column labels).
gdsc_drug_df.index = gdsc_drug_df.index.astype(str)
gdsc_drug_list = gdsc_drug_df.index

# Sanity check: (number of drugs, number of statistic columns).
print(gdsc_drug_df.shape)

(81, 27)


In [4]:
# Lookup tables keyed by drug ID (string), built from two columns of gdsc_drug_df.
drug_log2_max_conc_dict = {
    drug_id: log2_max_conc
    for drug_id, log2_max_conc in gdsc_drug_df['log2_max_conc'].items()
}
drug_log2_median_ic50_dict = {
    drug_id: log2_median_ic50
    for drug_id, log2_median_ic50 in gdsc_drug_df['log2_median_ic50'].items()
}

In [5]:
# Sample annotation table, indexed by sample ID (as string).
gdsc_sample_df = pd.read_csv('../data/GDSC/GDSC_tissue_info.csv', index_col=0)
gdsc_sample_df.index = gdsc_sample_df.index.astype(str)

# IDs of the samples present in the IC50 table (only its index is used here).
ic50_table_df = pd.read_csv('../data/GDSC/gdsc_all_abs_ic50_bayesian_sigmoid_only9dosages.csv', index_col=0)
gdsc_sample_list = ic50_table_df.index.astype(str)

# Samples annotated with the selected indication that also appear in the IC50 table.
is_indication = gdsc_sample_df['TCGA_CLASS'] == indication
indication_sample_ids = gdsc_sample_df[is_indication].index
indication_sample_list = indication_sample_ids[indication_sample_ids.isin(gdsc_sample_list)].tolist()
len(indication_sample_list)

42

##### Read predictions

In [6]:
# Load the pickled prediction output of every model spec and fold.
# output_dict[model_spec_name] is a list with one loaded dict per fold
# (fold files are numbered from 1, list positions from 0).
output_dict = {}

for model_spec_name in model_spec_name_list:

    output_dict[model_spec_name] = []

    for k in range(1, n_fold+1):
        pred_path = output_dir + '{}_5f_{}_output_dict.pickle'.format(model_spec_name, k)
        # NOTE(review): pickle.load can execute arbitrary code while unpickling;
        # only load prediction files produced by trusted runs.
        # The context manager closes the file handle, which the bare
        # pickle.load(open(...)) form left open.
        with open(pred_path, 'rb') as pred_file:
            pred_dict = pickle.load(pred_file)
        output_dict[model_spec_name].append(pred_dict)

##### Compare observed and predicted values for all validation sets

In [7]:
from sklearn.metrics import f1_score, precision_score, accuracy_score

In [8]:
# Score each drug on the validation samples of the selected indication, for
# every model spec and fold. One score table per (model, fold) is appended to
# result_df_list.
result_df_list = []

# NOTE: the misspelled column names ('precent_sensitive', 'accurary',
# 'precision1_resistant') are kept unchanged because they end up in the
# exported score tables.
result_columns = ['drug_id', 'precent_sensitive', 'log2_max_conc', 'log2_median_ic50', 'spearman', 'pval', 'accurary', 'f1', 'f1_resistant', 'f1_sensitive', 'precision1_resistant', 'precision_sensitive', 'MAE']

for model_spec_name in model_spec_name_list:
    for k in range(n_fold):

        pred_test_df = output_dict[model_spec_name][k]['pred_test_df']
        obs_test_df = output_dict[model_spec_name][k]['obs_test_df']

        # Restrict both tables to the test samples that belong to the indication.
        sample_list_k = sorted(pred_test_df.index[pred_test_df.index.isin(indication_sample_list)])
        pred_test_df = pred_test_df.loc[sample_list_k]
        obs_test_df = obs_test_df.loc[sample_list_k]

        drug_list = obs_test_df.columns

        results = []

        for d in drug_list:

            # Threshold separating "sensitive" (value below it) from "resistant".
            log2_max_conc = drug_log2_max_conc_dict[d]

            # Observed (x) and predicted (y) values, keeping only samples with
            # a non-missing observation.
            x = obs_test_df[d].values
            y = pred_test_df[d].values
            sel = ~np.isnan(x)

            x = x[sel]
            y = y[sel]

            # spearman
            scor, pval = stats.spearmanr(x, y)

            # F1 weighted and accuracy on the sensitive/resistant calls.
            # The boolean calls are turned into the string labels 'True'/'False'.
            sensitive_sel = x < log2_max_conc
            x_bool = sensitive_sel.astype(str)
            y_bool = (y < log2_max_conc).astype(str)
            f1 = f1_score(x_bool, y_bool, average='weighted')
            acc = accuracy_score(x_bool, y_bool)

            precent_sensitive = (np.sum(sensitive_sel) / len(x)) * 100

            if 0 < precent_sensitive < 100:
                # Both classes are observed, so average=None yields exactly two
                # scores, ordered by sorted label: 'False' (resistant), then
                # 'True' (sensitive).
                (f1_resistant, f1_sensitive) = f1_score(x_bool, y_bool, average=None)
                (precision1_resistant, precision_sensitive) = precision_score(x_bool, y_bool, average=None)
            else:
                # Per-class scores are undefined when only one class is observed.
                f1_resistant = np.nan
                f1_sensitive = np.nan
                precision1_resistant = np.nan
                precision_sensitive = np.nan

            # MAE (sensitive): absolute error over the observed-sensitive samples only
            x_sensitive = x[sensitive_sel]
            y_sensitive = y[sensitive_sel]
            mae = np.mean(np.abs(x_sensitive - y_sensitive))

            results.append([d, precent_sensitive, log2_max_conc, drug_log2_median_ic50_dict[d],
                            scor, pval, acc, f1, f1_resistant, f1_sensitive, precision1_resistant, precision_sensitive, mae])

        result_df = pd.DataFrame(results, columns=result_columns)
        result_df.loc[:, 'model'] = model_spec_name
        result_df.loc[:, 'k'] = k+1  # report folds 1-based, matching the file names

        result_df_list.append(result_df)
    

In [9]:
# Stack the per-model, per-fold score tables row-wise into one long table.
all_result_df = pd.concat(result_df_list)
all_result_df.shape

(243, 15)

In [10]:
# Export the per-fold scores (row index omitted) to an Excel file named after the indication.
cv_score_path = '../result/cv_pred/cv_score_{}.xlsx'.format(indication)
all_result_df.to_excel(cv_score_path, index=False)

##### Summarize 5-fold cross-validation scores (mean per drug and model)

In [11]:
# Preview the first rows of the per-fold score table.
all_result_df.head(n=5)

Unnamed: 0,drug_id,precent_sensitive,log2_max_conc,log2_median_ic50,spearman,pval,accurary,f1,f1_resistant,f1_sensitive,precision1_resistant,precision_sensitive,MAE,model,k
0,1001,71.428571,10.965784,10.861657,0.25,0.588724,0.714286,0.714286,0.5,0.8,0.5,0.8,1.095393,cadrres_HNSC,1
1,1003,100.0,-3.321928,-5.793488,0.6,0.208,0.833333,0.909091,,,,,1.781029,cadrres_HNSC,1
2,1004,100.0,-3.321928,-6.119126,0.0,1.0,1.0,1.0,,,,,0.996345,cadrres_HNSC,1
3,1006,57.142857,1.0,0.553154,0.392857,0.383317,0.571429,0.552381,0.4,0.666667,0.5,0.6,2.087725,cadrres_HNSC,1
4,1007,85.714286,-6.321928,-7.18899,-0.071429,0.879048,0.571429,0.623377,0.0,0.727273,0.0,0.8,1.749042,cadrres_HNSC,1


In [12]:
# Average every score column over folds for each (drug, model) pair, then
# drop the 'pval' and 'k' columns from the summary.
fold_mean_df = all_result_df.groupby(by=['drug_id', 'model']).mean().reset_index()
score_df = fold_mean_df.drop(columns=['pval', 'k'])

# Export the summary (row index omitted) and preview its first rows.
cv_summary_path = '../result/cv_pred/cv_score_summary_{}.xlsx'.format(indication)
score_df.to_excel(cv_summary_path, index=False)
score_df.head()

Unnamed: 0,drug_id,model,precent_sensitive,log2_max_conc,log2_median_ic50,spearman,accurary,f1,f1_resistant,f1_sensitive,precision1_resistant,precision_sensitive,MAE
0,1001,cadrres-wo-sample-bias-weight_HNSC,71.428571,10.965784,10.861657,0.5,0.857143,0.863492,0.8,0.888889,0.666667,1.0,1.02604
1,1001,cadrres-wo-sample-bias_HNSC,71.428571,10.965784,10.861657,0.428571,1.0,1.0,1.0,1.0,1.0,1.0,1.00292
2,1001,cadrres_HNSC,71.428571,10.965784,10.861657,0.25,0.714286,0.714286,0.5,0.8,0.5,0.8,1.095393
3,1003,cadrres-wo-sample-bias-weight_HNSC,100.0,-3.321928,-5.793488,-0.085714,1.0,1.0,,,,,1.945527
4,1003,cadrres-wo-sample-bias_HNSC,100.0,-3.321928,-5.793488,0.314286,0.833333,0.909091,,,,,1.950149
