In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import auc,confusion_matrix
import scipy.stats as stats

In [2]:
# Load the per-patient table (indexed by patient identifier) and count how
# many samples fall in each treatment arm.
table2 = pd.read_excel('i-spy2-tables2.xlsx').set_index('Patient Identifier')
arm_samples = table2['Arm (short name)'].value_counts().to_frame()

In [3]:
# Find the number of pCR samples in each arm and the resulting pCR rate.
arm_short_name = ['Ctr', 'AMG386', 'N', 'MK2206', 'Ganitumab', 'Ganetespib',
                  'VC', 'Pembro', 'TDM1/P', 'Pertuzumab']

# Keep only the arms of interest, in the order listed above.
arm_samples = arm_samples.loc[arm_short_name]

# Sum the 'pCR' column over the patients of each arm.
n_pCR = [
    table2.loc[table2['Arm (short name)'] == arm, 'pCR'].sum()
    for arm in arm_short_name
]
arm_samples['pCR'] = n_pCR
arm_samples['pCR_rate'] = (arm_samples['pCR'] / arm_samples['count']).round(3)
arm_samples

Unnamed: 0_level_0,count,pCR,pCR_rate
Arm (short name),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ctr,210,38,0.181
AMG386,134,40,0.299
N,114,41,0.36
MK2206,94,35,0.372
Ganitumab,106,24,0.226
Ganetespib,93,25,0.269
VC,71,27,0.38
Pembro,69,31,0.449
TDM1/P,52,30,0.577
Pertuzumab,44,26,0.591


In [4]:
def compute_acc_all(cmfs):
    """Summarise a vertical stack of 2x2 confusion matrices.

    Parameters
    ----------
    cmfs : pandas.DataFrame
        ``2n x 2`` frame holding ``n`` confusion matrices stacked on top of
        each other. Each 2-row slice is read positionally as
        ``[[tn, fp], [fn, tp]]``. A trailing unpaired row is ignored.

    Returns
    -------
    summary : list
        ``[acc_mean, acc_std, sen_mean, sen_std, spec_mean, spec_std,
        prec_mean, prec_std, f1_mean, f1_std]`` computed across the ``n``
        matrices (``numpy`` default population standard deviation).
    precision_all : list
        The per-matrix precision values, in input order.

    Notes
    -----
    Any metric whose denominator is zero (including every metric of an
    all-zero matrix) is recorded as 0, and those zeros take part in the
    mean and standard deviation.
    """

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0 rather than raising or giving NaN.
        return numerator / denominator if denominator != 0 else 0

    sensitivity_all = []
    specificity_all = []
    precision_all = []
    accuracy_all = []
    F1_all = []

    # Use the argument itself; the body must not depend on a global `cfms`.
    n_matrices = cmfs.shape[0] // 2
    for k in range(n_matrices):
        cfm = cmfs.iloc[2 * k:2 * k + 2, :]
        tn = cfm.iloc[0, 0]
        fp = cfm.iloc[0, 1]
        fn = cfm.iloc[1, 0]
        tp = cfm.iloc[1, 1]

        accuracy = _ratio(tn + tp, tn + fp + fn + tp)
        sensitivity = _ratio(tp, tp + fn)
        specificity = _ratio(tn, tn + fp)
        precision = _ratio(tp, tp + fp)

        # Both terms are non-negative, so a zero sum means both are zero.
        if sensitivity + precision == 0:
            f1 = 0
        else:
            f1 = 2 * sensitivity * precision / (sensitivity + precision)

        sensitivity_all.append(sensitivity)
        specificity_all.append(specificity)
        precision_all.append(precision)
        accuracy_all.append(accuracy)
        F1_all.append(f1)

    # Mean and standard deviation of each metric, flattened in the order
    # accuracy, sensitivity, specificity, precision, F1.
    summary = []
    for values in (accuracy_all, sensitivity_all, specificity_all,
                   precision_all, F1_all):
        values = np.array(values)
        summary += [values.mean(), values.std()]

    return summary, precision_all

In [5]:
# Calculate accuracy, sensitivity, specificity, precision and F1 from the
# saved confusion matrices: once per arm overall, and once per arm for each of
# the four HR/HER2 categories.
acc_all_arms = []       # without distinguishing the four HR/HER2 categories
acc_all_arms_hh00 = []  # for the four HR/HER2 categories separately
acc_all_arms_hh01 = []
acc_all_arms_hh10 = []
acc_all_arms_hh11 = []
# Explicit (hr, her2) -> list lookup; avoids building and eval()-ing code strings.
acc_by_subgroup = {
    (0, 0): acc_all_arms_hh00,
    (0, 1): acc_all_arms_hh01,
    (1, 0): acc_all_arms_hh10,
    (1, 1): acc_all_arms_hh11,
}
prec_pval = []  # p-values of comparing the precisions of XGBoost with the I-SPY2 pCR rate

filename_prefix = 'res_xgb35/results_xgb35_hh1norm1_'
for a in arm_short_name:
    # The result files for 'TDM1/P' are named with 'TDM1P'.
    b = 'TDM1P' if a == 'TDM1/P' else a

    # Overall performance metrics for this arm.
    cfms = pd.read_csv(filename_prefix + b + '_cfm.csv', header=None)
    metrics, precision = compute_acc_all(cfms)
    acc_all_arms.append(metrics)

    # One-sided tests: are the per-matrix precisions greater than the arm's pCR rate?
    pcr_rate = arm_samples.loc[a, 'pCR_rate']
    t, pt = stats.ttest_1samp(precision, pcr_rate, alternative='greater')
    # Convert explicitly so the subtraction does not rely on list/scalar coercion.
    t, pw = stats.wilcoxon(np.asarray(precision) - pcr_rate, alternative='greater')
    prec_pval.append([pt, pw])

    # Performance metrics for the four HR/HER2 categories. The file holds the
    # four stacks side by side, two columns per (hr, her2) combination.
    cfms_hh = pd.read_csv(filename_prefix + b + '_cfm_hh.csv', header=None)
    for hr in [0, 1]:
        for her2 in [0, 1]:
            i = (hr * 2 + her2) * 2
            cfms = cfms_hh.iloc[:, i:i + 2]
            metrics, _ = compute_acc_all(cfms)
            acc_by_subgroup[(hr, her2)].append(metrics)

cols = ['acc', 'acc_std', 'sen', 'sen_std', 'spec', 'spec_std', 'prec', 'prec_std', 'f1', 'f1_std']
df_acc_all_arms = pd.DataFrame(acc_all_arms, index=arm_short_name, columns=cols)
df_acc_all_arms_hh00 = pd.DataFrame(acc_all_arms_hh00, index=arm_short_name, columns=cols)
df_acc_all_arms_hh01 = pd.DataFrame(acc_all_arms_hh01, index=arm_short_name, columns=cols)
df_acc_all_arms_hh10 = pd.DataFrame(acc_all_arms_hh10, index=arm_short_name, columns=cols)
df_acc_all_arms_hh11 = pd.DataFrame(acc_all_arms_hh11, index=arm_short_name, columns=cols)

In [6]:
# Attach the mean/std precision per arm and the two one-sided p-values
# (t-test and Wilcoxon) to the per-arm sample table.
precision_cols = ['prec', 'prec_std']
arm_samples[precision_cols] = df_acc_all_arms[precision_cols].round(3)
pval_array = np.array(prec_pval)
arm_samples[['p_t', 'p_w']] = pval_array
arm_samples

Unnamed: 0_level_0,count,pCR,pCR_rate,prec,prec_std,p_t,p_w
Arm (short name),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ctr,210,38,0.181,0.303,0.099,1.616712e-05,1.907349e-06
AMG386,134,40,0.299,0.484,0.094,2.929368e-08,1.907349e-06
N,114,41,0.36,0.555,0.11,1.465957e-07,9.536743e-07
MK2206,94,35,0.372,0.471,0.134,0.002274725,0.003194809
Ganitumab,106,24,0.226,0.358,0.104,1.190996e-05,4.768372e-06
Ganetespib,93,25,0.269,0.485,0.233,0.0003421918,0.000292778
VC,71,27,0.38,0.642,0.107,9.569261e-10,9.536743e-07
Pembro,69,31,0.449,0.74,0.124,1.814886e-09,9.536743e-07
TDM1/P,52,30,0.577,0.784,0.1,1.416344e-08,1.907349e-06
Pertuzumab,44,26,0.591,0.621,0.203,0.2616117,0.2261877


In [7]:
# Combine the per-arm metric summary with the AUC and AUPRC tables read from
# the analysis folder, rounded to three decimals.
filename_prefix = 'res_xgb35_analysis/'
auc_table = pd.read_csv(filename_prefix + 'all_auc.csv').set_index('Unnamed: 0')
auprc_table = pd.read_csv(filename_prefix + 'all_auprc.csv').set_index('Unnamed: 0')
df_all = pd.concat([df_acc_all_arms, auc_table, auprc_table], axis=1).round(3)

In [8]:
# Display the combined per-arm table (metric summary plus AUC/AUPRC columns).
df_all

Unnamed: 0,acc,acc_std,sen,sen_std,spec,spec_std,prec,prec_std,f1,f1_std,auc_mean,auc_std,auprc_mean,auprc_std
Ctr,0.702,0.078,0.419,0.191,0.767,0.121,0.303,0.099,0.334,0.104,0.622,0.097,0.436,0.054
AMG386,0.683,0.069,0.569,0.183,0.732,0.118,0.484,0.094,0.505,0.115,0.713,0.077,0.502,0.058
N,0.632,0.063,0.565,0.198,0.677,0.149,0.555,0.11,0.537,0.109,0.653,0.071,0.531,0.045
MK2206,0.595,0.091,0.481,0.182,0.665,0.138,0.471,0.134,0.464,0.139,0.578,0.122,0.5,0.06
Ganitumab,0.648,0.081,0.59,0.214,0.665,0.139,0.358,0.104,0.423,0.099,0.67,0.09,0.48,0.059
Ganetespib,0.705,0.092,0.51,0.257,0.775,0.149,0.485,0.233,0.449,0.177,0.711,0.122,0.539,0.077
VC,0.713,0.079,0.708,0.189,0.717,0.143,0.642,0.107,0.656,0.107,0.792,0.086,0.605,0.076
Pembro,0.768,0.09,0.725,0.199,0.8,0.115,0.74,0.124,0.718,0.133,0.814,0.094,0.641,0.058
TDM1/P,0.691,0.123,0.707,0.178,0.662,0.163,0.784,0.1,0.734,0.133,0.788,0.132,0.692,0.055
Pertuzumab,0.594,0.154,0.61,0.3,0.575,0.238,0.621,0.203,0.59,0.222,0.671,0.179,0.625,0.081


In [9]:
# Export the combined metrics table.
df_all.to_csv(filename_prefix + 'all_metrics.csv')

# Export the per-arm pCR table with p-values written in scientific notation.
# Formatting is done on a copy so `arm_samples` keeps numeric p-values and
# this cell can be re-run without failing on already-formatted strings.
arm_samples_export = arm_samples.assign(
    p_t=arm_samples['p_t'].map('{:.2e}'.format),
    p_w=arm_samples['p_w'].map('{:.2e}'.format),
)
arm_samples_export.to_csv(filename_prefix + 'all_pCR_rate.csv')

In [10]:
# Per-arm metrics for the hr=0 / her2=0 category, rounded to three decimals.
# All-zero rows presumably mark arms with no samples in this category.
df_acc_all_arms_hh00 = df_acc_all_arms_hh00.round(3)
df_acc_all_arms_hh00

Unnamed: 0,acc,acc_std,sen,sen_std,spec,spec_std,prec,prec_std,f1,f1_std
Ctr,0.668,0.12,0.467,0.371,0.711,0.174,0.251,0.175,0.3,0.198
AMG386,0.545,0.104,0.68,0.204,0.433,0.207,0.508,0.099,0.566,0.121
N,0.607,0.186,0.65,0.288,0.575,0.317,0.588,0.263,0.57,0.208
MK2206,0.571,0.128,0.4,0.249,0.7,0.245,0.482,0.328,0.407,0.234
Ganitumab,0.515,0.156,0.617,0.284,0.471,0.235,0.339,0.136,0.42,0.163
Ganetespib,0.594,0.128,0.562,0.305,0.62,0.26,0.569,0.247,0.513,0.204
VC,0.556,0.115,0.71,0.214,0.3,0.296,0.636,0.087,0.654,0.117
Pembro,0.733,0.186,0.838,0.213,0.525,0.37,0.793,0.161,0.799,0.161
TDM1/P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pertuzumab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Per-arm metrics for the hr=0 / her2=1 category, rounded to three decimals.
# All-zero rows presumably mark arms with no samples in this category.
df_acc_all_arms_hh01 = df_acc_all_arms_hh01.round(3)
df_acc_all_arms_hh01

Unnamed: 0,acc,acc_std,sen,sen_std,spec,spec_std,prec,prec_std,f1,f1_std
Ctr,0.7,0.256,0.65,0.477,0.725,0.334,0.483,0.421,0.533,0.424
AMG386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N,0.51,0.195,0.583,0.296,0.4,0.339,0.577,0.266,0.558,0.24
MK2206,0.612,0.23,0.65,0.288,0.5,0.5,0.808,0.246,0.685,0.23
Ganitumab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ganetespib,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pembro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TDM1/P,0.7,0.218,0.883,0.264,0.15,0.357,0.733,0.202,0.793,0.217
Pertuzumab,0.65,0.196,0.825,0.286,0.3,0.458,0.692,0.231,0.732,0.22


In [12]:
# Per-arm metrics for the hr=1 / her2=0 category, rounded to three decimals.
# All-zero rows presumably mark arms with no samples in this category.
df_acc_all_arms_hh10 = df_acc_all_arms_hh10.round(3)
df_acc_all_arms_hh10

Unnamed: 0,acc,acc_std,sen,sen_std,spec,spec_std,prec,prec_std,f1,f1_std
Ctr,0.747,0.094,0.417,0.331,0.809,0.116,0.266,0.204,0.311,0.229
AMG386,0.808,0.086,0.45,0.384,0.873,0.105,0.32,0.278,0.36,0.299
N,0.675,0.195,0.6,0.49,0.7,0.277,0.337,0.344,0.412,0.37
MK2206,0.633,0.172,0.25,0.433,0.71,0.173,0.133,0.261,0.167,0.303
Ganitumab,0.758,0.137,0.55,0.35,0.8,0.152,0.412,0.327,0.431,0.275
Ganetespib,0.805,0.12,0.3,0.458,0.861,0.14,0.139,0.258,0.178,0.298
VC,0.893,0.1,0.7,0.458,0.925,0.098,0.542,0.418,0.592,0.417
Pembro,0.794,0.099,0.5,0.354,0.892,0.132,0.537,0.377,0.485,0.31
TDM1/P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pertuzumab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Per-arm metrics for the hr=1 / her2=1 category, rounded to three decimals.
# All-zero rows presumably mark arms with no samples in this category.
df_acc_all_arms_hh11 = df_acc_all_arms_hh11.round(3)
df_acc_all_arms_hh11

Unnamed: 0,acc,acc_std,sen,sen_std,spec,spec_std,prec,prec_std,f1,f1_std
Ctr,0.638,0.167,0.05,0.218,0.833,0.197,0.05,0.218,0.05,0.218
AMG386,0.65,0.223,0.25,0.433,0.85,0.278,0.217,0.398,0.225,0.402
N,0.7,0.094,0.45,0.264,0.825,0.144,0.622,0.302,0.475,0.192
MK2206,0.562,0.294,0.45,0.497,0.6,0.309,0.283,0.362,0.333,0.391
Ganitumab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ganetespib,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pembro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TDM1/P,0.686,0.125,0.575,0.211,0.833,0.197,0.859,0.166,0.656,0.177
Pertuzumab,0.567,0.2,0.467,0.386,0.667,0.316,0.465,0.37,0.438,0.335


In [14]:
# Write one CSV per HR/HER2 category next to the overall metrics file.
hh_tables = {
    'hh00': df_acc_all_arms_hh00,
    'hh01': df_acc_all_arms_hh01,
    'hh10': df_acc_all_arms_hh10,
    'hh11': df_acc_all_arms_hh11,
}
for hh_tag, hh_table in hh_tables.items():
    hh_table.to_csv(filename_prefix + 'all_metrics_' + hh_tag + '.csv')