In [1]:
import os,sys
import pandas as pd
import numpy as np

# Functions

In [3]:
# HP SELECTION - INNER LOOP
# find out the best HP per fold
def selectHP(metric, perf_inner):
    """Pick, for every target, the best hyper-parameter set of each outer fold.

    Inner-loop scores are averaged per (outer fold, hp) and the hp with the
    highest mean `metric` is kept for each of the three outer folds.

    Parameters
    ----------
    metric : str
        Name of the score column used for ranking (e.g. 'auc', 'kappa').
    perf_inner : pandas.DataFrame
        Inner-loop results. Must contain the columns 'target', 'ofold', 'hp',
        'auc' and `metric`; every column other than 'hp' is averaged, so it
        has to be numeric.

    Returns
    -------
    pandas.DataFrame
        One row per target with the columns 'hp_fold1', 'target', 'hp_fold2',
        'hp_fold3'. 'target' is taken from the group mean, so integer target
        ids come back as floats.

    Notes
    -----
    A target that has no inner-loop rows for one of the three outer folds is
    left out of the result instead of raising an IndexError.
    """
    folds = (1, 2, 3)
    picks = {fold: [] for fold in folds}

    for target in perf_inner['target'].unique():
        # extract results for the current target
        sub = perf_inner.loc[perf_inner['target'] == target]

        # mean score per (outer fold, hp), i.e. averaged over the inner folds;
        # the AUC pre-sort is kept so ties on `metric` resolve as before
        fold_means = (sub.groupby(['ofold', 'hp']).mean()
                         .reset_index()
                         .sort_values('auc', ascending=False))

        best_rows = []
        for fold in folds:
            ranked = (fold_means.loc[fold_means['ofold'] == fold]
                                .sort_values(metric, ascending=False))
            if ranked.empty:
                # this target has no data for the fold: drop the whole target
                break
            # select the wanted columns by name rather than dropping a fixed
            # list of metric columns, so extra score columns do no harm
            best_rows.append(ranked[['hp', 'target']].values[0])

        if len(best_rows) != len(folds):
            continue
        for fold, row in zip(folds, best_rows):
            picks[fold].append(row)

    hp_fold1, hp_fold2, hp_fold3 = (
        pd.DataFrame(picks[fold], columns=['hp_fold{}'.format(fold), 'target'])
        for fold in folds
    )
    return hp_fold1.merge(hp_fold2, on='target').merge(hp_fold3, on='target')

In [4]:
# Find out the performance - OUTER LOOP
# using the HP selected in inner loop
def getOuterLoopPerformance(perf_outer, hp_select, n_targets=526):
    """Collect the outer-loop scores obtained with the HP chosen in the inner loop.

    Parameters
    ----------
    perf_outer : pandas.DataFrame
        Outer-loop results with at least the columns 'target', 'hp', 'ofold'.
    hp_select : pandas.DataFrame
        Per-target selection as returned by `selectHP`, with the columns
        'target', 'hp_fold1', 'hp_fold2', 'hp_fold3'.
    n_targets : int, optional
        Targets 0 .. n_targets-1 are looked up (default 526, the value that
        was previously hard-coded).

    Returns
    -------
    tuple of three pandas.DataFrame
        (fold1, fold2, fold3); each has the columns of `perf_outer` and one
        row per retained target, in the same target order across the three.

    Notes
    -----
    A target is skipped for all three folds when it is absent from
    `hp_select` or when any of its three (hp, fold) rows is missing from
    `perf_outer`, so the three frames always have the same length.
    """
    folds = (1, 2, 3)
    rows = {fold: [] for fold in folds}

    for target in range(n_targets):
        # use the hp_select argument (the previous body read a notebook global)
        chosen = hp_select.loc[hp_select['target'] == target]
        if chosen.empty:
            continue
        target_perf = perf_outer.loc[perf_outer['target'] == target]

        picked = []
        for fold in folds:
            hp = chosen['hp_fold{}'.format(fold)].values[0]
            match = target_perf.loc[(target_perf['hp'] == hp) &
                                    (target_perf['ofold'] == fold)]
            ####  one target missing for SVM and RF
            if match.shape[0] == 0:
                break
            picked.append(match.values[0])

        # keep the target only if all three folds were found
        if len(picked) != len(folds):
            continue
        for fold, row in zip(folds, picked):
            rows[fold].append(row)

    # column names come from the input frame, not from a loop-local variable
    columns = perf_outer.columns.values
    all_perf_fold1, all_perf_fold2, all_perf_fold3 = (
        pd.DataFrame(rows[fold], columns=columns) for fold in folds
    )
    return all_perf_fold1, all_perf_fold2, all_perf_fold3

# XGBoost

## Inner fold - HP search

In [9]:
# Load the nested-CV score table (tab-separated, header on the first row).
xgboost_df = pd.read_csv('results_v5_xgboost_ecfp/all-nested-cv-perf-scores.csv', sep="\t", header=0)

# Split into inner-loop rows (ifold 1 or 2) and outer-loop rows (ifold 0);
# the 'al' and 'sample' columns are dropped from both.
xgb_inner = xgboost_df[xgboost_df['ifold'].isin([1, 2])].copy(deep=True).drop(columns=['al', 'sample'])
xgb_outer = xgboost_df[xgboost_df['ifold'].isin([0])].copy(deep=True).drop(columns=['al', 'sample'])

# The combined table is not needed once both subsets exist.
del xgboost_df

In [7]:
# Show the first five rows of the inner-loop results.
xgb_inner.head(n=5)

Unnamed: 0,ofold,ifold,target,hp,auc,recall,precision,kappa
9468,1,1,0,booster_gbtree-learning_rate_0.05-max_depth_5-...,0.808374,0.702326,0.731826,0.507079
9469,1,1,0,booster_gbtree-learning_rate_0.05-max_depth_5-...,0.804593,0.857364,0.603053,0.404244
9470,1,1,0,booster_gbtree-learning_rate_0.05-max_depth_5-...,0.792723,0.913178,0.523556,0.253211
9471,1,1,0,booster_gbtree-learning_rate_0.05-max_depth_5-...,0.810966,0.699225,0.722756,0.495144
9472,1,1,0,booster_gbtree-learning_rate_0.05-max_depth_5-...,0.81093,0.865116,0.605863,0.412486


In [8]:
# For every target, choose the best HP of each outer fold, ranked by mean inner-loop AUC.
hp_selection = selectHP(metric='auc', perf_inner=xgb_inner)

In [14]:
# Show the first five rows of the per-target HP selection.
hp_selection.head(n=5)

Unnamed: 0,hp_fold1,target,hp_fold2,hp_fold3
0,booster_gbtree-learning_rate_0.05-max_depth_10...,0.0,booster_gbtree-learning_rate_0.05-max_depth_10...,booster_gbtree-learning_rate_0.05-max_depth_10...
1,booster_gbtree-learning_rate_0.05-max_depth_5-...,100.0,booster_gbtree-learning_rate_0.05-max_depth_5-...,booster_gbtree-learning_rate_0.05-max_depth_10...
2,booster_gbtree-learning_rate_0.05-max_depth_5-...,101.0,booster_gbtree-learning_rate_0.05-max_depth_10...,booster_gbtree-learning_rate_0.05-max_depth_5-...
3,booster_gbtree-learning_rate_0.05-max_depth_5-...,102.0,booster_gbtree-learning_rate_0.05-max_depth_5-...,booster_gbtree-learning_rate_0.05-max_depth_5-...
4,booster_gbtree-learning_rate_0.05-max_depth_10...,103.0,booster_gbtree-learning_rate_0.05-max_depth_5-...,booster_gbtree-learning_rate_0.05-max_depth_10...


## Outer folds - Model evaluation

In [15]:
# Look up the outer-loop scores obtained with the HP selected in the inner loop,
# then write one tab-separated file per outer fold (row index not written).
perf_fold1, perf_fold2, perf_fold3 = getOuterLoopPerformance(xgb_outer, hp_selection)

for fold_perf, out_path in (
    (perf_fold1, "results_v5_xgboost_ecfp/perf_fold1.csv"),
    (perf_fold2, "results_v5_xgboost_ecfp/perf_fold2.csv"),
    (perf_fold3, "results_v5_xgboost_ecfp/perf_fold3.csv"),
):
    fold_perf.to_csv(out_path, sep="\t", index=None)

In [9]:
# hp selection based on AUC
# Print mean +/- standard deviation over targets of each score, one column per outer fold.
print("           fold1           fold2           fold3")

summary_rows = (
    ("AUC      =({:.4f}+/-{:.4f},{:.4f}+/-{:.4f},{:.4f}+/-{:.4f})", 'auc'),
    ("PRECISION=({:.4f}+/-{:.4f},{:.4f}+/-{:.4f},{:.4f}+/-{:.4f})", 'precision'),
    ("RECALL   =({:.4f}+/-{:.4f},{:.4f}+/-{:.4f},{:.4f}+/-{:.4f})", 'recall'),
    ("KAPPA    =({:.4f}+/-{:.4f},{:.4f}+/-{:.4f},{:.4f}+/-{:.4f})", 'kappa'),
)

for template, column in summary_rows:
    stats = []
    for fold_perf in (perf_fold1, perf_fold2, perf_fold3):
        # mean first, then std, matching the placeholder order of the template
        stats.append(fold_perf[column].mean())
        stats.append(fold_perf[column].std())
    print(template.format(*stats))

           fold1           fold2           fold3
AUC      =(0.8084+/-0.1212,0.8067+/-0.1224,0.8106+/-0.1177)
PRECISION=(0.6579+/-0.2562,0.6549+/-0.2602,0.6677+/-0.2410)
RECALL   =(0.5599+/-0.3462,0.5524+/-0.3426,0.5577+/-0.3394)
KAPPA    =(0.3626+/-0.2411,0.3619+/-0.2339,0.3649+/-0.2359)


## Find the best HP among the three outer folds

In [11]:
# get the best HP out of the three HP selected in inner loop based on outer loop performances
#
# Row i of perf_fold1/2/3 is read positionally. getOuterLoopPerformance drops a
# target from all three frames when any of its folds is missing, so the frames
# can hold fewer rows than the nominal 526 targets. Iterate over the rows that
# actually exist instead of a hard-coded count, which raised IndexError in
# that case.
n_rows = min(len(perf_fold1), len(perf_fold2), len(perf_fold3))

best_hp_for_full_scale = []
for i in range(n_rows):

    # position (0, 1 or 2) of the outer fold with the highest AUC for this row
    best_fold = np.argmax(np.array([perf_fold1.iloc[i]['auc'],
                                    perf_fold2.iloc[i]['auc'],
                                    perf_fold3.iloc[i]['auc']]))

    # HP that was used in that fold
    best_hp = np.array([perf_fold1.iloc[i]['hp'],
                        perf_fold2.iloc[i]['hp'],
                        perf_fold3.iloc[i]['hp']])[best_fold]

    best_hp_for_full_scale.append(best_hp)


In [None]:
# select one best HP per target based on full outer loop perf
# 

In [39]:
# For every target, keep the HP of its highest-AUC outer-loop row.
# The result is an array of HP strings ordered by target.
best_hp_for_full_scale = (
    xgb_outer
    .sort_values('auc', ascending=False)   # best rows first
    .groupby(['target'])
    .first()                               # first entry per target after the sort
    .reset_index()['hp']
    .values
)

In [40]:
# generate commands for full-scale models to run
#
# Usage: python xgboost_model.py [target_idx] [obj] [lr] [scale_pos_w] [n_Estim] [max_depth]
srun_cmd = 'srun --partition=core -c 12 --time=3-00:00:00 -J xgb'

# Key prefixes of the '-'-separated HP string, in the order the fields appear.
hp_prefixes = ('booster_', 'learning_rate_', 'max_depth_',
               'n_estimators_', 'objective_', 'scale_pos_weight_')

for i in range(526):
    target = i
    hp = best_hp_for_full_scale[i].split('-')

    # strip each key prefix so that only the value is left
    hp_values = [hp[pos].replace(prefix, '') for pos, prefix in enumerate(hp_prefixes)]
    booster, lr, max_d, n_estim, obj, scale_pw = hp_values

    # booster is parsed but is not part of the command line
    cmd_args = (srun_cmd, i, target, obj, lr, scale_pw, n_estim, max_d)
    print("{}{} python xgboost_model.py {} {} {} {} {} {} & ".format(*cmd_args))

    
    

srun --partition=core -c 12 --time=3-00:00:00 -J xgb0 python xgboost_model.py 0 binary:logistic 0.05 1 200 10 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb1 python xgboost_model.py 1 binary:logistic 0.05 1 200 5 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb2 python xgboost_model.py 2 binary:logistic 0.05 1 200 5 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb3 python xgboost_model.py 3 binary:logistic 0.05 10 200 10 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb4 python xgboost_model.py 4 binary:logistic 0.05 1 200 10 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb5 python xgboost_model.py 5 binary:logistic 0.05 5 200 5 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb6 python xgboost_model.py 6 binary:logistic 0.05 1 200 10 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb7 python xgboost_model.py 7 binary:logistic 0.05 10 200 10 & 
srun --partition=core -c 12 --time=3-00:00:00 -J xgb8 python xgboost_model.py 8 binary:logistic 0