In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

import xgboost as xgb

In [3]:
df = joblib.load('df_predictor_response.joblib')

In [4]:
df.head()

Unnamed: 0,gmv_1,gmv_2,gmv_3,gmv_4,gmv_5,gmv_6,gmv_7,gmv_8,gmv_9,gmv_10,...,fc_pca_42,fc_pca_43,fc_pca_44,fc_pca_45,fc_pca_46,fc_pca_47,fc_pca_48,fc_pca_49,fc_pca_50,responder
subj_1,0.567524,0.615216,0.610756,0.71475,0.532017,0.5383,0.621279,0.657148,0.59901,0.613495,...,-0.084168,0.183773,-0.734604,2.74505,0.58321,0.029631,-2.123206,3.374903,-3.654506,1.0
subj_2,0.564539,0.617759,0.679379,0.608611,0.596662,0.567769,0.816205,0.660637,0.676277,0.609318,...,5.886537,1.196023,-3.081668,-3.239135,2.283227,-4.860814,1.856561,-0.037,-2.071572,0.0
subj_3,0.597278,0.663706,0.643271,0.595084,0.567406,0.527736,0.754227,0.593889,0.643579,0.596471,...,-7.685474,-1.708413,-5.985563,7.651689,2.911852,-4.93971,-5.346591,2.766212,3.664858,1.0
subj_4,0.603452,0.604712,0.726019,0.646666,0.571758,0.586931,0.64559,0.609655,0.590275,0.535053,...,5.257554,-6.626827,1.683651,-0.296066,-4.691328,3.658944,3.403001,2.052774,-1.300202,0.0
subj_5,0.683891,0.649077,0.655115,0.588122,0.582557,0.576514,0.676339,0.613668,0.716159,0.653681,...,-0.866787,-2.709679,0.204509,2.405831,9.386981,-2.507854,1.345326,-4.24217,6.9745,0.0


In [5]:
##########

In [6]:
x_arr = df[df.columns[:-1]].astype('float').values

y_arr = df['responder'].astype('float').values

In [7]:
x_arr.shape, y_arr.shape

((57, 1983), (57,))

In [8]:
np.where(y_arr == 1)[0].shape, np.where(y_arr == 0)[0].shape

((20,), (37,))

In [9]:
##########

In [10]:
# used when fit xgb model
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_arr
)

In [11]:
# fine-tuned parameter
xgb_parameter = {
    'gamma': 0.01, 
    'subsample': 0.9, 
    'colsample_bytree': 0.3, 
    'colsample_bylevel': 0.5, 
    'colsample_bynode': 0.3, 
    'booster': 'gbtree', 
    'reg_alpha': 0.1, 
    'verbosity': 0, 
    'scale_pos_weight': 2.5, 
    'n_estimators': 50, 
    'max_depth': 2, 
    'learning_rate': 0.12, 
    'reg_lambda': 1.65
}

In [12]:
##########

In [13]:
def main_func(xgb_para):
    y_true = []
    y_pre = []
    
    for i in range(y_arr.shape[0]):
        rows_train = [pos for pos in range(y_arr.shape[0]) if pos != i]
        
        x_train = x_arr[rows_train]
        y_train = y_arr[rows_train]
        
        w1 = sample_weights[rows_train]
        
        x_test = x_arr[i]
        x_test = x_test.reshape(1, -1)
        y_test = y_arr[i]
        
        xgb_instance = xgb.XGBClassifier(**xgb_para)
        xgb_instance.fit(x_train, y_train, sample_weight=w1)
        
        y_test_pre = xgb_instance.predict(x_test)
        y_test_pre = np.squeeze(y_test_pre)
        
        y_true.append(y_test)
        y_pre.append(y_test_pre)
    
    save_res = {
        'y_true': np.array(y_true),
        'y_pre': np.array(y_pre)
    }
    
    return save_res

In [14]:
##########

In [15]:
res = main_func(xgb_parameter)
y_true = res['y_true']
y_pre = res['y_pre']

In [16]:
accuracy = metrics.accuracy_score(y_true, y_pre)

print(accuracy)

0.8596491228070176


In [17]:
cm = confusion_matrix(y_true, y_pre)

tn, fp, fn, tp = cm.ravel()

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

print(sensitivity, specificity)

0.85 0.8648648648648649


In [18]:
f1_score = metrics.f1_score(y_true, y_pre)

print(f1_score)

0.8095238095238095


In [19]:
roc_auc = metrics.roc_auc_score(y_true, y_pre)

print(roc_auc)

0.8574324324324325
