In [1]:
import tensorflow as tf
import xgboost as xgb
import pandas as pd
import functools
import time
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score, \
    auc, average_precision_score, pairwise_distances
import scikitplot as skplt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgb_hyper import objective
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pickle
import dill
from functools import partial

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Create a TF1-style session with `allow_growth` enabled on the GPU options,
# then report which GPU device TensorFlow sees.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
print("The following GPU devices are available: %s" % tf.test.gpu_device_name())

The following GPU devices ara available: /device:GPU:0


In [3]:
def calculate_metrics(y_true, y_pred, plots=False):
    """Compute binary-classification metrics for predicted scores.

    Scores are binarised at a 0.5 threshold for the confusion-matrix based
    metrics (tn/fp/fn/tp, precision, recall, accuracy); ROC AUC and average
    precision are computed from the raw scores.

    Args:
        y_true: 1-D np.ndarray of ground-truth binary labels (0/1).
        y_pred: 1-D array of predicted scores.
        plots: when True, also show a histogram of the predictions, an
            annotated confusion-matrix heatmap and the ROC curve.

    Returns:
        dict with keys 'roc_auc', 'tn', 'fp', 'fn', 'tp', 'map',
        'precision', 'recall' and 'accuracy'.

    Raises:
        AssertionError: if y_true is not an ndarray or an input is not 1-D.
    """
    assert isinstance(y_true, np.ndarray), 'y_true should be np.array'
    assert len(y_true.shape) == len(y_pred.shape) == 1, 'y_true or y_pred shapes are not 1 (probably not squeezed)'
    y_pred_bin = y_pred > 0.5

    # Pin the label set so the matrix is always 2x2; without it a batch that
    # only contains (and predicts) one class yields a 1x1 matrix and the
    # four-way unpacking below fails.
    cf = confusion_matrix(y_true, y_pred_bin, labels=[0, 1])
    tn, fp, fn, tp = cf.ravel()

    metrics = {
        'roc_auc': roc_auc_score(y_true, y_pred),
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'map': average_precision_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred_bin),
        'recall': recall_score(y_true, y_pred_bin),
        'accuracy': accuracy_score(y_true, y_pred_bin),
    }

    if plots:
        print('predictions histogram')
        plt.figure()
        # Roughly three predictions per bin, but never fewer than one bin:
        # a bin count of 0 (fewer than three predictions) is rejected.
        plt.hist(y_pred, bins=max(1, len(y_pred) // 3))
        plt.show()

        print('confusion matrix')
        plt.figure()
        group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        group_counts = ['{0:0.0f}'.format(value) for value in
                        cf.flatten()]
        group_percentages = ['{0:.2%}'.format(value) for value in
                             cf.flatten() / np.sum(cf)]
        # Each heatmap cell shows: group name, absolute count, share of all samples.
        labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in
                  zip(group_names, group_counts, group_percentages)]
        labels = np.asarray(labels).reshape(2, 2)
        sns.heatmap(cf, annot=labels, fmt='', cmap='Blues')
        plt.show()

        print('roc curve')
        # Constant scores give the diagonal "no skill" reference line.
        random_probs = [0 for _ in range(len(y_true))]
        # Reuse the value computed above; a local named `auc` would also
        # shadow the sklearn `auc` function imported by the notebook.
        roc_auc = metrics['roc_auc']
        print('Logistic: ROC AUC=%.3f' % (roc_auc))
        ns_fpr, ns_tpr, _ = roc_curve(y_true, random_probs)
        lr_fpr, lr_tpr, _ = roc_curve(y_true, y_pred)
        plt.plot(ns_fpr, ns_tpr, linestyle='--', label='random')
        # Label the model curve so plt.legend() lists it next to the baseline.
        plt.plot(lr_fpr, lr_tpr, marker='.', label='model')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend()
        plt.show()

    return metrics


In [4]:
# Target whose dataset and pre-computed fold definitions are loaded in this cell.
target = 'p38'
base_path = 'C:/Users/tomas/Documents/GitHub/kinase_binding'

# Main table, indexed by 'biolab_index'.
data_fpath = f'{base_path}/data/{target}/data.csv'
df = pd.read_csv(data_fpath).set_index('biolab_index')

# NOTE(review): dill.load can execute arbitrary code from the file; only load fold files you trust.
with open(f'{base_path}/data/{target}/train_val_folds.pkl', 'rb') as in_f:
    train_val_folds = dill.load(in_f)
with open(f'{base_path}/data/{target}/train_test_folds.pkl', 'rb') as in_f:
    train_test_folds = dill.load(in_f)

In [5]:
# One frame per cross-validation fold (six folds): train_val_folds[k][0] selects
# the training rows of fold k and train_val_folds[k][1] its validation rows.
training_list = [df.loc[train_val_folds[fold][0]] for fold in range(6)]
validation_list = [df.loc[train_val_folds[fold][1]] for fold in range(6)]

In [6]:
# Hyperopt search space for the XGBoost hyper-parameters explored by fmin.
# Each entry's label (first argument) matches its dict key; the labels are the
# keys under which hyperopt reports sampled values (see the "Best:" output).
# hp.quniform draws values on a grid of step q and reports them as floats
# (e.g. 'max_bin': 16.0), so integer-valued parameters need an int() cast
# before they are handed to XGBoost.
fspace = {
    'colsample_bylevel' : hp.uniform('colsample_bylevel', 0.1, 1), #+
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.1, 1), #+
    'gamma' : hp.uniform('gamma', 0.1, 1), #+
    'learning_rate' : hp.uniform('learning_rate', 0.1, 1),
    'max_delta_step' : hp.quniform('max_delta_step',1,10,1),
    'max_depth' : hp.quniform('max_depth',6, 12, 1),
    'min_child_weight' : hp.quniform('min_child_weight',10 ,500 ,5),
    'reg_alpha' : hp.uniform('reg_alpha',0.1,100),
    'reg_lambda' : hp.uniform('reg_lambda',0.1,100),
    'subsample' : hp.uniform('subsample',0.1,1.0),
    'max_bin' : hp.quniform('max_bin',16,256,16)
    # TODO: also search sampling_method, predictor, monotone_constraints,
    # interaction_constraints and single_precision_histogram.
}

In [7]:
# Bind the per-fold train/validation frames up front, leaving only the sampled
# hyper-parameters to be supplied by fmin.
fmin_objective = functools.partial(objective, val_sets=validation_list, train_sets=training_list)

In [8]:
def run_trials(trials_step=1, max_trials=1, trials_path="xgboost.hyperopt"):
    """Run (or resume) the hyperopt search and persist its Trials object.

    If a pickled Trials object exists at `trials_path`, the search resumes
    from it and runs `trials_step` additional evaluations; otherwise a fresh
    search of `max_trials` evaluations is started. The Trials object is
    written back to `trials_path` after the search finishes.

    Args:
        trials_step: number of additional trials to run after loading saved
            trials (1 = save after every iteration).
        max_trials: number of trials for a fresh search; keep it small to
            avoid waiting long before the first save.
        trials_path: file the Trials object is loaded from and saved to.

    Returns:
        The hyperopt Trials object holding all evaluations so far.
    """
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # `trials_path` at a file created by this notebook.
        with open(trials_path, "rb") as f:
            trials = pickle.load(f)
        print("Found saved Trials! Loading...")
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step))
    except FileNotFoundError:
        # Nothing saved yet: start a new search.
        trials = Trials()
    except Exception as exc:
        # Keep the best-effort fallback, but say why the saved state was not
        # used; KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Could not load saved Trials ({!r}); starting a new search "
              "(the saved file will be overwritten).".format(exc))
        trials = Trials()

    best = fmin(fn=fmin_objective, space=fspace, algo=tpe.suggest, max_evals=max_trials, trials=trials)

    print("Best:", best)

    # Persist the trials so the next call can resume from here.
    with open(trials_path, "wb") as f:
        pickle.dump(trials, f)
    return trials

In [9]:
# Run the search (resuming from the pickled "xgboost.hyperopt" file when it can
# be loaded) and keep the resulting Trials object for the cells below.
trials = run_trials()

Found saved Trials! Loading...
Rerunning from 509 trials to 510 (+1) trials
100%|█████████████████████████████████████████████| 510/510 [00:30<00:00, 16.47trial/s, best loss: -0.8434333737766622]
Best: {'colsample_bylevel': 0.5612301667238877, 'colsample_bytree': 0.788688363076523, 'gamma': 0.35376030016117566, 'learning_rate': 0.4023692255888918, 'max_bin': 16.0, 'max_delta_step': 3.0, 'max_depth': 8.0, 'min_child_weight': 70.0, 'reg_alpha': 0.15030685758880047, 'reg_lambda': 15.311721955443915, 'subsample': 0.8303923929525608}


In [None]:
# Select the trial with the lowest loss. `index` starts at 0 so that it is
# defined even when no later trial matches or beats the first one; `<=` makes
# the latest trial win among equal losses.
index = 0
best_loss = trials.trials[0]['result']['loss']
for i in range(1, len(trials.trials)):
    if trials.trials[i]['result']['loss'] <= best_loss:
        best_loss = trials.trials[i]['result']['loss']
        index = i

# Sampled values are stored as one-element lists under ['misc']['vals'].
best_params = trials.trials[index]['misc']['vals']

# Map the search-space labels to XGBoost parameter names; quniform samples are
# floats, so the integer-valued ones are cast explicitly.
# NOTE(review): 'max_bin' is part of the search space but is not forwarded
# here — confirm whether the objective uses it and whether it belongs in
# this dict as well.
hyper_params = {
    "colsample_bylevel": best_params['colsample_bylevel'][0],
    "colsample_bytree": best_params['colsample_bytree'][0],
    "gamma": best_params['gamma'][0],
    "eta": best_params['learning_rate'][0],
    "max_delta_step": int(best_params['max_delta_step'][0]),
    "max_depth": int(best_params['max_depth'][0]),
    "min_child_weight": int(best_params['min_child_weight'][0]),
    "alpha": best_params['reg_alpha'][0],
    "lambda": best_params['reg_lambda'][0],
    "subsample": best_params['subsample'][0],
    "eval_metric": 'auc',
    "objective": 'binary:logistic',
    "booster": 'gbtree',
    "tree_method": 'gpu_hist',
    "single_precision_histogram": True
}

In [None]:
from xgb_hyper import XGB_hyper

# Cross-validate the selected hyper-parameters: train one model per fold and
# record metrics on that fold's training and validation frames.
class_xgb = XGB_hyper(hyper_params)
training_metrics = {}
validation_metrics = {}
test_metrics = {}  # not filled in this cell
# The previous conversion of `test_data` was removed: that name is never
# defined in this notebook (NameError on a fresh kernel) and its result was
# not used inside the loop.
for i in range(len(training_list)):
    # Check for train/validation overlap before spending time on training.
    assert len(validation_list[i].index.intersection(training_list[i].index)) == 0

    dmatrix_train, train_labels = class_xgb.to_xgb_input(training_list[i])
    dmatrix_val, val_labels = class_xgb.to_xgb_input(validation_list[i])
    evalist = [(dmatrix_val, 'eval'), (dmatrix_train, 'train')]
    # NOTE(review): the positional `True` and `300` are passed through as-is;
    # their meaning is defined by XGB_hyper.build_model — confirm there.
    model = class_xgb.build_model(dmatrix_train, evalist, True, 300)

    y_pred_val = model.predict(dmatrix_val)
    validation_metrics['Val_%s' % i] = calculate_metrics(np.array(val_labels), y_pred_val)

    y_pred_train = model.predict(dmatrix_train)
    training_metrics['Train_%s' % i] = calculate_metrics(np.array(train_labels), y_pred_train)
    
    
  


In [None]:
# Turn the per-fold metric dicts into a table, append the mean over folds as
# an 'Average' entry, then transpose so each fold (and the average) is a row.
training_metrics = (
    pd.DataFrame(training_metrics)
    .pipe(lambda fold_df: fold_df.assign(Average=fold_df.mean(axis=1)))
    .T
)
training_metrics

In [None]:
# Turn the per-fold metric dicts into a table, append the mean over folds as
# an 'Average' entry, then transpose so each fold (and the average) is a row.
validation_metrics = (
    pd.DataFrame(validation_metrics)
    .pipe(lambda fold_df: fold_df.assign(Average=fold_df.mean(axis=1)))
    .T
)
validation_metrics

In [None]:
# Element 0 of train_test_folds selects the training rows, element 1 the test rows.
train_, test_ = (df.loc[train_test_folds[part]] for part in (0, 1))

In [None]:
# Final evaluation: fit on the train split and score the test split.
# Check for train/test overlap before doing any work.
assert len(test_.index.intersection(train_.index)) == 0

test_metrics = {}
dmatrix_train, train_labels = class_xgb.to_xgb_input(train_)
dmatrix_test, test_labels = class_xgb.to_xgb_input(test_)
# Watch the test split as 'eval'. The earlier use of `dmatrix_val` here relied
# on a variable left over from the last cross-validation fold, which is
# unrelated to this train/test split.
# NOTE(review): assumes build_model forwards `evalist` to xgboost's training
# call, where the last entry ('train') is the one used for early stopping, so
# the test split is only monitored — confirm in XGB_hyper.build_model.
evalist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
model = class_xgb.build_model(dmatrix_train, evalist, True, 300)

y_pred_test = model.predict(dmatrix_test)
test_metrics['Test'] = calculate_metrics(np.array(test_labels), y_pred_test)

# One row ('Test') with one column per metric.
test_metrics = pd.DataFrame(test_metrics)
test_metrics = test_metrics.T
test_metrics