In [1]:
import pandas as pd
from keras.callbacks import History, ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
import os
import numpy as np
from data_analysis import calculate_metrics
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pickle
import dill
from hyper_mining import objective_fn

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Joint hyperopt search space. Every label passed to hp.* equals its key, so
# sampled values can be looked up by the same name afterwards.
fspace = dict(
    # --- parameters later copied into the GCN configuration (gcn_best) ---
    conv1=hp.quniform('conv1', 32, 64, 8),
    conv2=hp.quniform('conv2', 64, 128, 8),
    conv3=hp.quniform('conv3', 128, 168, 8),
    fp=hp.quniform('fp', 96, 196, 8),
    dense1=hp.quniform('dense1', 96, 512, 32),
    dense2=hp.quniform('dense2', 96, 512, 32),
    dense3=hp.quniform('dense3', 64, 512, 32),
    dropout_rate=hp.uniform('dropout_rate', 0.1, 0.5),
    lr=hp.uniform('lr', 0.000001, 0.01),
    n_epochs=hp.quniform('n_epochs', 15, 60, 5),
    batch_size=hp.quniform('batch_size', 64, 256, 16),
    # --- parameters later copied into the XGBoost configuration (xgb_best) ---
    colsample_bylevel=hp.uniform('colsample_bylevel', 0.1, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.1, 1),
    gamma=hp.uniform('gamma', 0.1, 1),
    learning_rate=hp.uniform('learning_rate', 0.1, 1),
    max_delta_step=hp.quniform('max_delta_step', 1, 10, 1),
    max_depth=hp.quniform('max_depth', 6, 12, 1),
    min_child_weight=hp.quniform('min_child_weight', 10, 500, 5),
    reg_alpha=hp.uniform('reg_alpha', 0.1, 100),
    reg_lambda=hp.uniform('reg_lambda', 0.1, 100),
    subsample=hp.uniform('subsample', 0.1, 1.0),
    max_bin=hp.quniform('max_bin', 16, 256, 16),
    # margin=hp.uniform('margin', 0.2, 2)   (disabled)
)

In [3]:
target = 'p38'
# Repository root. Set the KINASE_BINDING_PATH environment variable to point at
# a different checkout; the fallback is the location the notebook was written
# against, so existing setups keep working unchanged.
base_path = os.environ.get('KINASE_BINDING_PATH',
                           'C:/Users/tomas/Documents/GitHub/kinase_binding')
target_dir = f'{base_path}/data/{target}'

# Full data table, indexed by the 'biolab_index' column so the fold files can
# select rows with df.loc[...].
data_fpath = f'{target_dir}/data.csv'
df = pd.read_csv(data_fpath).set_index('biolab_index')

# NOTE: dill.load (like pickle) can execute arbitrary code while deserialising;
# only load fold files that come from a trusted source.
with open(f'{target_dir}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds = dill.load(in_f)
with open(f'{target_dir}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds = dill.load(in_f)

In [5]:
# Build one DataFrame per fold for folds 0..5. For fold k, train_val_folds[k][0]
# selects the rows used for training and train_val_folds[k][1] the rows used
# for validation.
training_list = [df.loc[train_val_folds[k][0]] for k in range(6)]
validation_list = [df.loc[train_val_folds[k][1]] for k in range(6)]

In [6]:
# Pre-bind the per-fold frames so the optimiser only has to pass the sampled
# hyperparameters to the objective.
fmin_objective = partial(
    objective_fn,
    train_sets=training_list,
    val_sets=validation_list,
)

In [7]:
def run_trials(trials_path="gcn_xgb.hyperopt", trials_step=0, max_trials=1):
    """Run, or resume, the hyperopt search and persist its Trials object.

    If ``trials_path`` exists, the pickled Trials object is loaded and the
    evaluation budget becomes ``len(trials.trials) + trials_step``; with the
    default ``trials_step=0`` no additional evaluations are requested.
    If the file does not exist, a fresh Trials object is created and
    ``max_trials`` evaluations are requested.

    Args:
        trials_path: File the Trials object is loaded from and saved to.
        trials_step: How many additional trials to run after loading saved
            trials (ignored when there is no saved file).
        max_trials: Evaluation budget for a fresh search; keep it small to
            avoid a long wait.

    Returns:
        The Trials object after the search, which is also written to
        ``trials_path``.

    Side effects:
        Prints progress messages and the best parameters found, and
        overwrites ``trials_path``.
    """
    try:
        # NOTE: pickle can execute arbitrary code on load; only use trusted files.
        with open(trials_path, "rb") as f:
            trials = pickle.load(f)
    except FileNotFoundError:
        # Only a missing file means "start from scratch". Any other failure is
        # allowed to surface: silently falling back to an empty Trials here
        # would end with the existing results file being overwritten below.
        trials = Trials()
    else:
        print("Found saved Trials! Loading...")
        n_done = len(trials.trials)
        max_trials = n_done + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(n_done, max_trials, trials_step))

    best = fmin(fn=fmin_objective, space=fspace, algo=tpe.suggest,
                max_evals=max_trials, trials=trials)

    print("Best:", best)

    # Save the trials object so a later call can resume from it.
    with open(trials_path, "wb") as f:
        pickle.dump(trials, f)
    return trials

In [8]:
# Run the search (resuming from the saved "gcn_xgb.hyperopt" file when it can be
# loaded) and keep the resulting Trials object for the cells below.
trials = run_trials()

Found saved Trials! Loading...
Rerunning from 69 trials to 69 (+0) trials
100%|██████████████████████████████████████████████████████████████████████████| 69/69 [00:00<?, ?trial/s, best loss=?]
Best: {'batch_size': 256.0, 'colsample_bylevel': 0.9742808465908208, 'colsample_bytree': 0.37580168789089324, 'conv1': 40.0, 'conv2': 80.0, 'conv3': 160.0, 'dense1': 256.0, 'dense2': 384.0, 'dense3': 384.0, 'dropout_rate': 0.11318471159226598, 'fp': 96.0, 'gamma': 0.6244840665119147, 'learning_rate': 0.10045460409368323, 'lr': 0.00022802502278976386, 'max_bin': 80.0, 'max_delta_step': 4.0, 'max_depth': 7.0, 'min_child_weight': 25.0, 'n_epochs': 15.0, 'reg_alpha': 50.67508723786807, 'reg_lambda': 18.383656172678286, 'subsample': 0.4818345443328064}


In [9]:
# Display the raw list of per-trial records held by the Trials object.
trials.trials

[{'state': 2,
  'tid': 0,
  'spec': None,
  'result': {'loss': -0.4865970290545754, 'status': 'ok'},
  'misc': {'tid': 0,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'batch_size': [0],
    'colsample_bylevel': [0],
    'colsample_bytree': [0],
    'conv1': [0],
    'conv2': [0],
    'conv3': [0],
    'dense1': [0],
    'dense2': [0],
    'dense3': [0],
    'dropout_rate': [0],
    'fp': [0],
    'gamma': [0],
    'learning_rate': [0],
    'lr': [0],
    'max_bin': [0],
    'max_delta_step': [0],
    'max_depth': [0],
    'min_child_weight': [0],
    'n_epochs': [0],
    'reg_alpha': [0],
    'reg_lambda': [0],
    'subsample': [0]},
   'vals': {'batch_size': [224.0],
    'colsample_bylevel': [0.42978801235510555],
    'colsample_bytree': [0.8528808281502837],
    'conv1': [48.0],
    'conv2': [96.0],
    'conv3': [144.0],
    'dense1': [160.0],
    'dense2': [192.0],
    'dense3': [448.0],
    'dropout_rate': [0.4033342875738264],
    'fp': [192.

In [None]:
# Fetch the current CUDA device through numba and reset it.
# NOTE(review): presumably used to release GPU state left by earlier cells
# before retraining — confirm intent; this cell has no execution count.
from numba import cuda 
device = cuda.get_current_device()
device.reset()

In [8]:
# Pick the trial with the lowest recorded loss and keep its sampled values.
# `index` starts unset and is assigned inside the scan, so it is defined even
# when the very first trial is the best one. Ties go to the latest trial (<=).
index = None
best_loss = None
for i, trial in enumerate(trials.trials):
    loss = trial['result'].get('loss')
    if loss is None:
        # A trial whose result has no 'loss' entry cannot be ranked; skip it.
        continue
    if best_loss is None or loss <= best_loss:
        best_loss = loss
        index = i
if index is None:
    raise ValueError("no trial with a recorded loss was found")
# Each entry of 'vals' is a list holding the sampled value for that label.
best_params = trials.trials[index]['misc']['vals']

In [9]:
# Show the selected hyperparameters; each value is wrapped in a one-element
# list, which is why later cells index it with [0].
best_params

{'batch_size': [256.0],
 'colsample_bylevel': [0.9742808465908208],
 'colsample_bytree': [0.37580168789089324],
 'conv1': [40.0],
 'conv2': [80.0],
 'conv3': [160.0],
 'dense1': [256.0],
 'dense2': [384.0],
 'dense3': [384.0],
 'dropout_rate': [0.11318471159226598],
 'fp': [96.0],
 'gamma': [0.6244840665119147],
 'learning_rate': [0.10045460409368323],
 'lr': [0.00022802502278976386],
 'max_bin': [80.0],
 'max_delta_step': [4.0],
 'max_depth': [7.0],
 'min_child_weight': [25.0],
 'n_epochs': [15.0],
 'reg_alpha': [50.67508723786807],
 'reg_lambda': [18.383656172678286],
 'subsample': [0.4818345443328064]}

In [11]:
from hyper_mining import XGB_predictor,GCN_online_mining_test
from data_analysis import calculate_metrics


def _chosen(name):
    """Return the value stored at position 0 under `name` in best_params."""
    return best_params[name][0]


# Keras callbacks placed in the GCN configuration; both watch the training loss.
es = EarlyStopping(monitor='loss', patience=8, min_delta=0)
rlr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=4, verbose=1, min_lr=1e-7)

# GCN configuration: fixed architecture settings plus the tuned values.
gcn_best = {
    "num_layers": 3,
    "max_atoms": 70,
    "num_atom_features": 62,
    "num_atom_features_original": 62,
    "num_bond_features": 6,
    "max_degree": 5,
    "conv_width": [int(_chosen(label)) for label in ('conv1', 'conv2', 'conv3')],
    # the single tuned 'fp' value is repeated three times
    "fp_length": [int(_chosen('fp')) for _ in range(3)],
    "activ_enc": "selu",
    "activ_dec": "selu",
    "learning_rates": [0.001] * 3,
    "learning_rates_fp": [0.005] * 3,
    "losses_conv": {
        "neighbor_output": "mean_squared_error",
        "self_output": "mean_squared_error",
    },
    "lossWeights": {"neighbor_output": 1.0, "self_output": 1.0},
    "metrics": "mse",
    "loss_fp": "mean_squared_error",
    "enc_layer_names": ["enc_1", "enc_2", "enc_3"],
    "callbacks": [es, rlr],
    "adam_decay": 0.0005329142291371636,
    "beta": 5,
    "p": 0.004465204118126482,
    "dense_size": [int(_chosen(label)) for label in ('dense1', 'dense2', 'dense3')],
    # the same tuned dropout rate is listed twice
    "dropout_rate": [_chosen('dropout_rate'), _chosen('dropout_rate')],
    "lr": _chosen('lr'),
    "batch_size": int(_chosen('batch_size')),
    "n_epochs": int(_chosen('n_epochs')),
    # 'margin' : best_params['margin'][0]   (disabled)
}

# XGBoost configuration; some search-space labels map to different booster
# keys (learning_rate -> eta, reg_alpha -> alpha, reg_lambda -> lambda).
xgb_best = {
    "colsample_bylevel": _chosen('colsample_bylevel'),
    "colsample_bytree": _chosen('colsample_bytree'),
    "gamma": _chosen('gamma'),
    "eta": _chosen('learning_rate'),
    "max_delta_step": int(_chosen('max_delta_step')),
    "max_depth": int(_chosen('max_depth')),
    "min_child_weight": int(_chosen('min_child_weight')),
    "alpha": _chosen('reg_alpha'),
    "lambda": _chosen('reg_lambda'),
    "subsample": _chosen('subsample'),
    "max_bin": int(_chosen('max_bin')),
    "eval_metric": 'auc',
    "objective": 'binary:logistic',
    "booster": 'gbtree',
    # "single_precision_histogram" : True   (disabled)
}

class_XGB = XGB_predictor(xgb_best)
class_GCN = GCN_online_mining_test(gcn_best)

In [None]:
# A second, hand-specified XGBoost parameter set wrapped in its own predictor.
xgb_hyper = {
    "colsample_bylevel": 0.5612301667238877,
    "colsample_bytree": 0.788688363076523,
    "gamma": 0.35376030016117566,
    "eta": 0.4023692255888918,
    "max_delta_step": 3,
    "max_depth": 8,
    "min_child_weight": 70,
    "alpha": 0.15030685758880047,
    "lambda": 15.311721955443915,
    "subsample": 0.8303923929525608,
    "max_bin": 208,
    "eval_metric": 'auc',
    "objective": 'binary:logistic',
    "booster": 'gbtree',
    "single_precision_histogram": True,
}
class_XGB_2 = XGB_predictor(xgb_hyper)

In [14]:
#K.clear_session()
# For every fold: train the GCN on the training frame, embed both frames with
# the trained model, fit XGBoost on the training embeddings and score both
# splits. Results are stored under 'Train_<i>' / 'Val_<i>'.
training_metrics = {}
validation_metrics = {}
es2 = EarlyStopping(monitor='loss',patience=15, min_delta=0)
rlr2 = ReduceLROnPlateau(monitor='loss',factor=0.5, patience=2, verbose=1, min_lr=0.000000001)

# Width of the placeholder target arrays handed to fit(): last dense size + 1.
dummy_width = gcn_best['dense_size'][2] + 1

for i, (train_df, val_df) in enumerate(zip(training_list, validation_list)):
    # Validation ("cold") inputs and labels for this fold.
    X_atoms_cold, X_bonds_cold, X_edges_cold = class_GCN.dataframe_to_gcn_input(val_df)
    Y_cold = val_df.Binary
    # np.zeros instead of np.empty: np.empty returns uninitialised memory, so
    # the placeholder targets could hold arbitrary values (including NaN/inf)
    # that differ between runs.
    # NOTE(review): the mining loss presumably ignores these targets — confirm
    # in hyper_mining.
    Y_dummy_cold = np.zeros((X_atoms_cold.shape[0], dummy_width))

    # Training inputs and labels for this fold.
    X_atoms_train, X_bonds_train, X_edges_train = class_GCN.dataframe_to_gcn_input(train_df)
    Y = train_df.Binary
    Y_dummy_train = np.zeros((X_atoms_train.shape[0], dummy_width))

    # Fresh networks per fold so nothing carries over between folds.
    gcn_encoder = class_GCN.build_encoder()
    gcn_model = class_GCN.build_model(gcn_encoder)
    gcn_mining = class_GCN.build_mining(gcn_model)

    # The labels are passed as a fourth model input; the y argument only
    # receives the placeholder array.
    gcn_mining.fit([X_atoms_train, X_bonds_train, X_edges_train, Y],
                   Y_dummy_train,
                   epochs=gcn_best['n_epochs'],
                   batch_size=gcn_best['batch_size'],
                   shuffle=True,
                   validation_data=([X_atoms_cold, X_bonds_cold, X_edges_cold, Y_cold], Y_dummy_cold),
                   callbacks=[es2, rlr2]
                   )

    # Predict embeddings with the model that was wrapped by the mining network.
    embeddings_cold = gcn_model.predict([X_atoms_cold, X_bonds_cold, X_edges_cold])
    embeddings_train = gcn_model.predict([X_atoms_train, X_bonds_train, X_edges_train])

    # Prepare data for XGBoost.
    dmatrix_train = class_XGB.to_xgb_input(Y, embeddings_train)
    dmatrix_cold = class_XGB.to_xgb_input(Y_cold, embeddings_cold)

    evalist = [(dmatrix_train, 'train'), (dmatrix_cold, 'eval')]
    # NOTE(review): 300 is presumably the number of boosting rounds — confirm
    # against XGB_predictor.build_model.
    xgb_model = class_XGB.build_model(dmatrix_train, evalist, 300)

    xgb_pred_cold = xgb_model.predict(dmatrix_cold)
    validation_metrics['Val_%s' % i] = calculate_metrics(np.array(Y_cold), xgb_pred_cold)

    xgb_pred_train = xgb_model.predict(dmatrix_train)
    training_metrics['Train_%s' % i] = calculate_metrics(np.array(Y), xgb_pred_train)
        


LAYER 0
LAYER 1
LAYER 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2541 samples, validate on 509 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
LAYER 0
LAYER 1
LAYER 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2541 samples, validate on 509 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
LAYER 0
LAYER 1
LAYER 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2541 samples, validate on 509 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
LAYER 0
LAYER 1
LAYER 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2541 samples, validate on 509 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.00011401250958442688.
LAYER 0
LAYER 1
LAYER 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2541 samples, validate on 509 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
LAYER 0
LAYER 1
LAYER 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2545 samples, validate on 505 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [15]:
# Per-fold metrics on the validation frames, keyed 'Val_<fold index>'.
validation_metrics

{'Val_0': {'roc_auc': 0.8744913806491691,
  'tn': 203,
  'fp': 71,
  'fn': 35,
  'tp': 200,
  'map': 0.858605979880733,
  'precision': 0.7380073800738007,
  'recall': 0.851063829787234,
  'accuracy': 0.7917485265225933},
 'Val_1': {'roc_auc': 0.9031126247425948,
  'tn': 241,
  'fp': 54,
  'fn': 39,
  'tp': 175,
  'map': 0.8609554237843965,
  'precision': 0.7641921397379913,
  'recall': 0.8177570093457944,
  'accuracy': 0.8172888015717092},
 'Val_2': {'roc_auc': 0.8586200500788278,
  'tn': 201,
  'fp': 62,
  'fn': 51,
  'tp': 195,
  'map': 0.8538456995808472,
  'precision': 0.7587548638132295,
  'recall': 0.7926829268292683,
  'accuracy': 0.7779960707269156},
 'Val_3': {'roc_auc': 0.8554640417838488,
  'tn': 199,
  'fp': 63,
  'fn': 47,
  'tp': 200,
  'map': 0.8186819243729625,
  'precision': 0.7604562737642585,
  'recall': 0.8097165991902834,
  'accuracy': 0.7838899803536346},
 'Val_4': {'roc_auc': 0.90339170880939,
  'tn': 184,
  'fp': 44,
  'fn': 50,
  'tp': 231,
  'map': 0.910058199

In [17]:
# Per-fold metrics on the training frames, keyed 'Train_<fold index>'.
training_metrics

{'Train_0': {'roc_auc': 0.9469330882662786,
  'tn': 1087,
  'fp': 205,
  'fn': 132,
  'tp': 1117,
  'map': 0.939930309529514,
  'precision': 0.8449319213313162,
  'recall': 0.8943154523618895,
  'accuracy': 0.867375049193231},
 'Train_1': {'roc_auc': 0.9410852016825986,
  'tn': 1080,
  'fp': 191,
  'fn': 143,
  'tp': 1127,
  'map': 0.9325836947433694,
  'precision': 0.855083459787557,
  'recall': 0.8874015748031496,
  'accuracy': 0.8685556867375049},
 'Train_2': {'roc_auc': 0.9450134956363901,
  'tn': 1108,
  'fp': 195,
  'fn': 134,
  'tp': 1104,
  'map': 0.9376158463050164,
  'precision': 0.8498845265588915,
  'recall': 0.8917609046849758,
  'accuracy': 0.8705234159779615},
 'Train_3': {'roc_auc': 0.9451702615173262,
  'tn': 1114,
  'fp': 190,
  'fn': 137,
  'tp': 1100,
  'map': 0.9358870527258855,
  'precision': 0.8527131782945736,
  'recall': 0.889248181083266,
  'accuracy': 0.8713105076741441},
 'Train_4': {'roc_auc': 0.9427514919726095,
  'tn': 1145,
  'fp': 193,
  'fn': 150,
  't