In [1]:
import pandas as pd
from keras.callbacks import History, ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
import os
import numpy as np
from data_analysis import calculate_metrics, load_weights_and_evaluate
from model_builders import GCN_siam_model
from hyper_mining import XGB_predictor
import pickle
import dill

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the labelled dataset, the pre-computed CV / test splits and the
# per-split triplet files used to train the siamese network.
target = 'pi3k'
# NOTE(review): absolute, machine-specific path; the notebook only runs as-is
# on a machine with this exact directory layout.
base_path = 'C:/Users/tomas/Documents/GitHub/kinase_binding'
data_dir = base_path+f'/data/{target}'

data_fpath = data_dir+'/data.csv'
df = pd.read_csv(data_fpath).set_index('biolab_index')

# NOTE(review): dill.load (like pickle) can execute arbitrary code while
# deserialising; only load fold files that come from a trusted source.
with open(data_dir+'/train_val_folds.pkl', "rb") as in_f:
    train_val_folds = dill.load(in_f)

with open(data_dir+'/train_test_folds.pkl', "rb") as in_f:
    train_test_folds = dill.load(in_f)

# Number of cross-validation folds read from train_val_folds; the train/test
# split is appended after them, so every list below has n_folds + 1 entries.
n_folds = 6

# train_val_folds[k] is indexed as (train labels, validation labels) and
# train_test_folds as (train labels, test labels); each is used to select
# rows of df by its 'biolab_index' index.
train_sets = [df.loc[train_val_folds[k][0]] for k in range(n_folds)]
train_sets.append(df.loc[train_test_folds[0]])

val_sets = [df.loc[train_val_folds[k][1]] for k in range(n_folds)]
val_sets.append(df.loc[train_test_folds[1]])

# NOTE(review): the triplets are read from a 'p38' directory while target is
# 'pi3k' — confirm this mismatch is intentional and not a copy-paste leftover.
triplets_dir = '../../../../Desktop/binding/Triplets/p38'
triplets_subdirs = [f'fold_{k}' for k in range(n_folds)] + ['Test']
triplets_sets = [pd.read_csv(f'{triplets_dir}/{subdir}/triplets_train.csv', index_col=0)
                 for subdir in triplets_subdirs]

In [3]:
# --- Keras callbacks -------------------------------------------------------
# Both pairs watch the training loss. (es, rlr) is handed to the GCN builder
# through model_params['callbacks']; (es2, rlr2) is passed directly to
# siamese.fit in the training cell.
es = EarlyStopping(monitor='loss', min_delta=0, patience=8)
rlr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=4, min_lr=1e-7, verbose=1)
es2 = EarlyStopping(monitor='loss', min_delta=0, patience=8)
rlr2 = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=2, min_lr=1e-7, verbose=1)

# --- Hyper-parameters for the siamese GCN builder --------------------------
model_params = {
    # graph input dimensions and encoder depth
    "num_layers": 3,
    "max_atoms": 70,
    "num_atom_features": 62,
    "num_atom_features_original": 62,
    "num_bond_features": 6,
    "max_degree": 5,
    # per-layer widths (one entry per layer)
    "conv_width": [96, 104, 120],
    "fp_length": [160] * 3,
    "activ_enc": "selu",
    "activ_dec": "selu",
    "learning_rates": [0.001] * 3,
    "learning_rates_fp": [0.005] * 3,
    # losses / metrics
    "losses_conv": {
        "neighbor_output": "mean_squared_error",
        "self_output": "mean_squared_error",
    },
    "lossWeights": {"neighbor_output": 1.0, "self_output": 1.0},
    "metrics": "mse",
    "loss_fp": "mean_squared_error",
    "enc_layer_names": ["enc_1", "enc_2", "enc_3"],
    "callbacks": [es, rlr],
    # optimiser / regularisation settings
    "adam_decay": 0.0005329142291371636,
    "beta": 5,
    "p": 0.004465204118126482,
    # dense head
    "dense_size": [256] * 3,
    "dropout_rate": [0.354] * 2,
    "lr": 0.0005,
    "batch_size": 64,
    "n_epochs": 35,
    "margin": 0.2,
}

# --- Parameters for the XGBoost classifier trained on the embeddings -------
xgb_params = {
    "colsample_bylevel": 0.5612301667238877,
    "colsample_bytree": 0.788688363076523,
    "gamma": 0.35376030016117566,
    "eta": 0.4023692255888918,
    "max_delta_step": 3,
    "max_depth": 8,
    "min_child_weight": 70,
    "alpha": 0.15030685758880047,
    "lambda": 15.311721955443915,
    "subsample": 0.8303923929525608,
    "eval_metric": 'auc',
    "objective": 'binary:logistic',
    "booster": 'gbtree',
}

# Wrapper objects shared by the later cells, plus the per-split result stores.
class_XGB = XGB_predictor(xgb_params)
gcn = GCN_siam_model(model_params)
val_metrics = dict()
train_metrics = dict()

In [4]:
# Build the model stack once so its architecture can be inspected below:
# the encoder is wrapped into `gcn_model` (later called on
# [atoms, bonds, edges] inputs to get embeddings), which in turn is wrapped
# into `siamese` (later fitted on anchor / positive / negative inputs).
gcn_encoder = gcn.build_encoder()
gcn_model = gcn.build_model(gcn_encoder)
siamese = gcn.build_siam(gcn_model)

LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer/concat:0", shape=(?, 768), dtype=float32)


In [5]:
# Print the layer table of the single-molecule model as a sanity check.
gcn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
atom_inputs (InputLayer)        (None, 70, 62)       0                                            
__________________________________________________________________________________________________
bond_inputs (InputLayer)        (None, 70, 5, 6)     0                                            
__________________________________________________________________________________________________
edge_inputs (InputLayer)        (None, 70, 5)        0                                            
__________________________________________________________________________________________________
model_10 (Model)                (None, 160)          241696      atom_inputs[0][0]                
                                                                 bond_inputs[0][0]                
          

In [6]:
# Print the layer table of the triplet (anchor / positive / negative) model.
siamese.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
atom_inputs_anchor (InputLayer) (None, 70, 62)       0                                            
__________________________________________________________________________________________________
bond_inputs_anchor (InputLayer) (None, 70, 5, 6)     0                                            
__________________________________________________________________________________________________
edge_inputs_anchor (InputLayer) (None, 70, 5)        0                                            
__________________________________________________________________________________________________
atom_inputs_pos (InputLayer)    (None, 70, 62)       0                                            
__________________________________________________________________________________________________
bond_input

In [4]:
# Per-split pipeline:
#   1. train a fresh siamese GCN on the split's (A, P, N) triplets,
#   2. embed the split's train / validation molecules with gcn_model,
#   3. fit an XGBoost classifier on the embeddings,
#   4. store train / validation metrics under 'Fold_<i>' (or 'Test' for the
#      last split, which is the train/test split).
assert len(triplets_sets) == len(train_sets) == len(val_sets), \
    "expected one triplet file per train/validation split"
last_split = len(train_sets) - 1

for i, (triplets, train_df, val_df) in enumerate(zip(triplets_sets, train_sets, val_sets)):
    anchor_atoms, anchor_bonds, anchor_edges = gcn.dataframe_to_gcn_input(triplets["A"])
    pos_atoms, pos_bonds, pos_edges = gcn.dataframe_to_gcn_input(triplets["P"])
    neg_atoms, neg_bonds, neg_edges = gcn.dataframe_to_gcn_input(triplets["N"])

    # Rebuild the networks so that no weights carry over between splits.
    gcn_encoder = gcn.build_encoder()
    gcn_model = gcn.build_model(gcn_encoder)
    siamese = gcn.build_siam(gcn_model)

    # Keras requires a target array even though the triplets carry no labels.
    # Zeros are used instead of np.empty: uninitialised memory is
    # non-deterministic and may contain NaN/inf.
    # The width 768 matches the siamese output printed by the builder
    # ("merged_layer/concat", shape (?, 768)).
    # NOTE(review): presumably the triplet loss ignores y_true — confirm in model_builders.
    Y_dummy = np.zeros((anchor_atoms.shape[0], 768))
    siamese.fit([anchor_atoms, anchor_bonds, anchor_edges,
                 pos_atoms, pos_bonds, pos_edges,
                 neg_atoms, neg_bonds, neg_edges], Y_dummy,
                batch_size=256,
                epochs=5,
                verbose=2,
                shuffle=True,
                validation_data=None,
                callbacks=[es2, rlr2])

    # Embed the held-out molecules with the shared single-molecule model.
    Y_val = val_df["Binary"]
    val_atoms, val_bonds, val_edges = gcn.dataframe_to_gcn_input(val_df["rdkit"])
    emb_val = gcn_model.predict([val_atoms, val_bonds, val_edges])

    # Embed the training molecules the same way.
    Y = train_df["Binary"]
    train_atoms, train_bonds, train_edges = gcn.dataframe_to_gcn_input(train_df["rdkit"])
    emb_train = gcn_model.predict([train_atoms, train_bonds, train_edges])

    dmatrix_train = class_XGB.to_xgb_input(Y, emb_train)
    dmatrix_cold = class_XGB.to_xgb_input(Y_val, emb_val)

    # Classifier on top of the learned embeddings.
    evalist = [(dmatrix_train, 'train'), (dmatrix_cold, 'eval')]
    xgb_model = class_XGB.build_model(dmatrix_train, evalist, 300)
    xgb_pred_cold = xgb_model.predict(dmatrix_cold)
    xgb_pred_train = xgb_model.predict(dmatrix_train)

    # The last split is the train/test split; all earlier ones are CV folds.
    split_name = 'Test' if i == last_split else 'Fold_%s' % i
    val_metrics[split_name] = calculate_metrics(np.array(Y_val), xgb_pred_cold)
    train_metrics[split_name] = calculate_metrics(np.array(Y), xgb_pred_train)

    # Drop the large per-split objects before the next split allocates its own.
    del gcn_encoder, gcn_model, siamese, anchor_atoms, anchor_bonds, anchor_edges, pos_atoms, pos_bonds, pos_edges
    del neg_atoms, neg_bonds, neg_edges, val_atoms, val_bonds, val_edges, train_atoms, train_bonds, train_edges
    del emb_val, emb_train, dmatrix_train, dmatrix_cold, xgb_model, xgb_pred_cold, xgb_pred_train, evalist

LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 21s - loss: 0.5009
Epoch 2/5
 - 12s - loss: 0.5001
Epoch 3/5
 - 12s - loss: 0.4998
Epoch 4/5
 - 12s - loss: 0.5003
Epoch 5/5
 - 12s - loss: 0.4991
LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer_1/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 20s - loss: 0.5006
Epoch 2/5
 - 12s - loss: 0.5012
Epoch 3/5
 - 12s - loss: 0.4992
Epoch 4/5
 - 12s - loss: 0.4989
Epoch 5/5
 - 12s - loss: 0.5014
LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer_2/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 21s - loss: 0.5007
Epoch 2/5
 - 12s - loss: 0.4994
Epoch 3/5
 - 12s - loss: 0.4993
Epoch 4/5
 - 12s - loss: 0.5003

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 5/5
 - 12s - loss: 0.4995
LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer_3/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 23s - loss: 0.5022
Epoch 2/5
 - 12s - loss: 0.5011
Epoch 3/5
 - 12s - loss: 0.5008
Epoch 4/5
 - 12s - loss: 0.4989
Epoch 5/5
 - 12s - loss: 0.4986
LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer_4/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 24s - loss: 0.5011
Epoch 2/5
 - 12s - loss: 0.5010
Epoch 3/5
 - 12s - loss: 0.4995
Epoch 4/5
 - 12s - loss: 0.5001
Epoch 5/5
 - 12s - loss: 0.5003

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer_5/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 26s - loss: 0.5022
Epoch 2/5
 - 12s - loss: 0.4995
Epoch 3/5
 - 12s - loss: 0.4995
Epoch 4/5
 - 12s - loss: 0.5008

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 5/5
 - 12s - loss: 0.4999
LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer_6/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 31s - loss: 0.5016
Epoch 2/5
 - 15s - loss: 0.4989
Epoch 3/5
 - 15s - loss: 0.4996
Epoch 4/5
 - 15s - loss: 0.4998

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 5/5
 - 16s - loss: 0.4997


In [5]:
pd.DataFrame(train_metrics).T

Unnamed: 0,roc_auc,tn,fp,fn,tp,map,precision,recall,accuracy
Fold_0,0.985606,1639.0,59.0,117.0,865.0,0.976783,0.936147,0.880855,0.934328
Fold_1,0.987879,1614.0,46.0,100.0,920.0,0.981513,0.952381,0.901961,0.945522
Fold_2,0.987779,1648.0,40.0,110.0,882.0,0.982071,0.956616,0.889113,0.94403
Fold_3,0.986052,1606.0,47.0,111.0,916.0,0.979248,0.951194,0.891918,0.941045
Fold_4,0.984481,1633.0,52.0,121.0,874.0,0.977128,0.943844,0.878392,0.935448
Fold_5,0.985932,1709.0,42.0,123.0,811.0,0.975712,0.950762,0.868308,0.938547
Test,0.988069,1969.0,58.0,134.0,1056.0,0.981317,0.947935,0.887395,0.940317


In [6]:
pd.DataFrame(val_metrics).T

Unnamed: 0,roc_auc,tn,fp,fn,tp,map,precision,recall,accuracy
Fold_0,0.688611,274.0,55.0,126.0,82.0,0.609218,0.59854,0.394231,0.662942
Fold_1,0.800737,317.0,50.0,79.0,91.0,0.694,0.64539,0.535294,0.759777
Fold_2,0.796013,289.0,50.0,90.0,108.0,0.723568,0.683544,0.545455,0.739292
Fold_3,0.783029,298.0,76.0,65.0,98.0,0.620218,0.563218,0.601227,0.73743
Fold_4,0.830679,308.0,34.0,88.0,107.0,0.757691,0.758865,0.548718,0.772812
Fold_5,0.816548,221.0,55.0,82.0,174.0,0.785769,0.759825,0.679688,0.742481
Test,0.79129,307.0,53.0,82.0,95.0,0.674192,0.641892,0.536723,0.748603
