In [1]:
import pandas as pd
from keras.callbacks import History, ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
import os
import numpy as np
from data_analysis import calculate_metrics, load_weights_and_evaluate
from model_builders import GCN_siam_model
from hyper_mining import XGB_predictor
import pickle
import dill

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [19]:
# Kinase target; it selects the dataset folder, the fold files and the triplets.
target = 'pi3k'
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
base_path = 'C:/Users/tomas/Documents/GitHub/kinase_binding'
data_dir = f'{base_path}/data/{target}'
# The triplets folder is derived from `target` as well, so the triplets can
# never silently belong to a different kinase than the molecules in `df`.
triplets_dir = f'../../../../Desktop/binding/Triplets/{target}'

data_fpath = f'{data_dir}/data.csv'
df = pd.read_csv(data_fpath).set_index('biolab_index')

# NOTE(review): dill.load can execute arbitrary code embedded in the file;
# only load fold files that were produced by this project.
with open(f'{data_dir}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds = dill.load(in_f)

with open(f'{data_dir}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds = dill.load(in_f)

# Final train/test evaluation: a single split taken from `train_test_folds`.
# To cross-validate instead, build the lists from
# df.loc[train_val_folds[k][0]] / df.loc[train_val_folds[k][1]] for k in 0..5,
# with the per-fold triplets (presumably f'{triplets_dir}/fold_{k}/triplets_train.csv'
# - TODO confirm that layout exists for this target).
train_sets = [df.loc[train_test_folds[0]]]
val_sets = [df.loc[train_test_folds[1]]]
triplets_sets = [pd.read_csv(f'{triplets_dir}/Test/triplets_train.csv', index_col=0)]

# The training loop indexes the three lists in lockstep.
assert len(train_sets) == len(val_sets) == len(triplets_sets), \
    "train, validation and triplet sets must describe the same folds"

In [20]:
# --- Keras callbacks --------------------------------------------------------
# Both pairs watch the training loss. `es`/`rlr` are handed to the model
# builder through `model_params`; `es2`/`rlr2` are passed to the siamese fit.
es = EarlyStopping(monitor='loss', patience=8, min_delta=0)
rlr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=4, verbose=1, min_lr=1e-7)
es2 = EarlyStopping(monitor='loss', patience=8, min_delta=0)
rlr2 = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=2, verbose=1, min_lr=1e-7)

# --- Graph-convolution / siamese network hyper-parameters ------------------
model_params = {
    # molecular graph featurisation
    "num_layers": 3,
    "max_atoms": 70,
    "num_atom_features": 62,
    "num_atom_features_original": 62,
    "num_bond_features": 6,
    "max_degree": 5,
    # per-layer widths
    "conv_width": [96, 104, 120],
    "fp_length": [160, 160, 160],
    "activ_enc": "selu",
    "activ_dec": "selu",
    # per-layer learning rates
    "learning_rates": [0.001, 0.001, 0.001],
    "learning_rates_fp": [0.005, 0.005, 0.005],
    # losses and metrics
    "losses_conv": {
        "neighbor_output": "mean_squared_error",
        "self_output": "mean_squared_error",
    },
    "lossWeights": {"neighbor_output": 1.0, "self_output": 1.0},
    "metrics": "mse",
    "loss_fp": "mean_squared_error",
    "enc_layer_names": ["enc_1", "enc_2", "enc_3"],
    'callbacks': [es, rlr],
    # optimiser / regularisation
    'adam_decay': 0.0005329142291371636,
    'beta': 5,
    'p': 0.004465204118126482,
    'dense_size': [256, 256, 256],
    'dropout_rate': [0.354, 0.354],
    'lr': 0.0005,
    'batch_size': 64,
    'n_epochs': 35,
    # triplet margin
    'margin': 0.2,
}

# --- XGBoost classifier hyper-parameters -----------------------------------
xgb_params = {
    "colsample_bylevel": 0.5612301667238877,
    "colsample_bytree": 0.788688363076523,
    "gamma": 0.35376030016117566,
    "eta": 0.4023692255888918,
    "max_delta_step": 3,
    "max_depth": 8,
    "min_child_weight": 70,
    "alpha": 0.15030685758880047,
    "lambda": 15.311721955443915,
    "subsample": 0.8303923929525608,
    "eval_metric": 'auc',
    "objective": 'binary:logistic',
    "booster": 'gbtree',
}

# Project wrappers configured with the dictionaries above.
class_XGB = XGB_predictor(xgb_params)
gcn = GCN_siam_model(model_params)

# Metric containers filled by the training loop.
val_metrics = {}
train_metrics = {}

In [21]:
# For every configured split: train the siamese GCN on the triplets, embed the
# train/validation molecules with the trained sub-model, fit XGBoost on the
# embeddings and record classification metrics for both sets.
for i in range(len(train_sets)):
    # A single train/test split keeps the historical 'Test' label; with several
    # folds every fold gets its own key so results no longer overwrite each other.
    fold_key = 'Test' if len(train_sets) == 1 else f'fold_{i}'

    # Anchor / positive / negative molecules of each triplet as GCN tensors.
    anchor_atoms, anchor_bonds, anchor_edges = gcn.dataframe_to_gcn_input(triplets_sets[i]["A"])
    pos_atoms, pos_bonds, pos_edges = gcn.dataframe_to_gcn_input(triplets_sets[i]["P"])
    neg_atoms, neg_bonds, neg_edges = gcn.dataframe_to_gcn_input(triplets_sets[i]["N"])

    # Fresh networks per split so no weights leak between folds.
    gcn_encoder = gcn.build_encoder()
    gcn_model = gcn.build_model(gcn_encoder)
    siamese = gcn.build_siam(gcn_model)

    # Keras requires a target array. Its width (768) matches the siamese output
    # shape printed by the model builder. Zeros are used instead of np.empty:
    # uninitialised memory is non-deterministic and may contain NaN/inf.
    Y_dummy = np.zeros((anchor_atoms.shape[0], 768))
    siamese.fit([anchor_atoms, anchor_bonds, anchor_edges,
                 pos_atoms, pos_bonds, pos_edges,
                 neg_atoms, neg_bonds, neg_edges], Y_dummy,
                batch_size=256,
                epochs=5,
                verbose=2,
                shuffle=True,
                validation_data=None,
                callbacks=[es2, rlr2])

    # Embed the validation molecules.
    Y_val = val_sets[i].Binary
    val_atoms, val_bonds, val_edges = gcn.dataframe_to_gcn_input(val_sets[i]["rdkit"])
    emb_val = gcn_model.predict([val_atoms, val_bonds, val_edges])

    # Embed the training molecules.
    Y = train_sets[i].Binary
    train_atoms, train_bonds, train_edges = gcn.dataframe_to_gcn_input(train_sets[i]["rdkit"])
    emb_train = gcn_model.predict([train_atoms, train_bonds, train_edges])

    # XGBoost on top of the learned embeddings.
    dmatrix_train = class_XGB.to_xgb_input(Y, emb_train)
    dmatrix_cold = class_XGB.to_xgb_input(Y_val, emb_val)

    evalist = [(dmatrix_train, 'train'), (dmatrix_cold, 'eval')]
    xgb_model = class_XGB.build_model(dmatrix_train, evalist, 300)
    xgb_pred_cold = xgb_model.predict(dmatrix_cold)
    xgb_pred_train = xgb_model.predict(dmatrix_train)

    val_metrics[fold_key] = calculate_metrics(np.array(Y_val), xgb_pred_cold)
    train_metrics[fold_key] = calculate_metrics(np.array(Y), xgb_pred_train)

    # Release the large per-split objects before the next iteration.
    del gcn_encoder, gcn_model, siamese, anchor_atoms, anchor_bonds, anchor_edges, pos_atoms, pos_bonds, pos_edges
    del neg_atoms, neg_bonds, neg_edges, val_atoms, val_bonds, val_edges, train_atoms, train_bonds, train_edges
    del emb_val, emb_train, dmatrix_train, dmatrix_cold, xgb_model, xgb_pred_cold, xgb_pred_train, evalist

LAYER 0
LAYER 1
LAYER 2
y_pred.shape =  Tensor("merged_layer_3/concat:0", shape=(?, 768), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 26s - loss: 0.5017
Epoch 2/5
 - 16s - loss: 0.4994
Epoch 3/5
 - 16s - loss: 0.4993
Epoch 4/5
 - 16s - loss: 0.5003

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 5/5
 - 16s - loss: 0.4975


In [22]:
# Training-set metrics, one row per evaluated split.
pd.DataFrame(train_metrics).transpose()

Unnamed: 0,accuracy,fn,fp,map,precision,recall,roc_auc,tn,tp
Test,0.943115,122.0,61.0,0.981899,0.94597,0.897479,0.988912,1966.0,1068.0


In [23]:
# Validation-set metrics, one row per evaluated split.
pd.DataFrame(val_metrics).transpose()

Unnamed: 0,accuracy,fn,fp,map,precision,recall,roc_auc,tn,tp
Test,0.791434,65.0,47.0,0.707148,0.704403,0.632768,0.846281,313.0,112.0


In [24]:
# Rebind the metrics dict as a table (rows = splits, columns = metrics).
# NOTE(review): run this cell only once per training run - a second run would
# transpose the already-converted table.
val_metrics = pd.DataFrame(val_metrics).transpose()

In [25]:
# Write the validation metrics table to the results folder as CSV.
results_csv = '../../../../Desktop/binding/thesis english/Results/3-One-Shot/Offline/pi3k.csv'
val_metrics.to_csv(results_csv)