In [None]:
import pandas as pd
from keras.callbacks import History, ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
import os
import dill
import numpy as np
from data_analysis import calculate_metrics, load_weights_and_evaluate
from model_builders import GCN_pretraining

# Parameters for the GCN model with encoder pretraining

In [None]:
# Training callbacks. Both watch the training loss ('loss'):
#   - early stopping after 8 epochs without improvement,
#   - halving of the learning rate after 4 stagnant epochs (floor 1e-7).
es = EarlyStopping(monitor='loss', patience=8, min_delta=0)
rlr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=4,
    verbose=1,
    min_lr=1e-7,
)

# Settings handed to GCN_pretraining as its first argument.
# NOTE(review): the meaning of each key is defined in model_builders, which is
# not visible from this notebook; the values below are reproduced unchanged.
encoder_params = dict([
    ("num_layers", 3),
    ("max_atoms", 70),
    ("num_atom_features", 62),
    ("num_atom_features_original", 62),
    ("num_bond_features", 6),
    ("max_degree", 5),
    ("conv_width", [96, 104, 120]),
    ("fp_length", [160, 160, 160]),
    ("activ_enc", "selu"),
    ("activ_dec", "selu"),
    ("learning_rates", [0.001, 0.001, 0.001]),
    ("learning_rates_fp", [0.005, 0.005, 0.005]),
    ("losses_conv", {
        "neighbor_output": "mean_squared_error",
        "self_output": "mean_squared_error",
    }),
    ("lossWeights", {"neighbor_output": 1.0, "self_output": 1.0}),
    ("metrics", "mse"),
    ("loss_fp", "mean_squared_error"),
    ("enc_layer_names", ["enc_1", "enc_2", "enc_3"]),
    ("callbacks", [es, rlr]),
    ("adam_decay", 0.0005329142291371636),
    ("beta", 5),
    ("p", 0.004465204118126482),
])

# Settings handed to GCN_pretraining as its second argument.
model_params = dict([
    ("dense_size", [256, 192, 96]),
    ("dropout_rate", [0.354, 0.354]),
    ("lr", 0.007),
])

# Load train, val, test data

In [None]:
# Name of the target; it selects the data/<target> and results/<target>
# sub-folders used by the path-building cells of this notebook.
target = 'p38'

In [None]:
# Project root. Defaults to the original location, but can be overridden with
# the KINASE_BINDING_DIR environment variable so the notebook also runs on
# machines where the project lives elsewhere.
base_path = os.environ.get('KINASE_BINDING_DIR', 'C:/Users/user/Documents/kinase_binding')

# Full data table, indexed by 'biolab_index' so that the fold files (which are
# used below as row labels for df.loc) can select rows from it.
data_fpath = os.path.join(base_path, f'data/{target}/data.csv')
df = pd.read_csv(data_fpath).set_index('biolab_index')

# SECURITY NOTE: dill.load executes arbitrary code embedded in the file.
# Only load fold files that were produced by this project.
with open(os.path.join(base_path, f'data/{target}/train_val_folds.pkl'), "rb") as in_f:
    # Indexed in this notebook as train_val_folds[i][0] (train labels) and
    # train_val_folds[i][1] (validation labels) for fold i.
    train_val_folds = dill.load(in_f)
with open(os.path.join(base_path, f'data/{target}/train_test_folds.pkl'), "rb") as in_f:
    # train_test_folds[1] is used in this notebook as the test-set row labels.
    train_test_folds = dill.load(in_f)

In [None]:
# Name of this experiment; it is the results/<target>/<model_name> sub-folder
# where weights and performance metrics are written and read.
model_name = 'gcn_pretraining_ensemble_optimized'

In [None]:
# no need for manual changes in this cell
# Template for the weight files: the two remaining '{}' placeholders are
# filled later with .format(<fold index>, <model index>).
_weights_relpath = f'results/{target}/{model_name}/fold_{{}}/model_weights/model_{{}}.h5'
weight_files = os.path.join(base_path, _weights_relpath)

# Cross validation and test predictions

In [None]:
# Number of models trained per fold; their predictions are averaged to form
# the fold's ensemble prediction.
n_ensemble = 5

In [None]:
# Train `n_ensemble` models on each of the 6 train/validation folds.
# For every fold the member predictions are averaged, giving one ensemble
# prediction for the fold's validation set and one for the shared test set.
# Each member's weights are saved to weight_files.format(fold, member).
val_preds = []
test_preds = []
gcn = GCN_pretraining(encoder_params, model_params)

# The test set is the same for every fold, so it is featurised once up front.
X_atoms_test, X_bonds_test, X_edges_test = gcn.dataframe_to_gcn_input(df.loc[train_test_folds[1]])

for i in range(6):
    train_set = df.loc[train_val_folds[i][0]]
    val_set = df.loc[train_val_folds[i][1]]

    en_val_preds = []
    en_test_preds = []
    X_atoms_cold, X_bonds_cold, X_edges_cold = gcn.dataframe_to_gcn_input(val_set)
    Y_cold = val_set.Binary
    X_atoms_train, X_bonds_train, X_edges_train = gcn.dataframe_to_gcn_input(train_set)
    Y = train_set.Binary

    for j in range(n_ensemble):
        # A fresh encoder and model per member, so members are trained independently.
        gcn_encoder = gcn.build_encoder()
        gcn_model = gcn.build_model(gcn_encoder)
        # NOTE(review): no `callbacks` are passed here; the ones stored in
        # encoder_params are presumably consumed inside GCN_pretraining - confirm.
        gcn_model.fit([X_atoms_train, X_bonds_train, X_edges_train], Y,
                      batch_size=64,
                      epochs=35,
                      verbose=2,
                      shuffle=True,
                      validation_data=([X_atoms_cold, X_bonds_cold, X_edges_cold], Y_cold))
        y_pred_val = gcn_model.predict([X_atoms_cold, X_bonds_cold, X_edges_cold])
        y_pred_test = gcn_model.predict([X_atoms_test, X_bonds_test, X_edges_test])
        en_val_preds.append(y_pred_val)
        en_test_preds.append(y_pred_test)

        # Use the shared template so training and evaluation agree on the path,
        # and create the folder first: save_weights does not create it.
        weight_path = weight_files.format(i, j)
        os.makedirs(os.path.dirname(weight_path), exist_ok=True)
        gcn_model.save_weights(weight_path)

    # Ensemble prediction of this fold = mean over its members.
    val_preds.append(np.mean(en_val_preds, axis=0))
    test_preds.append(np.mean(en_test_preds, axis=0))

# Calculate metrics from predictions

In [None]:
# Build a metrics table: one row per validation fold, plus a final row for
# the test set (scored on the mean of the per-fold test predictions).
dfs = []
for i, y_pred in enumerate(val_preds):
    print(f'fold {i}\n')
    y_true = df.loc[train_val_folds[i][1]].Binary
    dfs.append(calculate_metrics(y_true.values, y_pred.squeeze(), plots=True))

ave_preds = np.mean(test_preds, axis=0)
# train_test_folds[1] holds row labels (it is used with df.loc elsewhere),
# so the true labels have to be looked up in df.
test_true = df.loc[train_test_folds[1]].Binary
dfs.append(calculate_metrics(test_true.values, ave_preds.squeeze(), plots=True))

metrics = pd.DataFrame(dfs)
# The test-set row is the last one appended; label it by position instead of
# a hard-coded number so the label is applied whatever the number of folds.
# NOTE(review): assumes pd.DataFrame(dfs) yields a default 0..n-1 index - confirm.
metrics = metrics.rename(index={len(dfs) - 1: 'test_set'})
metrics

# Calculate metrics from trained weights

In [None]:
# Build an encoder and a model with the same parameters as used for training.
# This model instance is reused by the cells below: saved weights are loaded
# into it before predicting.
gcn = GCN_pretraining(encoder_params,model_params)
gcn_encoder = gcn.build_encoder()
gcn_model = gcn.build_model(gcn_encoder)

In [None]:
# Score a single saved ensemble member per fold: load its weights, predict
# the fold's validation set and the test set, and collect the metrics.
# The test-set row is scored on the mean of the per-fold test predictions.

# Index of the ensemble member to evaluate. Training writes model_0 ..
# model_{n_ensemble-1} for every fold, so member 0 always exists (using the
# fold index as member index would request a file that is never written
# once the fold index reaches n_ensemble).
member_idx = 0

preds_val = []
preds_test = []
dfs = []
test_set = df.loc[train_test_folds[1]]
# The test set does not change between folds, so featurise it once.
test_data = gcn.dataframe_to_gcn_input(test_set)
for i in range(6):
    df_val = df.loc[train_val_folds[i][1]]
    y_true = df_val.Binary
    val_data = gcn.dataframe_to_gcn_input(df_val)
    gcn_model.load_weights(weight_files.format(i, member_idx))
    pred_val = gcn_model.predict(val_data, batch_size=1024)
    preds_val.append(pred_val)
    preds_test.append(gcn_model.predict(test_data, batch_size=1024))
    dfs.append(calculate_metrics(y_true.values, pred_val.squeeze(), plots=True))

ave_preds = np.mean(preds_test, axis=0)
dfs.append(calculate_metrics(test_set.Binary.values, ave_preds.squeeze(), plots=True))

metrics = pd.DataFrame(dfs)
# The test-set row is the last one appended; label it by position.
# NOTE(review): assumes pd.DataFrame(dfs) yields a default 0..n-1 index - confirm.
metrics = metrics.rename(index={len(dfs) - 1: 'test_set'})
metrics

In [None]:
# Save the metrics table. The output folder is created first because
# DataFrame.to_csv does not create missing directories.
metrics_path = os.path.join(base_path, f'results/{target}/{model_name}/performance_metrics/performance_0.csv')
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
metrics.to_csv(metrics_path)

# Evaluate ensemble from weights

In [None]:
# Inputs for load_weights_and_evaluate: the validation frame of each of the
# 6 folds, the test frame, the model (and its builder object) into which the
# weights are loaded, and the weight-file template.
val_sets = [df.loc[train_val_folds[i][1]] for i in range(6)]
test_set = df.loc[train_test_folds[1]]
eval_params = {
    'val_sets': val_sets,
    'test_set': test_set,
    'model_class': gcn,
    'model': gcn_model,
    'model_type': 'gcn',
    'weight_file_format': weight_files,
    # Must match the number of members actually trained and saved per fold.
    # NOTE(review): presumably load_weights_and_evaluate loads
    # model_0 .. model_{n_ensemble-1} for every fold - confirm in data_analysis.
    'n_ensemble': int(n_ensemble),
}

In [None]:
# Evaluate the saved ensemble using the settings collected in eval_params.
# NOTE(review): the structure of the returned object is defined in
# data_analysis, which is not visible here.
metrics = load_weights_and_evaluate(eval_params)

In [None]:
# Display the result of the ensemble evaluation as the cell output.
metrics