# Training gradient boosting model for enzyme-substrate pair prediction with ESM-1b-vectors

### 1. Loading and preprocessing data for model training and evaluation
### 2. Hyperparameter optimization using a 5-fold cross-validation (CV)
### 3. Training and validating the final model

In [8]:
import pandas as pd
import numpy as np
import random
import pickle
import sys
import os
import logging
from os.path import join
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, Trials, rand
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef



# Make the helper modules in "additional_code" importable.
# NOTE(review): the path is relative to the working directory and uses a
# Windows-style separator — confirm before running on another platform.
sys.path.append('.\\additional_code')
#from data_preprocessing import *

# All data paths below are built relative to the directory the notebook is
# started from.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\projects\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data for model training and evaluation

In [2]:
def array_column_to_strings(df, column):
    """Replace every entry of ``df[column]`` by ``str(list(entry))``.

    Array-like entries are not hashable; turning them into strings lets the
    column be used with ``set`` and ``isin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place.
    column : str
        Name of the column whose entries are iterable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # Iterate over the values directly instead of looking each row up by its
    # index label: that is positional, so it also works when index labels
    # are not unique, and it avoids one label lookup per row.
    df[column] = [str(list(entry)) for entry in df[column]]
    return df

def string_column_to_array(df, column):
    """Replace every string in ``df[column]`` by the NumPy array it describes.

    Each entry is evaluated as a Python expression and the result is wrapped
    in ``np.array``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place.
    column : str
        Name of the column holding the string representations.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # NOTE(security): eval() executes arbitrary code. Only use this on
    # strings produced by this project, never on data from an untrusted
    # source.
    # Values are iterated positionally, so non-unique index labels are fine.
    df[column] = [np.array(eval(text)) for text in df[column]]
    return df

### (a) Loading data: 
Only keeping data points from the GO Annotation database with experimental evidence

In [3]:
def _load_filtered_split(file_name):
    """Read one pickled split and drop the rows that are not used here.

    Rows with an empty "ESM1b" entry and rows whose "type" is "engqvist" are
    removed; the index is renumbered from 0 afterwards.
    """
    frame = pd.read_pickle(join(CURRENT_DIR, "..", "data", "splits", file_name))
    for column_name, excluded_value in (("ESM1b", ""), ("type", "engqvist")):
        frame = frame.loc[frame[column_name] != excluded_value]
    return frame.reset_index(drop = True)


df_train = _load_filtered_split("df_train_with_ESM1b_ts_promiscous.pkl")
df_test = _load_filtered_split("df_test_with_ESM1b_ts_promiscous.pkl")


  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)


### (b) Splitting training set into 5-folds for hyperparameter optimization:
The 5 folds are created in such a way that the same enzyme does not occur in two different folds

In [4]:
def _add_with_same_enzyme(df, indices, ind):
    """Return ``indices`` plus ``ind`` and every row sharing an "ESM1b" value.

    The list is grown until it is closed under "has the same ESM1b entry as
    a row already in the list".
    """
    n_old = len(indices)
    indices = list(set(indices + [ind]))

    # Repeat until no new rows are pulled in by the shared-enzyme rule.
    while n_old != len(indices):
        n_old = len(indices)
        seqs = list(set(df["ESM1b"].loc[indices]))
        same_enzyme_rows = list(df.loc[df["ESM1b"].isin(seqs)].index)
        indices = list(set(indices + same_enzyme_rows))
    return indices


def split_dataframe(df, frac):
    """Split ``df`` in two parts that share no "ESM1b" value.

    Rows are visited in positional order. A row that is not assigned yet
    goes to the second part when its position is a multiple of ``frac`` and
    to the first part otherwise; all rows with the same "ESM1b" entry follow
    it into the same part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "ESM1b" column with hashable entries (e.g. strings).
        The frame is modified in place: a "level_0" column is dropped if
        present and the index is reset, which stores the previous index in a
        new column ("index", or "level_0" if "index" already exists).
    frac : int
        Every ``frac``-th position seeds the second part, so the second part
        holds roughly ``1 / frac`` of the rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(first_part, second_part)``, both selected from the re-indexed
        ``df``.
    """
    # A "level_0" column is left over from a previous call on the same data
    # and would make reset_index fail; drop it only if it exists.
    df.drop(columns = ["level_0"], inplace = True, errors = "ignore")
    df.reset_index(inplace = True)

    train_indices = []
    test_indices = []
    ind = 0
    while len(train_indices) + len(test_indices) < len(df):
        if ind not in train_indices and ind not in test_indices:
            if ind % frac != 0:
                train_indices = _add_with_same_enzyme(df, train_indices, ind)
            else:
                test_indices = _add_with_same_enzyme(df, test_indices, ind)
        ind += 1
    return df.loc[train_indices], df.loc[test_indices]

In [5]:
# Work on a copy whose "ESM1b" entries are strings, so split_dataframe can
# compare enzymes with set() / isin().
data_train2 = array_column_to_strings(df_train.copy(), column = "ESM1b")

# Peel off one fold at a time. The "index" column written by
# split_dataframe keeps the row positions in df_train.
fold_indices = []
for frac in (5, 4, 3, 2):
    data_train2, df_fold = split_dataframe(df = data_train2, frac = frac)
    fold_indices.append(list(df_fold["index"]))
    print(len(data_train2), len(fold_indices[-1]))

# Whatever is left after the fourth split is the fifth fold.
fold_indices.append(list(data_train2["index"]))
indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5 = fold_indices

# For CV round i: validate on fold i, train on the other four folds.
test_indices = list(fold_indices)
train_indices = [
    [row for j, fold in enumerate(fold_indices) if j != i for row in fold]
    for i in range(5)
]

8933 2183
6793 2140
4579 2214
2025 2554


## 2. Hyperparameter optimization using a 5-fold cross-validation (CV)

### (a) ECFP and ESM1b:

#### (i) Creating numpy arrays with input vectors and output variables

In [6]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the "ECFP" entry is split into its characters and converted
    to an integer vector, which is concatenated with the row's "ESM1b_ts"
    vector (fingerprint first, embedding second).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "ESM1b_ts", "ECFP" and "Binding".

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is a tuple with one concatenated 1-D array per row
        and ``y`` is a tuple with the matching "Binding" values.
    """
    # NOTE(review): the section heading says "ESM1b", but the column read
    # here is "ESM1b_ts" — confirm which embedding is intended.
    X = []
    y = []

    # Collect in lists and convert once at the end: growing a tuple with
    # "+" copies it on every row (quadratic in the number of rows).
    rows = zip(df["ESM1b_ts"], df["ECFP"], df["Binding"].to_numpy())
    for emb, ecfp_string, binding in rows:
        ecfp = np.array(list(ecfp_string)).astype(int)
        X.append(np.concatenate([ecfp, emb]))
        y.append(binding)

    return tuple(X), tuple(y)

# Build the per-row inputs/labels and turn them into NumPy arrays in one go.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# One name per input column: 1024 fingerprint bits followed by 1280
# embedding entries, matching the concatenation order used above.
feature_names = (["ECFP_" + str(i) for i in range(1024)]
                 + ["ESM1b_" + str(i) for i in range(1280)])

#### (ii) Performing hyperparameter optimization

In [9]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Return the mean cross-validation loss of one hyperparameter set.

    Despite its name, the function does not return a negative accuracy. For
    every fold it trains an XGBoost model, rounds the predictions to 0/1 and
    scores them with ``2 * false_negative**2 + false_positive**1.3``; the
    mean of these scores over all folds is returned.

    Parameters
    ----------
    param : dict
        XGBoost parameters plus two extra keys: "num_rounds" (number of
        boosting rounds, truncated to an int) and "weight" (sample weight of
        training rows whose "Binding" value is 0; all other rows get 1.0).
        The dictionary is not modified.

    Returns
    -------
    float
        Mean per-fold loss (lower is better).

    Notes
    -----
    Reads the module-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.
    """
    # Work on a copy: the two extra keys are removed below and fixed
    # settings are added, which must not leak into the caller's dictionary
    # (a second call with the same dict would otherwise raise KeyError).
    param = dict(param)
    num_round = int(param.pop("num_rounds"))
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0
                        for binding in df_train["Binding"]])

    loss = []
    for train_index, test_index in zip(train_indices, test_indices):
        dtrain = xgb.DMatrix(train_X[train_index], weight = weights[train_index],
                             label = train_y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, num_round, verbose_eval=1)
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        # Both rates are percentages relative to the *predicted* class:
        # share of predicted positives whose label is 0, and share of
        # predicted negatives whose label is 1 (assumes 0/1 labels).
        # np.mean of an empty selection is nan, so a fold without any
        # predicted positives (or negatives) makes the loss nan.
        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        # False negatives are penalised much more strongly (squared, x2).
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return np.mean(loss)

#Defining search space for hyperparameter optimization
# Besides the XGBoost parameters, the space holds two keys that the objective
# function removes before training: "num_rounds" (sampled as a float, cast to
# int when used) and "weight" (sample weight for rows with Binding == 0).
# Note: for hp.choice, trials.argmin reports the position in the list
# (0-4), not the max_depth value itself.
space_gradient_boosting = {"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "max_depth": hp.choice("max_depth", [9,10,11,12,13]),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 200, 400),
    "weight" : hp.uniform("weight", 0.1,0.33)}

Performing a random search:

In [10]:
trials = Trials()

# Settings shared by every fmin call; only max_evals changes per iteration.
search_settings = dict(fn = cross_validation_neg_acc_gradient_boosting,
                       space = space_gradient_boosting,
                       algo = rand.suggest,
                       trials = trials)

# Raising max_evals step by step while reusing the same Trials object lets
# the best result so far be reported after every step.
for i in range(1,2000):
    best = fmin(max_evals = i, **search_settings)
    #np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b.npy"), trials.best_trial)
    #np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b_argmin.npy"), trials.argmin)
    for report in (i, trials.best_trial["result"]["loss"], trials.argmin):
        print(report)

100%|██████████████████████████████████████████████████| 1/1 [04:24<00:00, 264.48s/trial, best loss: 262.5389853662725]
1
262.5389853662725
{'learning_rate': 0.1148489436009765, 'max_delta_step': 3.1684783173308557, 'max_depth': 1, 'min_child_weight': 2.47149091325713, 'num_rounds': 363.7908208797275, 'reg_alpha': 2.8761434380847724, 'reg_lambda': 1.2188050305765386, 'weight': 0.2533132829576781}
100%|██████████████████████████████████████████████████| 2/2 [03:15<00:00, 195.00s/trial, best loss: 262.5389853662725]
2
262.5389853662725
{'learning_rate': 0.1148489436009765, 'max_delta_step': 3.1684783173308557, 'max_depth': 1, 'min_child_weight': 2.47149091325713, 'num_rounds': 363.7908208797275, 'reg_alpha': 2.8761434380847724, 'reg_lambda': 1.2188050305765386, 'weight': 0.2533132829576781}
100%|███████████████████████████████████████████████████| 3/3 [09:17<00:00, 557.23s/trial, best loss: 255.037971745901]
3
255.037971745901
{'learning_rate': 0.09102301643349951, 'max_delta_step': 0.19

100%|████████████████████████████████████████████████| 21/21 [01:53<00:00, 114.00s/trial, best loss: 253.6826050122041]
21
253.6826050122041
{'learning_rate': 0.17600392450633315, 'max_delta_step': 1.204110529702846, 'max_depth': 1, 'min_child_weight': 3.1896308745823183, 'num_rounds': 298.19866810959155, 'reg_alpha': 0.27065867995679893, 'reg_lambda': 1.3326635539401765, 'weight': 0.16342544218583369}
100%|███████████████████████████████████████████████| 22/22 [02:32<00:00, 152.55s/trial, best loss: 247.54687877579468]
22
247.54687877579468
{'learning_rate': 0.12277257256212021, 'max_delta_step': 0.33994039642580187, 'max_depth': 0, 'min_child_weight': 0.5065175007474233, 'num_rounds': 209.3694899343695, 'reg_alpha': 4.6760873578257875, 'reg_lambda': 4.484197931051557, 'weight': 0.13738315470495976}
100%|███████████████████████████████████████████████| 23/23 [05:59<00:00, 359.52s/trial, best loss: 247.54687877579468]
23
247.54687877579468
{'learning_rate': 0.12277257256212021, 'max_de

100%|███████████████████████████████████████████████| 42/42 [02:37<00:00, 157.04s/trial, best loss: 238.93838838049925]
42
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step': 4.087581210801292, 'max_depth': 4, 'min_child_weight': 1.925593494006774, 'num_rounds': 335.9808545767529, 'reg_alpha': 1.423400568682085, 'reg_lambda': 3.955876924272332, 'weight': 0.17114897417853941}
100%|███████████████████████████████████████████████| 43/43 [02:05<00:00, 125.71s/trial, best loss: 238.93838838049925]
43
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step': 4.087581210801292, 'max_depth': 4, 'min_child_weight': 1.925593494006774, 'num_rounds': 335.9808545767529, 'reg_alpha': 1.423400568682085, 'reg_lambda': 3.955876924272332, 'weight': 0.17114897417853941}
100%|████████████████████████████████████████████████| 44/44 [01:21<00:00, 81.64s/trial, best loss: 238.93838838049925]
44
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step

100%|███████████████████████████████████████████████| 63/63 [03:10<00:00, 190.45s/trial, best loss: 238.93838838049925]
63
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step': 4.087581210801292, 'max_depth': 4, 'min_child_weight': 1.925593494006774, 'num_rounds': 335.9808545767529, 'reg_alpha': 1.423400568682085, 'reg_lambda': 3.955876924272332, 'weight': 0.17114897417853941}
100%|███████████████████████████████████████████████| 64/64 [02:45<00:00, 165.98s/trial, best loss: 238.93838838049925]
64
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step': 4.087581210801292, 'max_depth': 4, 'min_child_weight': 1.925593494006774, 'num_rounds': 335.9808545767529, 'reg_alpha': 1.423400568682085, 'reg_lambda': 3.955876924272332, 'weight': 0.17114897417853941}
100%|███████████████████████████████████████████████| 65/65 [02:38<00:00, 158.17s/trial, best loss: 238.93838838049925]
65
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step

100%|███████████████████████████████████████████████| 84/84 [02:45<00:00, 165.42s/trial, best loss: 238.93838838049925]
84
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step': 4.087581210801292, 'max_depth': 4, 'min_child_weight': 1.925593494006774, 'num_rounds': 335.9808545767529, 'reg_alpha': 1.423400568682085, 'reg_lambda': 3.955876924272332, 'weight': 0.17114897417853941}
100%|███████████████████████████████████████████████| 85/85 [04:58<00:00, 298.96s/trial, best loss: 238.93838838049925]
85
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step': 4.087581210801292, 'max_depth': 4, 'min_child_weight': 1.925593494006774, 'num_rounds': 335.9808545767529, 'reg_alpha': 1.423400568682085, 'reg_lambda': 3.955876924272332, 'weight': 0.17114897417853941}
100%|███████████████████████████████████████████████| 86/86 [04:56<00:00, 296.66s/trial, best loss: 238.93838838049925]
86
238.93838838049925
{'learning_rate': 0.23258374432866033, 'max_delta_step

KeyboardInterrupt: 

In [11]:
# Show the sampled values of the best trial so far. For "max_depth" this is
# the position in the hp.choice list, not the depth itself.
trials.argmin

{'learning_rate': 0.23258374432866033,
 'max_delta_step': 4.087581210801292,
 'max_depth': 4,
 'min_child_weight': 1.925593494006774,
 'num_rounds': 335.9808545767529,
 'reg_alpha': 1.423400568682085,
 'reg_lambda': 3.955876924272332,
 'weight': 0.17114897417853941}

Best set of hyperparameters:

In [9]:
# Two hyperparameter sets are recorded here. The second assignment
# overwrites the first, so only the second one is in effect after this cell.
param = {'learning_rate': 0.15413360344307642,
         'max_delta_step': 1.2762890808461014,
         'max_depth': 11,
         'min_child_weight': 1.702130735213737,
         'num_rounds': 398.33364651483566,
         'reg_alpha': 2.5568884181958356,
         'reg_lambda': 2.5526015056842817,
         'weight': 0.13909428322475764}

param = {'learning_rate': 0.12771337495138718,
         'max_delta_step': 3.080851382419611,
         'max_depth': 13,
         'min_child_weight': 2.68947814956559,
         'num_rounds': 332.92969059815346,
         'reg_alpha': 1.4293630231664674,
         'reg_lambda': 0.12220981612600046,
         'weight': 0.11412319177763543}

In [15]:
# Best trial of the search above, with max_depth written as the actual depth:
# trials.argmin reported position 4 of the choice list [9,10,11,12,13].
# "num_rounds" and "weight" are not XGBoost parameters; the next cell removes
# them from the dictionary before training.
param = {'learning_rate': 0.23258374432866033,
 'max_delta_step': 4.087581210801292,
 'max_depth': 13,
 'min_child_weight': 1.925593494006774,
 'num_rounds': 335.9808545767529,
 'reg_alpha': 1.423400568682085,
 'reg_lambda': 3.955876924272332,
 'weight': 0.17114897417853941}

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [16]:
# Per-sample weights: rows with Binding == 0 get the tuned weight, all
# other rows get 1.0.
weights = np.array([1.0 if binding != 0 else param["weight"] for binding in df_train["Binding"]])

# Take the two non-XGBoost entries out of the dictionary and add the fixed
# training settings. This cell changes `param` in place, so it can only be
# run once per definition of `param`.
num_round = param.pop("num_rounds")
param.pop("weight")
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              'objective': 'binary:logistic'})

loss, accuracy, ROC_AUC = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(np.array(train_X[train_index]),
                         label = np.array(train_y[train_index]),
                         weight = weights[train_index])
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

    validation_y = train_y[test_index]
    y_valid_pred = np.round(bst.predict(dvalid))

    # Loss: error shares among predicted positives / predicted negatives.
    false_positive = 100*(1 - np.array(validation_y)[y_valid_pred == 1].mean())
    false_negative = 100*(np.array(validation_y)[y_valid_pred == 0].mean())
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)

    # Accuracy of the rounded predictions.
    accuracy.append((y_valid_pred == np.array(validation_y)).mean())

    # ROC-AUC of the unrounded scores.
    ROC_AUC.append(roc_auc_score(np.array(validation_y), bst.predict(dvalid)))

for line in ("Loss values: %s" %loss, "Accuracies: %s" %accuracy, "ROC-AUC scores: %s" %ROC_AUC):
    print(line)


Loss values: [239.6373774252441, 205.63435294814062, 253.13824236782585, 255.40983459530463, 240.87213456598107]
Accuracies: [0.847457627118644, 0.8537383177570094, 0.8486901535682023, 0.8453406421299922, 0.8345679012345679]
ROC-AUC scores: [0.8865283513815269, 0.9093155599507349, 0.9002365323664021, 0.8896475880752803, 0.8946331655005557]


#### (iv) 3. Training and validating the final model

Training the model and validating it on the test set:

In [17]:
# Train on the full training set, evaluate on the held-out test set.
dtrain = xgb.DMatrix(np.array(train_X),
                     label = np.array(train_y),
                     weight = weights,
                     feature_names = feature_names)
dtest = xgb.DMatrix(np.array(test_X),
                    label = np.array(test_y),
                    feature_names = feature_names)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Metrics: accuracy and MCC use the rounded predictions, ROC-AUC the scores.
y_test_pred = np.round(bst.predict(dtest))
acc_test = (y_test_pred == np.array(test_y)).mean()
roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

# Store the unrounded scores and the true labels for later analysis.
results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "y_test_pred_xgboost_ESM1b_ECFP.npy"), bst.predict(dtest))
np.save(join(results_dir, "y_test_true_xgboost_ESM1b_ECFP.npy"), test_y)

Accuracy on test set: 0.8455397431447415, ROC-AUC score for test set: 0.90368561948571, MCC: 0.612486423183225


### (b) ESM1b and GNN:

#### (i) Creating numpy arrays with input vectors and output variables

In [12]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the "GNN rep" vector is concatenated with the "ESM1b"
    vector (GNN representation first, embedding second).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "ESM1b", "GNN rep" and "Binding".

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is a tuple with one concatenated 1-D array per row
        and ``y`` is a tuple with the matching "Binding" values.
    """
    X = []
    y = []

    # Collect in lists and convert once at the end: growing a tuple with
    # "+" copies it on every row (quadratic in the number of rows).
    rows = zip(df["ESM1b"], df["GNN rep"], df["Binding"].to_numpy())
    for emb, gnn_rep, binding in rows:
        X.append(np.concatenate([gnn_rep, emb]))
        y.append(binding)

    return tuple(X), tuple(y)

# Build the per-row inputs/labels and turn them into NumPy arrays in one go.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# One name per input column: 50 GNN entries followed by 1280 embedding
# entries, matching the concatenation order used above.
feature_names = (["GNN rep_" + str(i) for i in range(50)]
                 + ["ESM1b_" + str(i) for i in range(1280)])

#### (ii) Performing hyperparameter optimization

In [7]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Return the mean cross-validation loss of one hyperparameter set.

    Despite its name, the function does not return a negative accuracy. For
    every fold it trains an XGBoost model, rounds the predictions to 0/1 and
    scores them with ``2 * false_negative**2 + false_positive**1.3``; the
    mean of these scores over all folds is returned.

    Parameters
    ----------
    param : dict
        XGBoost parameters plus two extra keys: "num_rounds" (number of
        boosting rounds, truncated to an int) and "weight" (sample weight of
        training rows whose "Binding" value is 0; all other rows get 1.0).
        The dictionary is not modified.

    Returns
    -------
    float
        Mean per-fold loss (lower is better).

    Notes
    -----
    Reads the module-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.
    """
    # Work on a copy: the two extra keys are removed below and fixed
    # settings are added, which must not leak into the caller's dictionary
    # (a second call with the same dict would otherwise raise KeyError).
    param = dict(param)
    num_round = int(param.pop("num_rounds"))
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0
                        for binding in df_train["Binding"]])

    loss = []
    for train_index, test_index in zip(train_indices, test_indices):
        dtrain = xgb.DMatrix(train_X[train_index], weight = weights[train_index],
                             label = train_y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, num_round, verbose_eval=1)
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        # Both rates are percentages relative to the *predicted* class:
        # share of predicted positives whose label is 0, and share of
        # predicted negatives whose label is 1 (assumes 0/1 labels).
        # np.mean of an empty selection is nan, so a fold without any
        # predicted positives (or negatives) makes the loss nan.
        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        # False negatives are penalised much more strongly (squared, x2).
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return np.mean(loss)

#Defining search space for hyperparameter optimization
# Besides the XGBoost parameters, the space holds two keys that the objective
# function removes before training: "num_rounds" (sampled as a float, cast to
# int when used) and "weight" (sample weight for rows with Binding == 0).
# Requires `hp` from hyperopt to be imported before this cell is run.
space_gradient_boosting = {"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "max_depth": hp.choice("max_depth", [9,10,11,12,13]),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 200, 400),
    "weight" : hp.uniform("weight", 0.1,0.33)}

NameError: name 'hp' is not defined

Performing a random search:

In [None]:
# The search loop is disabled by wrapping it in a string literal; the
# trailing ";" suppresses the notebook's output of that string. Remove the
# triple quotes to run the search again.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b.npy"), trials.best_trial)
    np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b_argmin.npy"), trials.argmin)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [13]:
# Two hyperparameter sets are recorded here. The second assignment
# overwrites the first, so only the second one is in effect after this cell.
# "num_rounds" and "weight" are not XGBoost parameters; they are taken out of
# the dictionary before training.
param = {'learning_rate': 0.14400235501414152,
         'max_delta_step': 0.814795683926195,
         'max_depth': 11,
         'min_child_weight': 4.985914250393847,
         'num_rounds': 326.61065381972924,
         'reg_alpha': 2.3376079081750634,
         'reg_lambda': 3.8256939813631936,
         'weight': 0.10867442666803548}

param = {'learning_rate': 0.18650490181254992,
         'max_delta_step': 3.747845574621026,
         'max_depth': 10,
         'min_child_weight': 0.3985828341503377,
         'num_rounds': 366.6289439088624,
         'reg_alpha': 0.8924081775198611,
         'reg_lambda': 4.888409483879253, 
         'weight': 0.14249550342115477}

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [14]:
# Per-sample weights: rows with Binding == 0 get the tuned weight, all
# other rows get 1.0.
weights = np.array([1.0 if binding != 0 else param["weight"] for binding in df_train["Binding"]])

# Take the two non-XGBoost entries out of the dictionary and add the fixed
# training settings. This cell changes `param` in place, so it can only be
# run once per definition of `param`.
num_round = param.pop("num_rounds")
param.pop("weight")
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              'objective': 'binary:logistic'})

In [15]:
loss, accuracy, ROC_AUC = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(np.array(train_X[train_index]),
                         label = np.array(train_y[train_index]),
                         weight = weights[train_index])
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

    validation_y = train_y[test_index]
    y_valid_pred = np.round(bst.predict(dvalid))

    # Loss: error shares among predicted positives / predicted negatives.
    false_positive = 100*(1 - np.array(validation_y)[y_valid_pred == 1].mean())
    false_negative = 100*(np.array(validation_y)[y_valid_pred == 0].mean())
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)

    # Accuracy of the rounded predictions.
    accuracy.append((y_valid_pred == np.array(validation_y)).mean())

    # ROC-AUC of the unrounded scores.
    ROC_AUC.append(roc_auc_score(np.array(validation_y), bst.predict(dvalid)))

for line in ("Loss values: %s" %loss, "Accuracies: %s" %accuracy, "ROC-AUC scores: %s" %ROC_AUC):
    print(line)

# Store the per-fold metrics.
results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "acc_CV_xgboost_ESM1b_GNN.npy"), np.array(accuracy))
np.save(join(results_dir, "loss_CV_xgboost_ESM1b_GNN.npy"), np.array(loss))
np.save(join(results_dir, "ROC_AUC_CV_xgboost_ESM1b_GNN.npy"), np.array(ROC_AUC))

Loss values: [127.93542431344466, 122.79501284863223, 132.211203999234, 126.27687643490749, 131.02385081797956]
Accuracies: [0.9080177971488241, 0.9076071922544952, 0.906319241336524, 0.9075287865367582, 0.9075296031817771]
ROC-AUC scores: [0.9481174271320283, 0.9501233965053054, 0.9437746666563114, 0.9507466012024709, 0.9510292058347031]


#### (iv) 3. Training and validating the final model
Training the model and validating it on the test set:

In [16]:
# Train on the full training set, evaluate on the held-out test set.
dtrain = xgb.DMatrix(np.array(train_X),
                     label = np.array(train_y),
                     weight = weights,
                     feature_names = feature_names)
dtest = xgb.DMatrix(np.array(test_X),
                    label = np.array(test_y),
                    feature_names = feature_names)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Metrics: accuracy and MCC use the rounded predictions, ROC-AUC the scores.
y_test_pred = np.round(bst.predict(dtest))
acc_test = (y_test_pred == np.array(test_y)).mean()
roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

# Store the unrounded scores and the true labels for later analysis.
results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "y_test_pred_xgboost_ESM1b_GNN.npy"), bst.predict(dtest))
np.save(join(results_dir, "y_test_true_xgboost_ESM1b_GNN.npy"), test_y)

Accuracy on test set: 0.9051646706586827, ROC-AUC score for test set: 0.9455840254566843, MCC: 0.7509981408842288


### (c) ESM1b_ts and ECFP:

#### (i) Creating numpy arrays with input vectors and output variables

In [17]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the "ECFP" entry is split into its characters and converted
    to an integer vector, which is concatenated with the row's "ESM1b_ts"
    vector (fingerprint first, embedding second).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "ESM1b_ts", "ECFP" and "Binding".

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is a tuple with one concatenated 1-D array per row
        and ``y`` is a tuple with the matching "Binding" values.
    """
    X = []
    y = []

    # Collect in lists and convert once at the end: growing a tuple with
    # "+" copies it on every row (quadratic in the number of rows).
    rows = zip(df["ESM1b_ts"], df["ECFP"], df["Binding"].to_numpy())
    for emb, ecfp_string, binding in rows:
        ecfp = np.array(list(ecfp_string)).astype(int)
        X.append(np.concatenate([ecfp, emb]))
        y.append(binding)

    return tuple(X), tuple(y)

# Build the per-row inputs/labels and turn them into NumPy arrays in one go.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# One name per input column: 1024 fingerprint bits followed by 1280
# embedding entries, matching the concatenation order used above.
feature_names = (["ECFP_" + str(i) for i in range(1024)]
                 + ["ESM1b_ts_" + str(i) for i in range(1280)])

#### (ii) Performing hyperparameter optimization

In [22]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Return the mean cross-validation loss of one hyperparameter set.

    Despite its name, the function does not return a negative accuracy. For
    every fold it trains an XGBoost model, rounds the predictions to 0/1 and
    scores them with ``2 * false_negative**2 + false_positive**1.3``; the
    mean of these scores over all folds is returned.

    Parameters
    ----------
    param : dict
        XGBoost parameters plus two extra keys: "num_rounds" (number of
        boosting rounds, truncated to an int) and "weight" (sample weight of
        training rows whose "Binding" value is 0; all other rows get 1.0).
        The dictionary is not modified.

    Returns
    -------
    float
        Mean per-fold loss (lower is better).

    Notes
    -----
    Reads the module-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.
    """
    # Work on a copy: the two extra keys are removed below and fixed
    # settings are added, which must not leak into the caller's dictionary
    # (a second call with the same dict would otherwise raise KeyError).
    param = dict(param)
    num_round = int(param.pop("num_rounds"))
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0
                        for binding in df_train["Binding"]])

    loss = []
    for train_index, test_index in zip(train_indices, test_indices):
        dtrain = xgb.DMatrix(train_X[train_index], weight = weights[train_index],
                             label = train_y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, num_round, verbose_eval=1)
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        # Both rates are percentages relative to the *predicted* class:
        # share of predicted positives whose label is 0, and share of
        # predicted negatives whose label is 1 (assumes 0/1 labels).
        # np.mean of an empty selection is nan, so a fold without any
        # predicted positives (or negatives) makes the loss nan.
        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        # False negatives are penalised much more strongly (squared, x2).
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return np.mean(loss)

#Defining search space for hyperparameter optimization
# Besides the XGBoost parameters, the space holds two keys that the objective
# function removes before training: "num_rounds" (sampled as a float, cast to
# int when used) and "weight" (sample weight for rows with Binding == 0).
space_gradient_boosting = {"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
                            "max_depth": hp.choice("max_depth", [9,10,11,12,13]),
                            "reg_lambda": hp.uniform("reg_lambda", 0, 5),
                            "reg_alpha": hp.uniform("reg_alpha", 0, 5),
                            "max_delta_step": hp.uniform("max_delta_step", 0, 5),
                            "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
                            "num_rounds":  hp.uniform("num_rounds", 200, 400),
                            "weight" : hp.uniform("weight", 0.1,0.33)}

Performing a random search:

In [23]:
# The search loop is disabled by wrapping it in a string literal; the
# trailing ";" suppresses the notebook's output of that string. Remove the
# triple quotes to run the search again.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b.npy"), trials.best_trial)
    np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b_argmin.npy"), trials.argmin)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [18]:
# Two hyperparameter sets are recorded here. The second assignment
# overwrites the first, so only the second one is in effect afterwards.
# "num_rounds" and "weight" are not XGBoost parameters; they are taken out of
# the dictionary before training.
param = {'learning_rate': 0.18394760595697052,
         'max_delta_step': 2.100681119544991,
         'max_depth': 11,
         'min_child_weight': 0.7270445069860024,
         'num_rounds': 325.1827926302196,
         'reg_alpha': 0.17887744610095235,
         'reg_lambda': 1.4959549363997264,
         'weight': 0.11976484745202556}

param = {'learning_rate': 0.31553117247348733,
         'max_delta_step': 1.7726044219753656,
         'max_depth': 10,
         'min_child_weight': 1.3845040588450772,
         'num_rounds': 342.68325188584106,
         'reg_alpha': 0.531395259755843,
         'reg_lambda': 3.744980563764689,
         'weight': 0.26187490421514203}

# Per-sample weights: rows with Binding == 0 get the tuned weight, all
# other rows get 1.0.
weights = np.array([1.0 if binding != 0 else param["weight"] for binding in df_train["Binding"]])

# Take the two non-XGBoost entries out of the dictionary and add the fixed
# training settings. This changes `param` in place, so it can only be run
# once per definition of `param`.
num_round = param.pop("num_rounds")
param.pop("weight")
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              'objective': 'binary:logistic'})

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [19]:
loss, accuracy, ROC_AUC = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(np.array(train_X[train_index]),
                         label = np.array(train_y[train_index]),
                         weight = weights[train_index])
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

    validation_y = train_y[test_index]
    y_valid_pred = np.round(bst.predict(dvalid))

    # Loss: error shares among predicted positives / predicted negatives.
    false_positive = 100*(1 - np.array(validation_y)[y_valid_pred == 1].mean())
    false_negative = 100*(np.array(validation_y)[y_valid_pred == 0].mean())
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)

    # Accuracy of the rounded predictions.
    accuracy.append((y_valid_pred == np.array(validation_y)).mean())

    # ROC-AUC of the unrounded scores.
    ROC_AUC.append(roc_auc_score(np.array(validation_y), bst.predict(dvalid)))

for line in ("Loss values: %s" %loss, "Accuracies: %s" %accuracy, "ROC-AUC scores: %s" %ROC_AUC):
    print(line)

# Store the per-fold metrics.
results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "acc_CV_xgboost_ESM1b_ts_ECFP.npy"), np.array(accuracy))
np.save(join(results_dir, "loss_CV_xgboost_ESM1b_ts_ECFP.npy"), np.array(loss))
np.save(join(results_dir, "ROC_AUC_CV_xgboost_ESM1b_ts_ECFP.npy"), np.array(ROC_AUC))

Loss values: [118.64570140259957, 109.55544934268875, 136.58107225995332, 112.14992450710591, 116.58008809267714]
Accuracies: [0.9095614274039772, 0.9113877362840018, 0.8988744128334663, 0.9105403011514615, 0.9075296031817771]
ROC-AUC scores: [0.9531463757576579, 0.9543092505120799, 0.9443616020172131, 0.9560860893869755, 0.9515226897235054]


#### (iv) 3. Training and validating the final model
Training the model and validating it on the test set:

In [20]:
# Train on the full training set, evaluate on the held-out test set.
dtrain = xgb.DMatrix(np.array(train_X),
                     label = np.array(train_y),
                     weight = weights,
                     feature_names = feature_names)
dtest = xgb.DMatrix(np.array(test_X),
                    label = np.array(test_y),
                    feature_names = feature_names)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Metrics: accuracy and MCC use the rounded predictions, ROC-AUC the scores.
y_test_pred = np.round(bst.predict(dtest))
acc_test = (y_test_pred == np.array(test_y)).mean()
roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

# Store the unrounded scores and the true labels for later analysis.
results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "y_test_pred_xgboost_ESM1b_ts_ECFP.npy"), bst.predict(dtest))
np.save(join(results_dir, "y_test_true_xgboost_ESM1b_ts_ECFP.npy"), test_y)

Accuracy on test set: 0.9063622754491018, ROC-AUC score for test set: 0.9499652082794869, MCC: 0.7594154904873446


### (d) ESM1b_ts and GNN:

#### (i) Creating numpy arrays with input vectors and output variables

In [21]:
def create_input_and_output_data(df):
    """Build the model inputs and the output labels from a data frame.

    For every row, the "GNN rep" vector is concatenated with the "ESM1b_ts"
    vector (in this order) to form one input vector. The "Binding" entry of
    the row is the matching output value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "ESM1b_ts", "GNN rep" and "Binding".

    Returns
    -------
    X : tuple of numpy.ndarray
        One concatenated input vector per row, in row order.
    y : tuple
        The "Binding" values, in the same order as ``X``.
    """
    gnn_reps = df["GNN rep"].to_numpy()
    embeddings = df["ESM1b_ts"].to_numpy()

    # Build each tuple in one pass; growing a tuple with "+" inside a loop
    # copies everything collected so far on every iteration.
    X = tuple(np.concatenate([gnn_rep, emb]) for gnn_rep, emb in zip(gnn_reps, embeddings))
    y = tuple(df["Binding"].to_numpy())

    return X, y

# Convert inputs and labels of both splits straight into numpy arrays
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# Feature labels: 50 names for the GNN part followed by 1280 names for the ESM1b part
feature_names = ["GNN rep_" + str(idx) for idx in range(50)]
feature_names.extend("ESM1b_" + str(idx) for idx in range(1280))

#### (ii) Performing hyperparameter optimization

In [20]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Objective for the hyperparameter search: mean 5-fold CV loss of an XGBoost model.

    Reads the module-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.

    Parameters
    ----------
    param : dict
        XGBoost booster parameters plus two extra entries that are not passed
        on to XGBoost:
        "num_rounds" -- number of boosting rounds (truncated to an int),
        "weight"     -- sample weight for training points with Binding == 0
                        (all other points get weight 1.0).
        The dict is not modified.

    Returns
    -------
    float
        Mean over the 5 folds of ``2*fn**2 + fp**1.3`` where, in percent,
        ``fp`` is the share of points predicted as 1 whose true label is not 1
        and ``fn`` is the share of points predicted as 0 whose true label is 1.
    """
    # Work on a copy: the extra entries are removed and fixed settings are
    # added below, and the caller's dict has to stay reusable.
    booster_param = dict(param)
    num_round = booster_param.pop("num_rounds")
    negative_weight = booster_param.pop("weight")
    booster_param["tree_method"] = "gpu_hist"
    booster_param["sampling_method"] = "gradient_based"
    booster_param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0 for binding in df_train["Binding"]])

    loss = []
    for i in range(5):
        train_index, test_index = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(booster_param,  dtrain, int(num_round), verbose_eval=1)
        # Round the predicted scores to hard 0/1 predictions
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        # Both rates are computed relative to the number of *predicted*
        # positives / negatives, not relative to the true class sizes.
        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        # Wrongly predicted negatives are penalised much more strongly
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return np.mean(loss)

# Search space for the hyperparameter optimization: every entry is sampled
# uniformly from its interval, except max_depth, which is picked from a fixed list
space_gradient_boosting = dict(
    learning_rate = hp.uniform("learning_rate", 0.01, 0.5),
    max_depth = hp.choice("max_depth", [9,10,11,12,13]),
    reg_lambda = hp.uniform("reg_lambda", 0, 5),
    reg_alpha = hp.uniform("reg_alpha", 0, 5),
    max_delta_step = hp.uniform("max_delta_step", 0, 5),
    min_child_weight = hp.uniform("min_child_weight", 0.1, 15),
    num_rounds = hp.uniform("num_rounds", 200, 400),
    weight = hp.uniform("weight", 0.1,0.33),
)

NameError: name 'hp' is not defined

Performing a random search:

In [None]:
# NOTE: the search loop below sits inside a string literal, so nothing in it is
# executed when this cell runs. The text shows a loop that calls fmin with
# rand.suggest on a shared Trials object, raising max_evals by one per iteration
# and saving trials.best_trial and trials.argmin each time.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b.npy"), trials.best_trial)
    np.save(join(CURRENT_DIR, ".." ,"data", "results", "cross_validation_binding_ESM1b_argmin.npy"), trials.argmin)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [22]:
# Hyperparameter set used for the cross-validation and the final model below
param = {'learning_rate': 0.18444025726334898,
         'max_delta_step': 3.2748796106084117,
         'max_depth': 13,
         'min_child_weight': 3.1946753845027738,
         'num_rounds': 314.1036429221291,
         'reg_alpha': 0.48821021807600673,
         'reg_lambda': 2.6236829011598073,
         'weight': 0.1264521266931227}

# "num_rounds" and "weight" are not booster parameters: read them out first ...
num_round = param["num_rounds"]
# ... training points with Binding == 0 get the tuned weight, all others weight 1.0
weights = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["Binding"]])

# ... and remove them so that only booster parameters are handed to xgb.train
del param["num_rounds"]
del param["weight"]

# Fixed training settings
param["tree_method"] = "gpu_hist"
param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [23]:
# Per-fold metrics of the 5-fold cross-validation
loss = []
accuracy = []
ROC_AUC = []

for i in range(5):
    train_index, test_index  = train_indices[i], test_indices[i]
    dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                     label = np.array(train_y[train_index]))
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    # Run inference once per fold: raw scores for ROC-AUC, rounded scores for the rest
    valid_scores = bst.predict(dvalid)
    y_valid_pred = np.round(valid_scores)
    validation_y = train_y[test_index]

    #calculate loss:
    # both rates are relative to the number of predicted positives / negatives
    false_positive = 100*(1-np.mean(np.array(validation_y)[y_valid_pred == 1]))
    false_negative = 100*(np.mean(np.array(validation_y)[y_valid_pred == 0]))
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    #calculate accuracy:
    accuracy.append(np.mean(y_valid_pred == np.array(validation_y)))
    #calculate ROC-AUC score:
    ROC_AUC.append(roc_auc_score(np.array(validation_y), valid_scores))
    
print("Loss values: %s" %loss) 
print("Accuracies: %s" %accuracy)
print("ROC-AUC scores: %s" %ROC_AUC)

# Persist each metric list as a numpy array under ../data/training_results
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "acc_CV_xgboost_ESM1b_ts_GNN.npy"), np.array(accuracy))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "loss_CV_xgboost_ESM1b_ts_GNN.npy"), np.array(loss))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "ROC_AUC_CV_xgboost_ESM1b_ts_GNN.npy"), np.array(ROC_AUC))

Loss values: [107.10779217320857, 97.97889476323061, 117.6449558838506, 92.98319474363471, 102.69988687285533]
Accuracies: [0.9046581312993734, 0.9083448593822038, 0.90064699104848, 0.9066430469441984, 0.9074392117870379]
ROC-AUC scores: [0.9584373172944172, 0.9558062039027504, 0.9500524479937247, 0.9617684224032785, 0.9569448004040885]


#### (iv) Training and validating the final model
Training the model and validating it on the test set:

In [24]:
# Train on the complete training set (with per-sample weights) and evaluate on the test set
dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                    feature_names= feature_names)

bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)

# Run inference once and reuse the scores: the rounded values give the class
# predictions, the raw values are needed for ROC-AUC and are what gets saved.
y_test_scores = bst.predict(dtest)
y_test_pred = np.round(y_test_scores)
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), y_test_scores)
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

# Store the raw prediction scores together with the true labels
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_pred_xgboost_ESM1b_ts_GNN.npy"), y_test_scores)
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_true_xgboost_ESM1b_ts_GNN.npy"), test_y)

Accuracy on test set: 0.9028443113772455, ROC-AUC score for test set: 0.9540429057244825, MCC: 0.7544314356676812
