# Training gradient boosting model for enzyme-substrate pair prediction with ESM-1b-vectors

### 1. Loading and preprocessing data for model training and evaluation
### 2. Hyperparameter optimization using a 5-fold cross-validation (CV)
### 3. Training and validating the final model

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import sys
import os
import logging
from os.path import join
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, Trials, rand
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef


# Make the helper modules in the "additional_code" folder importable. The path is
# built with join() so that it does not depend on "\\" being the path separator.
sys.path.append(join(".", "additional_code"))
#from data_preprocessing import *

# All data paths used below are resolved relative to the current working directory.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)



C:\Users\alexk\projects\ESP\notebooks_and_code


## 1. Loading and preprocessing data for model training and evaluation

In [2]:
def array_column_to_strings(df, column):
    """Replace every entry of ``df[column]`` by ``str(list(entry))``.

    The data frame is modified in place and also returned.
    """
    entries = df[column]
    as_text = []
    for label in df.index:
        as_text.append(str(list(entries[label])))
    df[column] = as_text
    return df

def string_column_to_array(df, column):
    """Replace every string in ``df[column]`` by the numpy array it spells out.

    The data frame is modified in place and also returned.
    """
    # NOTE(review): eval() executes whatever expression the string contains, so
    # this must only be applied to columns produced by trusted code.
    texts = df[column]
    df[column] = [np.array(eval(texts[label])) for label in df.index]
    return df

### (a) Loading data: 
Only keeping data points from the GO Annotation database with experimental evidence

In [3]:
def _is_not_empty_string(value):
    """Return False only for the empty string; any other value counts as present."""
    return not (isinstance(value, str) and value == "")


def _load_split(file_name):
    """Load one pickled split from data/splits and drop the rows that are not used.

    Rows are removed if their "ESM1b" or "GNN rep" entry is the empty string or if
    their "type" is "engqvist". The result gets a fresh 0..n-1 index.
    """
    df = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", file_name))
    # The columns are checked entry by entry: comparing a whole column that holds
    # arrays against "" makes numpy attempt an element-wise array/string comparison.
    keep = (df["ESM1b"].map(_is_not_empty_string).astype(bool)
            & (df["type"] != "engqvist")
            & df["GNN rep"].map(_is_not_empty_string).astype(bool))
    return df.loc[keep].reset_index(drop = True)


df_train = _load_split("df_train_with_ESM1b_ts_GNN.pkl")
df_test = _load_split("df_test_with_ESM1b_ts_GNN.pkl")

  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)


### (b) Splitting training set into 5-folds for hyperparameter optimization:
The 5 folds are created in such a way that the same enzyme does not occur in two different folds

In [4]:
def split_dataframe(df, frac):
    """Split ``df`` into two parts that share no value of the "ESM1b" column.

    The rows are visited in order. A row that is not assigned yet goes to the
    second part if its position is a multiple of ``frac`` and to the first part
    otherwise; all rows with the same "ESM1b" value follow it into the same part.
    About ``1/frac`` of the rows therefore end up in the second part.

    ``df`` is modified in place: a "level_0" column left over from an earlier call
    is dropped and the index is reset, which stores the previous index as a column
    ("index" on the first call, "level_0" if an "index" column already exists).

    Parameters:
        df: data frame with an "ESM1b" column of hashable values (e.g. strings).
        frac: positive integer controlling the size of the second part.

    Returns:
        (first_part, second_part), each with its rows in ascending position order.
    """
    # errors="ignore" replaces the former bare try/except around this drop.
    df.drop(columns = ["level_0"], inplace = True, errors = "ignore")
    df.reset_index(inplace = True)

    # After the reset the index is 0..n-1, so positions and labels coincide.
    enzymes = df["ESM1b"].tolist()
    rows_by_enzyme = {}
    for position, enzyme in enumerate(enzymes):
        rows_by_enzyme.setdefault(enzyme, []).append(position)

    train_rows = set()
    test_rows = set()
    for ind, enzyme in enumerate(enzymes):
        if ind in train_rows or ind in test_rows:
            continue
        # The whole enzyme group is assigned at once so that no enzyme is split.
        target = train_rows if ind % frac != 0 else test_rows
        target.update(rows_by_enzyme[enzyme])

    return(df.loc[sorted(train_rows)], df.loc[sorted(test_rows)])

In [5]:
def _to_object_array(index_lists):
    """Pack lists of possibly different lengths into a 1-D object array for np.save."""
    packed = np.empty(len(index_lists), dtype = object)
    for position, index_list in enumerate(index_lists):
        packed[position] = index_list
    return packed


data_train2 = df_train.copy()
data_train2 = array_column_to_strings(data_train2, column = "ESM1b")

# Peel off one fold at a time: about 1/5 of all rows, then 1/4 of the remainder,
# then 1/3 and 1/2, so that all folds end up with a similar size.
fold_indices = []
for frac in (5, 4, 3, 2):
    data_train2, df_fold = split_dataframe(df = data_train2, frac = frac)
    fold_indices.append(list(df_fold["index"]))
    print(len(data_train2), len(fold_indices[-1]))
# The rows that remain after the last split form the fifth fold.
fold_indices.append(list(data_train2["index"]))
indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5 = fold_indices

# For CV round i, fold i is the validation set and the other four folds are used for training.
train_indices = [[row for j in range(5) if j != i for row in fold_indices[j]] for i in range(5)]
test_indices = [fold_indices[i] for i in range(5)]

# The folds differ in length; they are stored as explicit object arrays because
# numpy does not build an array from ragged nested lists on its own any more.
np.save(join(CURRENT_DIR, ".." ,"data","splits", "CV_train_indices.npy"), _to_object_array(train_indices))
np.save(join(CURRENT_DIR, ".." ,"data","splits", "CV_test_indices.npy"), _to_object_array(test_indices))

44733 11145
33555 11178
22014 11541
10952 11062


  arr = np.asanyarray(arr)


In [6]:
# Read the stored CV index lists back in. allow_pickle is needed because the
# arrays hold Python objects; only load files created by this notebook this way.
train_indices, test_indices = (
    list(np.load(join(CURRENT_DIR, ".." ,"data","splits", file_name),  allow_pickle=True))
    for file_name in ("CV_train_indices.npy", "CV_test_indices.npy")
)

## 2. Hyperparameter optimization using a 5-fold cross-validation (CV)

### (a) ECFP and ESM1b:

#### (i) Creating numpy arrays with input vectors and output variables

In [7]:
def create_input_and_output_data(df):
    """Build one input vector and one label per row of ``df``.

    The input vector is the row's "ECFP" entry, converted element by element to
    integers, followed by the row's "ESM1b" entry. The label is the row's
    "Binding" value.

    Returns:
        (X, y): a tuple of input vectors and a tuple of labels, in row order.
    """
    embeddings, fingerprints, labels = df["ESM1b"], df["ECFP"], df["Binding"]
    # Collect in lists and convert once at the end; growing a tuple by
    # concatenation copies everything collected so far in every iteration.
    X = []
    y = []
    for ind in df.index:
        ecfp = np.array(list(fingerprints[ind])).astype(int)
        X.append(np.concatenate([ecfp, embeddings[ind]]))
        y.append(labels[ind])

    return(tuple(X), tuple(y))

# Feature matrices and label vectors for the training and the test set.
train_X, train_y = (np.array(part) for part in create_input_and_output_data(df = df_train))
test_X, test_y = (np.array(part) for part in create_input_and_output_data(df = df_test))

# One name per input column: the fingerprint bits first, then the enzyme embedding.
feature_names = (["ECFP_" + str(i) for i in range(1024)]
                 + ["ESM1b_" + str(i) for i in range(1280)])

#### (ii) Performing hyperparameter optimization

In [8]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Return the mean loss of a 5-fold cross-validation for one hyperparameter set.

    Parameters:
        param: dictionary of xgboost parameters that additionally holds
            "num_rounds" (number of boosting rounds, truncated to an integer) and
            "weight" (sample weight given to training points with Binding == 0;
            all other points get weight 1.0). The dictionary is not modified.

    Returns:
        The loss ``2*false_negative**2 + false_positive**1.3`` averaged over the
        five folds. Assuming 0/1 labels, ``false_positive`` is the percentage of
        predicted positives that are labelled 0 and ``false_negative`` the
        percentage of predicted negatives that are labelled 1.

    Uses the notebook-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.
    """
    # Work on a copy: removing the two extra keys from the caller's dictionary
    # would make a second call with the same dictionary fail.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.where(df_train["Binding"].to_numpy() == 0, negative_weight, 1.0)

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        # Predicted probabilities are rounded to hard 0/1 predictions.
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization: every entry is sampled
# uniformly from the given interval, max_depth is picked from a fixed list.
space_gradient_boosting = dict(
    learning_rate = hp.uniform("learning_rate", 0.01, 0.5),
    max_depth = hp.choice("max_depth", [9,10,11,12,13]),
    reg_lambda = hp.uniform("reg_lambda", 0, 5),
    reg_alpha = hp.uniform("reg_alpha", 0, 5),
    max_delta_step = hp.uniform("max_delta_step", 0, 5),
    min_child_weight = hp.uniform("min_child_weight", 0.1, 15),
    num_rounds = hp.uniform("num_rounds", 200, 400),
    weight = hp.uniform("weight", 0.1,0.33),
)

Performing a random search over the hyperparameter space:

In [9]:
# The search is disabled by wrapping it in a string literal (the trailing ";"
# suppresses the cell output). When enabled, it calls fmin with rand.suggest on the
# same Trials object, raising max_evals by one per iteration, and logs the
# iteration number, the best loss so far and trials.argmin.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [10]:
# Hyperparameters used for the ESM1b + ECFP model. "num_rounds" and "weight" are
# not xgboost parameters; they are taken out of the dictionary before training.
param = dict(
    learning_rate = 0.12771337495138718,
    max_delta_step = 3.080851382419611,
    max_depth = 13,
    min_child_weight = 2.68947814956559,
    num_rounds = 332.92969059815346,
    reg_alpha = 1.4293630231664674,
    reg_lambda = 0.12220981612600046,
    weight = 0.11412319177763543,
)

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [11]:
# Split the tuned settings into the number of boosting rounds, the per-sample
# weights and the dictionary that is handed to xgboost.
# Note: this removes "num_rounds" and "weight" from param, so the cell that
# defines param has to be re-run before this cell is executed again.
num_round = param["num_rounds"]
param["tree_method"] = "gpu_hist"
param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'
# Training points with Binding == 0 are down-weighted; all others get weight 1.0.
weights = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["Binding"]])

del param["num_rounds"]
del param["weight"]


loss = []
accuracy = []
ROC_AUC = []

for i in range(5):
    train_index, test_index  = train_indices[i], test_indices[i]
    dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                     label = np.array(train_y[train_index]))
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    # Predict once; the probabilities feed the ROC-AUC score and their rounded
    # values are the hard 0/1 predictions.
    y_valid_prob = bst.predict(dvalid)
    y_valid_pred = np.round(y_valid_prob)
    validation_y = np.array(train_y[test_index])

    #calculate loss:
    false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    #calculate accuracy:
    accuracy.append(np.mean(y_valid_pred == validation_y))
    #calculate ROC-AUC score:
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_prob))

print("Loss values: %s" %loss) 
print("Accuracies: %s" %accuracy)
print("ROC-AUC scores: %s" %ROC_AUC)

np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "acc_CV_xgboost_ESM1b_ECFP.npy"), np.array(accuracy))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "loss_CV_xgboost_ESM1b_ECFP.npy"), np.array(loss))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "ROC_AUC_CV_xgboost_ESM1b_ECFP.npy"), np.array(ROC_AUC))

Loss values: [135.64613843194581, 146.31281276919225, 149.789174479733, 159.6153340309748, 137.4009624484746]
Accuracies: [0.8691790040376851, 0.8684022186437645, 0.8662160991248592, 0.8606942686675104, 0.8758217677136596]
ROC-AUC scores: [0.9426084967554512, 0.9383771038523977, 0.9347969874349884, 0.9294492918142963, 0.9389172581369728]


#### (iv) Training and validating the final model

Training the model and validating it on the test set:

In [12]:
# Train on the complete training set with the tuned settings and evaluate on the test set.
dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                    feature_names= feature_names)

bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
# Predict once: the probabilities are used for ROC-AUC, their rounded values
# for accuracy and MCC.
y_test_prob = bst.predict(dtest)
y_test_pred = np.round(y_test_prob)
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), y_test_prob)
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

#np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_pred_xgboost_ESM1b_ECFP.npy"), bst.predict(dtest))
#np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_true_xgboost_ESM1b_ECFP.npy"), test_y)

Accuracy on test set: 0.8722044728434505, ROC-AUC score for test set: 0.9368380603439415, MCC: 0.6915652351366568


### (b) ESM1b and GNN (pre-trained):

#### (i) Creating numpy arrays with input vectors and output variables

In [13]:
def create_input_and_output_data(df):
    """Build one input vector and one label per row of ``df``.

    The input vector is the row's "GNN rep (pretrained)" entry followed by the
    row's "ESM1b" entry. The label is the row's "Binding" value.

    Returns:
        (X, y): a tuple of input vectors and a tuple of labels, in row order.
    """
    embeddings, gnn_reps, labels = df["ESM1b"], df["GNN rep (pretrained)"], df["Binding"]
    # Collect in lists and convert once at the end; growing a tuple by
    # concatenation copies everything collected so far in every iteration.
    X = []
    y = []
    for ind in df.index:
        X.append(np.concatenate([gnn_reps[ind], embeddings[ind]]))
        y.append(labels[ind])

    return(tuple(X), tuple(y))

# Feature matrices and label vectors for the training and the test set.
train_X, train_y = (np.array(part) for part in create_input_and_output_data(df = df_train))
test_X, test_y = (np.array(part) for part in create_input_and_output_data(df = df_test))

# One name per input column: the GNN representation first, then the enzyme embedding.
feature_names = (["GNN rep_" + str(i) for i in range(50)]
                 + ["ESM1b_" + str(i) for i in range(1280)])

#### (ii) Performing hyperparameter optimization

In [14]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Return the mean loss of a 5-fold cross-validation for one hyperparameter set.

    Parameters:
        param: dictionary of xgboost parameters that additionally holds
            "num_rounds" (number of boosting rounds, truncated to an integer) and
            "weight" (sample weight given to training points with Binding == 0;
            all other points get weight 1.0). The dictionary is not modified.

    Returns:
        The loss ``2*false_negative**2 + false_positive**1.3`` averaged over the
        five folds. Assuming 0/1 labels, ``false_positive`` is the percentage of
        predicted positives that are labelled 0 and ``false_negative`` the
        percentage of predicted negatives that are labelled 1.

    Uses the notebook-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.
    """
    # Work on a copy: removing the two extra keys from the caller's dictionary
    # would make a second call with the same dictionary fail.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.where(df_train["Binding"].to_numpy() == 0, negative_weight, 1.0)

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        # Predicted probabilities are rounded to hard 0/1 predictions.
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization: every entry is sampled
# uniformly from the given interval, max_depth is picked from a fixed list.
space_gradient_boosting = dict(
    learning_rate = hp.uniform("learning_rate", 0.01, 0.5),
    max_depth = hp.choice("max_depth", [9,10,11,12,13]),
    reg_lambda = hp.uniform("reg_lambda", 0, 5),
    reg_alpha = hp.uniform("reg_alpha", 0, 5),
    max_delta_step = hp.uniform("max_delta_step", 0, 5),
    min_child_weight = hp.uniform("min_child_weight", 0.1, 15),
    num_rounds = hp.uniform("num_rounds", 200, 400),
    weight = hp.uniform("weight", 0.1,0.33),
)

Performing a random search over the hyperparameter space:

In [15]:
# The search is disabled by wrapping it in a string literal (the trailing ";"
# suppresses the cell output). When enabled, it calls fmin with rand.suggest on the
# same Trials object, raising max_evals by one per iteration, and logs the
# iteration number, the best loss so far and trials.argmin.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [16]:
# Hyperparameters used for the ESM1b + pre-trained GNN model. "num_rounds" and
# "weight" are not xgboost parameters; they are taken out before training.
param = dict(
    learning_rate = 0.09207371208675638,
    max_delta_step = 1.6501026095681381,
    max_depth = 12,
    min_child_weight = 4.385828776339477,
    num_rounds = 361.4040208821599,
    reg_alpha = 2.8139614176935313,
    reg_lambda = 1.1521733347508363,
    weight = 0.14162338902155536,
)

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [17]:
# Take the two settings that are not xgboost parameters out of the dictionary:
# the number of boosting rounds and the weight of points with Binding == 0.
num_round = param.pop("num_rounds")
weights = np.array([1.0 if binding != 0 else param["weight"] for binding in df_train["Binding"]])
param.pop("weight")

# Fixed training settings shared by all runs.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              'objective': 'binary:logistic'})

In [18]:
# Repeat the 5-fold cross-validation with the tuned settings and record the
# loss, the accuracy and the ROC-AUC score of every fold.
loss = []
accuracy = []
ROC_AUC = []

for i in range(5):
    train_index, test_index  = train_indices[i], test_indices[i]
    dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                     label = np.array(train_y[train_index]))
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    # Predict once; the probabilities feed the ROC-AUC score and their rounded
    # values are the hard 0/1 predictions.
    y_valid_prob = bst.predict(dvalid)
    y_valid_pred = np.round(y_valid_prob)
    validation_y = np.array(train_y[test_index])

    #calculate loss:
    false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    #calculate accuracy:
    accuracy.append(np.mean(y_valid_pred == validation_y))
    #calculate ROC-AUC score:
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_prob))

print("Loss values: %s" %loss) 
print("Accuracies: %s" %accuracy)
print("ROC-AUC scores: %s" %ROC_AUC)

np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "acc_CV_xgboost_ESM1b_GNN_pretrained.npy"), np.array(accuracy))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "loss_CV_xgboost_ESM1b_GNN_pretrained.npy"), np.array(loss))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "ROC_AUC_CV_xgboost_ESM1b_GNN_pretrained.npy"), np.array(ROC_AUC))

Loss values: [121.97575964696276, 132.66941366275, 143.92507718249186, 155.63028256825766, 139.25640504585414]
Accuracies: [0.8886496186630776, 0.8872785829307569, 0.8777402304826272, 0.8782317844874344, 0.8850438276113952]
ROC-AUC scores: [0.9474577315968011, 0.9433819919446351, 0.9373767455135212, 0.9336666463444948, 0.9437762941205631]


#### (iv) Training and validating the final model
Training the model and validating it on the test set:

In [19]:
# Train on the complete training set with the tuned settings and evaluate on the test set.
dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                    feature_names= feature_names)

bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
# Predict once: the probabilities are used for ROC-AUC and are saved, their
# rounded values are used for accuracy and MCC.
y_test_prob = bst.predict(dtest)
y_test_pred = np.round(y_test_prob)
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), y_test_prob)
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_pred_xgboost_ESM1b_GNN_pretrained.npy"), y_test_prob)
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_true_xgboost_ESM1b_GNN_pretrained.npy"), test_y)

Accuracy on test set: 0.8944943903707556, ROC-AUC score for test set: 0.9415865958288083, MCC: 0.7327206623426592


### (c) ESM1b_ts and ECFP:

#### (i) Creating numpy arrays with input vectors and output variables

In [20]:
def create_input_and_output_data(df):
    """Build one input vector and one label per row of ``df``.

    The input vector is the row's "ECFP" entry, converted element by element to
    integers, followed by the row's "ESM1b_ts" entry. The label is the row's
    "Binding" value.

    Returns:
        (X, y): a tuple of input vectors and a tuple of labels, in row order.
    """
    embeddings, fingerprints, labels = df["ESM1b_ts"], df["ECFP"], df["Binding"]
    # Collect in lists and convert once at the end; growing a tuple by
    # concatenation copies everything collected so far in every iteration.
    X = []
    y = []
    for ind in df.index:
        ecfp = np.array(list(fingerprints[ind])).astype(int)
        X.append(np.concatenate([ecfp, embeddings[ind]]))
        y.append(labels[ind])

    return(tuple(X), tuple(y))

# Feature matrices and label vectors for the training and the test set.
train_X, train_y = (np.array(part) for part in create_input_and_output_data(df = df_train))
test_X, test_y = (np.array(part) for part in create_input_and_output_data(df = df_test))

# One name per input column: the fingerprint bits first, then the enzyme embedding.
feature_names = (["ECFP_" + str(i) for i in range(1024)]
                 + ["ESM1b_ts_" + str(i) for i in range(1280)])

#### (ii) Performing hyperparameter optimization

In [22]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Return the mean loss of a 5-fold cross-validation for one hyperparameter set.

    Parameters:
        param: dictionary of xgboost parameters that additionally holds
            "num_rounds" (number of boosting rounds, truncated to an integer) and
            "weight" (sample weight given to training points with Binding == 0;
            all other points get weight 1.0). The dictionary is not modified.

    Returns:
        The loss ``2*false_negative**2 + false_positive**1.3`` averaged over the
        five folds. Assuming 0/1 labels, ``false_positive`` is the percentage of
        predicted positives that are labelled 0 and ``false_negative`` the
        percentage of predicted negatives that are labelled 1.

    Uses the notebook-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.
    """
    # Work on a copy: removing the two extra keys from the caller's dictionary
    # would make a second call with the same dictionary fail.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.where(df_train["Binding"].to_numpy() == 0, negative_weight, 1.0)

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        # Predicted probabilities are rounded to hard 0/1 predictions.
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization: every entry is sampled
# uniformly from the given interval, max_depth is picked from a fixed list.
space_gradient_boosting = dict(
    learning_rate = hp.uniform("learning_rate", 0.01, 0.5),
    max_depth = hp.choice("max_depth", [9,10,11,12,13]),
    reg_lambda = hp.uniform("reg_lambda", 0, 5),
    reg_alpha = hp.uniform("reg_alpha", 0, 5),
    max_delta_step = hp.uniform("max_delta_step", 0, 5),
    min_child_weight = hp.uniform("min_child_weight", 0.1, 15),
    num_rounds = hp.uniform("num_rounds", 200, 400),
    weight = hp.uniform("weight", 0.1,0.33),
)

Performing a random search over the hyperparameter space:

In [23]:
# The search is disabled by wrapping it in a string literal (the trailing ";"
# suppresses the cell output). When enabled, it calls fmin with rand.suggest on the
# same Trials object, raising max_evals by one per iteration, and logs the
# iteration number, the best loss so far and trials.argmin.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [24]:
# Hyperparameters used for the ESM1b_ts + ECFP model.
param = dict(
    learning_rate = 0.31553117247348733,
    max_delta_step = 1.7726044219753656,
    max_depth = 10,
    min_child_weight = 1.3845040588450772,
    num_rounds = 342.68325188584106,
    reg_alpha = 0.531395259755843,
    reg_lambda = 3.744980563764689,
    weight = 0.26187490421514203,
)

# Take the two settings that are not xgboost parameters out of the dictionary:
# the number of boosting rounds and the weight of points with Binding == 0.
num_round = param.pop("num_rounds")
weights = np.array([1.0 if binding != 0 else param["weight"] for binding in df_train["Binding"]])
param.pop("weight")

# Fixed training settings shared by all runs.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              'objective': 'binary:logistic'})

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [25]:
# Repeat the 5-fold cross-validation with the tuned settings and record the
# loss, the accuracy and the ROC-AUC score of every fold.
loss = []
accuracy = []
ROC_AUC = []

for i in range(5):
    train_index, test_index  = train_indices[i], test_indices[i]
    dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                     label = np.array(train_y[train_index]))
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    # Predict once; the probabilities feed the ROC-AUC score and their rounded
    # values are the hard 0/1 predictions.
    y_valid_prob = bst.predict(dvalid)
    y_valid_pred = np.round(y_valid_prob)
    validation_y = np.array(train_y[test_index])

    #calculate loss:
    false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    #calculate accuracy:
    accuracy.append(np.mean(y_valid_pred == validation_y))
    #calculate ROC-AUC score:
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_prob))

print("Loss values: %s" %loss) 
print("Accuracies: %s" %accuracy)
print("ROC-AUC scores: %s" %ROC_AUC)

np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "acc_CV_xgboost_ESM1b_ts_ECFP.npy"), np.array(accuracy))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "loss_CV_xgboost_ESM1b_ts_ECFP.npy"), np.array(loss))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "ROC_AUC_CV_xgboost_ESM1b_ts_ECFP.npy"), np.array(ROC_AUC))

Loss values: [108.68044529838687, 113.38285635774665, 116.75072442009302, 125.35861908227079, 119.36638679297242]
Accuracies: [0.9103633916554509, 0.9104490964394346, 0.9081535395546313, 0.9067980473693726, 0.908235938641344]
ROC-AUC scores: [0.956404258764795, 0.9524134068104446, 0.9484047612505799, 0.9470567532760683, 0.951539293645603]


#### (iv) Training and validating the final model
Training the model and validating it on the test set:

In [26]:
# Train on the complete training set with the tuned settings and evaluate on the test set.
dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                    feature_names= feature_names)

bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
# Predict once: the probabilities are used for ROC-AUC and are saved, their
# rounded values are used for accuracy and MCC.
y_test_prob = bst.predict(dtest)
y_test_pred = np.round(y_test_prob)
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), y_test_prob)
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_pred_xgboost_ESM1b_ts_ECFP.npy"), y_test_prob)
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_true_xgboost_ESM1b_ts_ECFP.npy"), test_y)

Accuracy on test set: 0.904599152983134, ROC-AUC score for test set: 0.9494417844088786, MCC: 0.7541937162526161


### (d) ESM1b_ts and ECFP (512-dimensional):

#### (i) Creating numpy arrays with input vectors and output variables

In [27]:
def create_input_and_output_data(df):
    """Build one input vector and one label per row of ``df``.

    The input vector is the row's "ECFP_512" entry, converted element by element
    to integers, followed by the row's "ESM1b_ts" entry. The label is the row's
    "Binding" value.

    Returns:
        (X, y): a tuple of input vectors and a tuple of labels, in row order.
    """
    embeddings, fingerprints, labels = df["ESM1b_ts"], df["ECFP_512"], df["Binding"]
    # Collect in lists and convert once at the end; growing a tuple by
    # concatenation copies everything collected so far in every iteration.
    X = []
    y = []
    for ind in df.index:
        ecfp = np.array(list(fingerprints[ind])).astype(int)
        X.append(np.concatenate([ecfp, embeddings[ind]]))
        y.append(labels[ind])

    return(tuple(X), tuple(y))

# Feature matrices and label vectors for the training and the test set.
train_X, train_y = (np.array(part) for part in create_input_and_output_data(df = df_train))
test_X, test_y = (np.array(part) for part in create_input_and_output_data(df = df_test))

# One name per input column: the fingerprint bits first, then the enzyme embedding.
feature_names = (["ECFP_" + str(i) for i in range(512)]
                 + ["ESM1b_ts_" + str(i) for i in range(1280)])

#### (ii) Performing hyperparameter optimization

In [28]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Return the mean loss of a 5-fold cross-validation for one hyperparameter set.

    Parameters:
        param: dictionary of xgboost parameters that additionally holds
            "num_rounds" (number of boosting rounds, truncated to an integer) and
            "weight" (sample weight given to training points with Binding == 0;
            all other points get weight 1.0). The dictionary is not modified.

    Returns:
        The loss ``2*false_negative**2 + false_positive**1.3`` averaged over the
        five folds. Assuming 0/1 labels, ``false_positive`` is the percentage of
        predicted positives that are labelled 0 and ``false_negative`` the
        percentage of predicted negatives that are labelled 1.

    Uses the notebook-level ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.
    """
    # Work on a copy: removing the two extra keys from the caller's dictionary
    # would make a second call with the same dictionary fail.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.where(df_train["Binding"].to_numpy() == 0, negative_weight, 1.0)

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        # Predicted probabilities are rounded to hard 0/1 predictions.
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = np.array(train_y[test_index])

        false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
        false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization: every entry is sampled
# uniformly from the given interval, max_depth is picked from a fixed list.
space_gradient_boosting = dict(
    learning_rate = hp.uniform("learning_rate", 0.01, 0.5),
    max_depth = hp.choice("max_depth", [9,10,11,12,13]),
    reg_lambda = hp.uniform("reg_lambda", 0, 5),
    reg_alpha = hp.uniform("reg_alpha", 0, 5),
    max_delta_step = hp.uniform("max_delta_step", 0, 5),
    min_child_weight = hp.uniform("min_child_weight", 0.1, 15),
    num_rounds = hp.uniform("num_rounds", 200, 400),
    weight = hp.uniform("weight", 0.1,0.33),
)

Performing a random search over the hyperparameter space:

In [29]:
# The search is disabled by wrapping it in a string literal (the trailing ";"
# suppresses the cell output). When enabled, it calls fmin with rand.suggest on the
# same Trials object, raising max_evals by one per iteration, and logs the
# iteration number, the best loss so far and trials.argmin.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [30]:
# Hyperparameters used for the ESM1b_ts + 512-dimensional ECFP model.
param = dict(
    learning_rate = 0.20031456821679422,
    max_delta_step = 3.723458003552047,
    max_depth = 10,
    min_child_weight = 2.0109208762032678,
    num_rounds = 347.78525681188614,
    reg_alpha = 2.213525607682663,
    reg_lambda = 4.5822546393906025,
    weight = 0.16604653557737126,
)

# Take the two settings that are not xgboost parameters out of the dictionary:
# the number of boosting rounds and the weight of points with Binding == 0.
num_round = param.pop("num_rounds")
weights = np.array([1.0 if binding != 0 else param["weight"] for binding in df_train["Binding"]])
param.pop("weight")

# Fixed training settings shared by all runs.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              'objective': 'binary:logistic'})

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [31]:
# Repeat the 5-fold cross-validation with the tuned settings and record the
# loss, the accuracy and the ROC-AUC score of every fold.
loss = []
accuracy = []
ROC_AUC = []

for i in range(5):
    train_index, test_index  = train_indices[i], test_indices[i]
    dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                     label = np.array(train_y[train_index]))
    dvalid = xgb.DMatrix(np.array(train_X[test_index]))
    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    # Predict once; the probabilities feed the ROC-AUC score and their rounded
    # values are the hard 0/1 predictions.
    y_valid_prob = bst.predict(dvalid)
    y_valid_pred = np.round(y_valid_prob)
    validation_y = np.array(train_y[test_index])

    #calculate loss:
    false_positive = 100*(1-np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*(np.mean(validation_y[y_valid_pred == 0]))
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    #calculate accuracy:
    accuracy.append(np.mean(y_valid_pred == validation_y))
    #calculate ROC-AUC score:
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_prob))

print("Loss values: %s" %loss) 
print("Accuracies: %s" %accuracy)
print("ROC-AUC scores: %s" %ROC_AUC)

np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "acc_CV_xgboost_ESM1b_ts_ECFP_512.npy"), np.array(accuracy))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "loss_CV_xgboost_ESM1b_ts_ECFP_512.npy"), np.array(loss))
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "ROC_AUC_CV_xgboost_ESM1b_ts_ECFP_512.npy"), np.array(ROC_AUC))

Loss values: [104.16043379763315, 118.19757372302232, 124.29384769765849, 132.1245948240752, 119.5747769265525]
Accuracies: [0.8947510094212652, 0.8884415816782967, 0.8868382289229703, 0.8823901645272103, 0.8911614317019723]
ROC-AUC scores: [0.9559755847455434, 0.9496475323966114, 0.9486398091770489, 0.9454227918700563, 0.9495638094101112]


#### (iv) Training and validating the final model
Training the model and validating it on the test set:

In [32]:
# Train on the complete training set with the tuned settings and evaluate on the test set.
dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                    feature_names= feature_names)

bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
# Predict once: the probabilities are used for ROC-AUC and are saved, their
# rounded values are used for accuracy and MCC.
y_test_prob = bst.predict(dtest)
y_test_pred = np.round(y_test_prob)
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), y_test_prob)
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_pred_xgboost_ESM1b_ts_ECFP_512.npy"), y_test_prob)
np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_true_xgboost_ESM1b_ts_ECFP_512.npy"), test_y)

Accuracy on test set: 0.8875102162122, ROC-AUC score for test set: 0.9467608865614003, MCC: 0.7238779924304148


### (e) ESM1b_ts and ECFP (2048-dimensional):

#### (i) Creating numpy arrays with input vectors and output variables

In [33]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the fingerprint stored in column "ECFP_2048" is converted
    element by element to integers and concatenated with the enzyme
    representation from column "ESM1b_ts" (fingerprint first).

    Args:
        df: DataFrame with the columns "ESM1b_ts", "ECFP_2048" and "Binding".

    Returns:
        Tuple ``(X, y)`` of two tuples with one entry per row, in row order:
        the concatenated input vectors and the "Binding" values.
    """
    # Collect in lists and convert once at the end; growing a tuple with
    # ``X = X + (item,)`` copies the whole tuple on every iteration.
    inputs = []
    labels = []
    for emb, fingerprint, binding in zip(df["ESM1b_ts"], df["ECFP_2048"], df["Binding"]):
        bits = np.array(list(fingerprint)).astype(int)
        inputs.append(np.concatenate([bits, emb]))
        labels.append(binding)
    return (tuple(inputs), tuple(labels))

# Input matrices and label vectors for the training and the test set.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# Column names in the same order as the concatenation: fingerprint bits first,
# enzyme representation second.
feature_names = (["ECFP_" + str(bit) for bit in range(2048)]
                 + ["ESM1b_ts_" + str(dim) for dim in range(1280)])

#### (ii) Performing hyperparameter optimization

In [34]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Objective for the hyperparameter search: mean custom loss over the 5 CV folds.

    Despite its name, the function does not return a negative accuracy. For
    each fold a booster is trained on the training part, the predictions for
    the validation part are rounded to 0/1 and two percentages are computed
    (assuming 0/1 labels):

    * ``false_positive``: share of predicted positives whose label is 0,
    * ``false_negative``: share of predicted negatives whose label is 1.

    The fold loss is ``2*false_negative**2 + false_positive**1.3``.

    Uses the notebook globals ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.

    Args:
        param: dict with the xgboost parameters plus two extra entries,
            ``"num_rounds"`` (number of boosting rounds, truncated with
            ``int``) and ``"weight"`` (sample weight of data points with
            ``Binding == 0``; all other data points get weight 1.0).
            The dict is left unmodified.

    Returns:
        Mean of the five fold losses.
    """
    # Work on a copy: the extra entries are removed and fixed settings are
    # added below. Doing that on the caller's dict made a second call with the
    # same dict fail with a KeyError.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0 for binding in df_train["Binding"]])

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = train_y[test_index]

        # Both values are conditioned on the *predicted* class, i.e. they are
        # not the usual false positive / false negative rates.
        false_positive = 100*(1-np.mean(np.array(validation_y)[y_valid_pred == 1]))
        false_negative = 100*(np.mean(np.array(validation_y)[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization. Every key is either an
# xgboost parameter or one of the two extra entries ("num_rounds", "weight")
# that the objective function removes before calling xgb.train.
# "num_rounds" is sampled as a float; the objective truncates it with int().
# NOTE(review): for hp.choice entries hyperopt presumably reports the index
# into the list (not the value) in `trials.argmin` - confirm when reading results.
space_gradient_boosting = {"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
                            "max_depth": hp.choice("max_depth", [9,10,11,12,13]),
                            "reg_lambda": hp.uniform("reg_lambda", 0, 5),
                            "reg_alpha": hp.uniform("reg_alpha", 0, 5),
                            "max_delta_step": hp.uniform("max_delta_step", 0, 5),
                            "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
                            "num_rounds":  hp.uniform("num_rounds", 200, 400),
                            "weight" : hp.uniform("weight", 0.1,0.33)}

Performing a random search:

In [35]:
# The random search is disabled: its code is kept inside a string literal, so
# running this cell does nothing (the trailing ';' suppresses the cell output).
# When enabled, fmin is called with a growing max_evals on the same Trials
# object, so each iteration adds one evaluation and logs the best loss so far.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [36]:
# Hard-coded result of the hyperparameter search.
# NOTE(review): num_rounds (185.5) lies below the 200-400 range of the search
# space defined above - presumably the search was run with another range; confirm.
param = {
    "learning_rate": 0.38572930012069273,
    "max_delta_step": 2.3691090709866254,
    "max_depth": 13,
    "min_child_weight": 0.11946441222742316,
    "num_rounds": 185.5041665008057,
    "reg_alpha": 0.4492202436042625,
    "reg_lambda": 0.8927451484733545,
    "weight": 0.10881477775175043,
}

# "num_rounds" and "weight" are not passed to xgboost inside `param`: the first
# becomes the number of boosting rounds, the second the sample weight of all
# data points with Binding == 0 (all other data points keep weight 1.0).
num_round = param.pop("num_rounds")
weights = np.where(df_train["Binding"].to_numpy() == 0, param.pop("weight"), 1.0)

# Fixed training settings.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              "objective": "binary:logistic"})

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [37]:
# Repeat the 5-fold CV with the selected hyperparameters; one value per fold.
loss, accuracy, ROC_AUC = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]
    dtrain = xgb.DMatrix(train_X[train_index], label=train_y[train_index],
                         weight=weights[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

    validation_y = train_y[test_index]
    y_valid_score = bst.predict(dvalid)   # raw predictions, used for ROC-AUC
    y_valid_pred = np.round(y_valid_score)  # 0/1 predictions

    # Loss: percentages of wrong labels among the predicted positives and the
    # predicted negatives, combined as in the hyperparameter search.
    false_positive = 100*(1 - np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*np.mean(validation_y[y_valid_pred == 0])
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    accuracy.append(np.mean(y_valid_pred == validation_y))
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_score))

print("Loss values: %s" % loss)
print("Accuracies: %s" % accuracy)
print("ROC-AUC scores: %s" % ROC_AUC)

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "acc_CV_xgboost_ESM1b_ts_ECFP_2048.npy"), np.array(accuracy))
np.save(join(results_dir, "loss_CV_xgboost_ESM1b_ts_ECFP_2048.npy"), np.array(loss))
np.save(join(results_dir, "ROC_AUC_CV_xgboost_ESM1b_ts_ECFP_2048.npy"), np.array(ROC_AUC))

Loss values: [105.30552432607526, 115.14142937211616, 113.2335149181909, 133.1620701477483, 122.36331116450947]
Accuracies: [0.9023777478689996, 0.8997137233852209, 0.9020015596568755, 0.8933285120231423, 0.8975529583637691]
ROC-AUC scores: [0.9565390334782213, 0.9511082440869931, 0.9489242322179354, 0.9441804521532098, 0.9494393660817402]


#### (iv) Training and validating the final model
Training the model and validating it on the test set:

In [38]:
# Train on the complete training set, then evaluate on the test set.
dtrain = xgb.DMatrix(np.asarray(train_X), label=np.asarray(train_y), weight=weights,
                     feature_names=feature_names)
dtest = xgb.DMatrix(np.asarray(test_X), label=np.asarray(test_y),
                    feature_names=feature_names)
bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Predict once: the raw predictions are used for ROC-AUC and are saved to disk,
# their rounded values (0/1) are used for accuracy and MCC.
y_test_score = bst.predict(dtest)
y_test_pred = np.round(y_test_score)
acc_test = np.mean(y_test_pred == np.asarray(test_y))
roc_auc = roc_auc_score(np.asarray(test_y), y_test_score)
mcc = matthews_corrcoef(np.asarray(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "y_test_pred_xgboost_ESM1b_ts_ECFP_2048.npy"), y_test_score)
np.save(join(results_dir, "y_test_true_xgboost_ESM1b_ts_ECFP_2048.npy"), test_y)

Accuracy on test set: 0.8977635782747604, ROC-AUC score for test set: 0.9484819112091449, MCC: 0.7428505008103589


### (f) ESM1b_ts and GNN (pretrained):

#### (i) Creating numpy arrays with input vectors and output variables

In [39]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the molecule vector from column "GNN rep (pretrained)" is
    concatenated with the enzyme representation from column "ESM1b_ts"
    (molecule vector first).

    Args:
        df: DataFrame with the columns "ESM1b_ts", "GNN rep (pretrained)"
            and "Binding".

    Returns:
        Tuple ``(X, y)`` of two tuples with one entry per row, in row order:
        the concatenated input vectors and the "Binding" values.
    """
    # Collect in lists and convert once at the end; growing a tuple with
    # ``X = X + (item,)`` copies the whole tuple on every iteration.
    inputs = []
    labels = []
    for emb, gnn_rep, binding in zip(df["ESM1b_ts"], df["GNN rep (pretrained)"], df["Binding"]):
        inputs.append(np.concatenate([gnn_rep, emb]))
        labels.append(binding)
    return (tuple(inputs), tuple(labels))

# Input matrices and label vectors for the training and the test set.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# Column names in the same order as the concatenation: molecule vector first,
# enzyme representation second.
feature_names = (["GNN rep_" + str(dim) for dim in range(50)]
                 + ["ESM1b_" + str(dim) for dim in range(1280)])

#### (ii) Performing hyperparameter optimization

In [40]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Objective for the hyperparameter search: mean custom loss over the 5 CV folds.

    Despite its name, the function does not return a negative accuracy. For
    each fold a booster is trained on the training part, the predictions for
    the validation part are rounded to 0/1 and two percentages are computed
    (assuming 0/1 labels):

    * ``false_positive``: share of predicted positives whose label is 0,
    * ``false_negative``: share of predicted negatives whose label is 1.

    The fold loss is ``2*false_negative**2 + false_positive**1.3``.

    Uses the notebook globals ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.

    Args:
        param: dict with the xgboost parameters plus two extra entries,
            ``"num_rounds"`` (number of boosting rounds, truncated with
            ``int``) and ``"weight"`` (sample weight of data points with
            ``Binding == 0``; all other data points get weight 1.0).
            The dict is left unmodified.

    Returns:
        Mean of the five fold losses.
    """
    # Work on a copy: the extra entries are removed and fixed settings are
    # added below. Doing that on the caller's dict made a second call with the
    # same dict fail with a KeyError.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0 for binding in df_train["Binding"]])

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = train_y[test_index]

        # Both values are conditioned on the *predicted* class, i.e. they are
        # not the usual false positive / false negative rates.
        false_positive = 100*(1-np.mean(np.array(validation_y)[y_valid_pred == 1]))
        false_negative = 100*(np.mean(np.array(validation_y)[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization. Every key is either an
# xgboost parameter or one of the two extra entries ("num_rounds", "weight")
# that the objective function removes before calling xgb.train.
# "num_rounds" is sampled as a float; the objective truncates it with int().
# NOTE(review): for hp.choice entries hyperopt presumably reports the index
# into the list (not the value) in `trials.argmin` - confirm when reading results.
space_gradient_boosting = {"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "max_depth": hp.choice("max_depth", [9,10,11,12,13]),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 200, 400),
    "weight" : hp.uniform("weight", 0.1,0.33)}

Performing a random search:

In [41]:
# The random search is disabled: its code is kept inside a string literal, so
# running this cell does nothing (the trailing ';' suppresses the cell output).
# When enabled, fmin is called with a growing max_evals on the same Trials
# object, so each iteration adds one evaluation and logs the best loss so far.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [42]:
# Hard-coded result of the hyperparameter search.
param = {
    "learning_rate": 0.19789627044374644,
    "max_delta_step": 3.815106738298364,
    "max_depth": 12,
    "min_child_weight": 0.9568708633806051,
    "num_rounds": 358.42154962618235,
    "reg_alpha": 0.3726209284173021,
    "reg_lambda": 4.442065146895246,
    "weight": 0.11281944917093198,
}

# "num_rounds" and "weight" are not passed to xgboost inside `param`: the first
# becomes the number of boosting rounds, the second the sample weight of all
# data points with Binding == 0 (all other data points keep weight 1.0).
num_round = param.pop("num_rounds")
weights = np.where(df_train["Binding"].to_numpy() == 0, param.pop("weight"), 1.0)

# Fixed training settings.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              "objective": "binary:logistic"})

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [43]:
# Repeat the 5-fold CV with the selected hyperparameters; one value per fold.
loss, accuracy, ROC_AUC = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]
    dtrain = xgb.DMatrix(train_X[train_index], label=train_y[train_index],
                         weight=weights[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

    validation_y = train_y[test_index]
    y_valid_score = bst.predict(dvalid)   # raw predictions, used for ROC-AUC
    y_valid_pred = np.round(y_valid_score)  # 0/1 predictions

    # Loss: percentages of wrong labels among the predicted positives and the
    # predicted negatives, combined as in the hyperparameter search.
    false_positive = 100*(1 - np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*np.mean(validation_y[y_valid_pred == 0])
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    accuracy.append(np.mean(y_valid_pred == validation_y))
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_score))

print("Loss values: %s" % loss)
print("Accuracies: %s" % accuracy)
print("ROC-AUC scores: %s" % ROC_AUC)

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "acc_CV_xgboost_ESM1b_ts_GNN_pretrained.npy"), np.array(accuracy))
np.save(join(results_dir, "loss_CV_xgboost_ESM1b_ts_GNN_pretrained.npy"), np.array(loss))
np.save(join(results_dir, "ROC_AUC_CV_xgboost_ESM1b_ts_GNN_pretrained.npy"), np.array(ROC_AUC))

Loss values: [91.88073293137697, 99.67882761750269, 109.23993476370345, 120.01888413247693, 106.0349108806078]
Accuracies: [0.917900403768506, 0.9150116299874754, 0.9098864916385062, 0.9069788465015368, 0.9120708546384222]
ROC-AUC scores: [0.9604579328670103, 0.9560894222080177, 0.9530202250038822, 0.9515046007045915, 0.9566354023173045]


In [44]:
# Train on the complete training set, then evaluate on the test set.
dtrain = xgb.DMatrix(np.asarray(train_X), label=np.asarray(train_y), weight=weights,
                     feature_names=feature_names)
dtest = xgb.DMatrix(np.asarray(test_X), label=np.asarray(test_y),
                    feature_names=feature_names)
bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Predict once: the raw predictions are used for ROC-AUC and are saved to disk,
# their rounded values (0/1) are used for accuracy and MCC.
y_test_score = bst.predict(dtest)
y_test_pred = np.round(y_test_score)
acc_test = np.mean(y_test_pred == np.asarray(test_y))
roc_auc = roc_auc_score(np.asarray(test_y), y_test_score)
mcc = matthews_corrcoef(np.asarray(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "y_test_pred_xgboost_ESM1b_ts_GNN_pretrained.npy"), y_test_score)
np.save(join(results_dir, "y_test_true_xgboost_ESM1b_ts_GNN_pretrained.npy"), test_y)

Accuracy on test set: 0.9181217029496991, ROC-AUC score for test set: 0.9562220354082519, MCC: 0.7889261156069193


### (g) ESM1b_ts (mean representation) and GNN (pretrained):

#### (i) Creating numpy arrays with input vectors and output variables

In [45]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the molecule vector from column "GNN rep (pretrained)" is
    concatenated with the enzyme representation from column "ESM1b_ts_mean"
    (molecule vector first).

    Args:
        df: DataFrame with the columns "ESM1b_ts_mean", "GNN rep (pretrained)"
            and "Binding".

    Returns:
        Tuple ``(X, y)`` of two tuples with one entry per row, in row order:
        the concatenated input vectors and the "Binding" values.
    """
    # Collect in lists and convert once at the end; growing a tuple with
    # ``X = X + (item,)`` copies the whole tuple on every iteration.
    inputs = []
    labels = []
    for emb, gnn_rep, binding in zip(df["ESM1b_ts_mean"], df["GNN rep (pretrained)"], df["Binding"]):
        inputs.append(np.concatenate([gnn_rep, emb]))
        labels.append(binding)
    return (tuple(inputs), tuple(labels))

# Input matrices and label vectors for the training and the test set.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# Column names in the same order as the concatenation: molecule vector first,
# enzyme representation second.
feature_names = (["GNN rep_" + str(dim) for dim in range(50)]
                 + ["ESM1b_" + str(dim) for dim in range(1280)])

#### (ii) Performing hyperparameter optimization

In [46]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Objective for the hyperparameter search: mean custom loss over the 5 CV folds.

    Despite its name, the function does not return a negative accuracy. For
    each fold a booster is trained on the training part, the predictions for
    the validation part are rounded to 0/1 and two percentages are computed
    (assuming 0/1 labels):

    * ``false_positive``: share of predicted positives whose label is 0,
    * ``false_negative``: share of predicted negatives whose label is 1.

    The fold loss is ``2*false_negative**2 + false_positive**1.3``.

    Uses the notebook globals ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.

    Args:
        param: dict with the xgboost parameters plus two extra entries,
            ``"num_rounds"`` (number of boosting rounds, truncated with
            ``int``) and ``"weight"`` (sample weight of data points with
            ``Binding == 0``; all other data points get weight 1.0).
            The dict is left unmodified.

    Returns:
        Mean of the five fold losses.
    """
    # Work on a copy: the extra entries are removed and fixed settings are
    # added below. Doing that on the caller's dict made a second call with the
    # same dict fail with a KeyError.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0 for binding in df_train["Binding"]])

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = train_y[test_index]

        # Both values are conditioned on the *predicted* class, i.e. they are
        # not the usual false positive / false negative rates.
        false_positive = 100*(1-np.mean(np.array(validation_y)[y_valid_pred == 1]))
        false_negative = 100*(np.mean(np.array(validation_y)[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization. Every key is either an
# xgboost parameter or one of the two extra entries ("num_rounds", "weight")
# that the objective function removes before calling xgb.train.
# "num_rounds" is sampled as a float; the objective truncates it with int().
# NOTE(review): for hp.choice entries hyperopt presumably reports the index
# into the list (not the value) in `trials.argmin` - confirm when reading results.
space_gradient_boosting = {"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "max_depth": hp.choice("max_depth", [9,10,11,12,13]),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 200, 400),
    "weight" : hp.uniform("weight", 0.1,0.33)}

Performing a random search:

In [47]:
# The random search is disabled: its code is kept inside a string literal, so
# running this cell does nothing (the trailing ';' suppresses the cell output).
# When enabled, fmin is called with a growing max_evals on the same Trials
# object, so each iteration adds one evaluation and logs the best loss so far.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [48]:
# Hard-coded result of the hyperparameter search.
# NOTE(review): num_rounds (127.08) lies below the 200-400 range of the search
# space defined above - presumably the search was run with another range; confirm.
param = {
    "learning_rate": 0.1156069031228949,
    "max_delta_step": 2.6918966267028415,
    "max_depth": 12,
    "min_child_weight": 1.9758135134127655,
    "num_rounds": 127.079967978755,
    "reg_alpha": 2.3233749696436714,
    "reg_lambda": 2.163811098458429,
    "weight": 0.28341863683389795,
}

# "num_rounds" and "weight" are not passed to xgboost inside `param`: the first
# becomes the number of boosting rounds, the second the sample weight of all
# data points with Binding == 0 (all other data points keep weight 1.0).
num_round = param.pop("num_rounds")
weights = np.where(df_train["Binding"].to_numpy() == 0, param.pop("weight"), 1.0)

# Fixed training settings.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              "objective": "binary:logistic"})

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [49]:
# Repeat the 5-fold CV with the selected hyperparameters; one value per fold.
loss, accuracy, ROC_AUC = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]
    dtrain = xgb.DMatrix(train_X[train_index], label=train_y[train_index],
                         weight=weights[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

    validation_y = train_y[test_index]
    y_valid_score = bst.predict(dvalid)   # raw predictions, used for ROC-AUC
    y_valid_pred = np.round(y_valid_score)  # 0/1 predictions

    # Loss: percentages of wrong labels among the predicted positives and the
    # predicted negatives, combined as in the hyperparameter search.
    false_positive = 100*(1 - np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*np.mean(validation_y[y_valid_pred == 0])
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    accuracy.append(np.mean(y_valid_pred == validation_y))
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_score))

print("Loss values: %s" % loss)
print("Accuracies: %s" % accuracy)
print("ROC-AUC scores: %s" % ROC_AUC)

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "acc_CV_xgboost_ESM1b_ts_mean_GNN_pretrained.npy"), np.array(accuracy))
np.save(join(results_dir, "loss_CV_xgboost_ESM1b_ts_mean_GNN_pretrained.npy"), np.array(loss))
np.save(join(results_dir, "ROC_AUC_CV_xgboost_ESM1b_ts_mean_GNN_pretrained.npy"), np.array(ROC_AUC))

Loss values: [84.09770237588249, 93.2762385821601, 105.77465665299039, 112.21969637517623, 101.68673959966944]
Accuracies: [0.916733961417676, 0.9144748613347647, 0.9059873494497878, 0.9043572590851564, 0.910427319211103]
ROC-AUC scores: [0.9636427269607835, 0.9587725902992912, 0.9556605491176462, 0.9545229260971622, 0.9585052255048616]


In [50]:
# Train on the complete training set, then evaluate on the test set.
dtrain = xgb.DMatrix(np.asarray(train_X), label=np.asarray(train_y), weight=weights,
                     feature_names=feature_names)
dtest = xgb.DMatrix(np.asarray(test_X), label=np.asarray(test_y),
                    feature_names=feature_names)
bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Predict once: the raw predictions are used for ROC-AUC and are saved to disk,
# their rounded values (0/1) are used for accuracy and MCC.
y_test_score = bst.predict(dtest)
y_test_pred = np.round(y_test_score)
acc_test = np.mean(y_test_pred == np.asarray(test_y))
roc_auc = roc_auc_score(np.asarray(test_y), y_test_score)
mcc = matthews_corrcoef(np.asarray(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "y_test_pred_xgboost_ESM1b_ts_mean_GNN_pretrained.npy"), y_test_score)
np.save(join(results_dir, "y_test_true_xgboost_ESM1b_ts_mean_GNN_pretrained.npy"), test_y)

Accuracy on test set: 0.9101716323649602, ROC-AUC score for test set: 0.956069174706691, MCC: 0.7725739053747791


### (h) ESM1b_ts and GNN (not pre-trained):

#### (i) Creating numpy arrays with input vectors and output variables

In [51]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the molecule vector from column "GNN rep" is concatenated
    with the enzyme representation from column "ESM1b_ts" (molecule vector
    first).

    Args:
        df: DataFrame with the columns "ESM1b_ts", "GNN rep" and "Binding".

    Returns:
        Tuple ``(X, y)`` of two tuples with one entry per row, in row order:
        the concatenated input vectors and the "Binding" values.
    """
    # Collect in lists and convert once at the end; growing a tuple with
    # ``X = X + (item,)`` copies the whole tuple on every iteration.
    inputs = []
    labels = []
    for emb, gnn_rep, binding in zip(df["ESM1b_ts"], df["GNN rep"], df["Binding"]):
        inputs.append(np.concatenate([gnn_rep, emb]))
        labels.append(binding)
    return (tuple(inputs), tuple(labels))

# Input matrices and label vectors for the training and the test set.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# Column names in the same order as the concatenation: molecule vector first,
# enzyme representation second.
feature_names = (["GNN rep_" + str(dim) for dim in range(50)]
                 + ["ESM1b_" + str(dim) for dim in range(1280)])

#### (ii) Performing hyperparameter optimization

In [52]:
def cross_validation_neg_acc_gradient_boosting(param):
    """Objective for the hyperparameter search: mean custom loss over the 5 CV folds.

    Despite its name, the function does not return a negative accuracy. For
    each fold a booster is trained on the training part, the predictions for
    the validation part are rounded to 0/1 and two percentages are computed
    (assuming 0/1 labels):

    * ``false_positive``: share of predicted positives whose label is 0,
    * ``false_negative``: share of predicted negatives whose label is 1.

    The fold loss is ``2*false_negative**2 + false_positive**1.3``.

    Uses the notebook globals ``df_train``, ``train_X``, ``train_y``,
    ``train_indices`` and ``test_indices``.

    Args:
        param: dict with the xgboost parameters plus two extra entries,
            ``"num_rounds"`` (number of boosting rounds, truncated with
            ``int``) and ``"weight"`` (sample weight of data points with
            ``Binding == 0``; all other data points get weight 1.0).
            The dict is left unmodified.

    Returns:
        Mean of the five fold losses.
    """
    # Work on a copy: the extra entries are removed and fixed settings are
    # added below. Doing that on the caller's dict made a second call with the
    # same dict fail with a KeyError.
    param = dict(param)
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([negative_weight if binding == 0 else 1.0 for binding in df_train["Binding"]])

    loss = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(np.array(train_X[train_index]), weight = weights[train_index],
                         label = np.array(train_y[train_index]))
        dvalid = xgb.DMatrix(np.array(train_X[test_index]))
        bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
        y_valid_pred = np.round(bst.predict(dvalid))
        validation_y = train_y[test_index]

        # Both values are conditioned on the *predicted* class, i.e. they are
        # not the usual false positive / false negative rates.
        false_positive = 100*(1-np.mean(np.array(validation_y)[y_valid_pred == 1]))
        false_negative = 100*(np.mean(np.array(validation_y)[y_valid_pred == 0]))
        logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
        loss.append(2*(false_negative**2) + false_positive**1.3)
    return(np.mean(loss))

# Search space for the hyperparameter optimization. Every key is either an
# xgboost parameter or one of the two extra entries ("num_rounds", "weight")
# that the objective function removes before calling xgb.train.
# "num_rounds" is sampled as a float; the objective truncates it with int().
# NOTE(review): for hp.choice entries hyperopt presumably reports the index
# into the list (not the value) in `trials.argmin` - confirm when reading results.
space_gradient_boosting = {"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
    "max_depth": hp.choice("max_depth", [9,10,11,12,13]),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 200, 400),
    "weight" : hp.uniform("weight", 0.1,0.33)}

Performing a random search:

In [53]:
# The random search is disabled: its code is kept inside a string literal, so
# running this cell does nothing (the trailing ';' suppresses the cell output).
# When enabled, fmin is called with a growing max_evals on the same Trials
# object, so each iteration adds one evaluation and logs the best loss so far.
'''trials = Trials()

for i in range(1,2000):
    best = fmin(fn = cross_validation_neg_acc_gradient_boosting, space = space_gradient_boosting,
                algo = rand.suggest, max_evals = i, trials = trials)
    logging.info(i)
    logging.info(trials.best_trial["result"]["loss"])
    logging.info(trials.argmin)''';

Best set of hyperparameters:

In [54]:
# Hard-coded result of the hyperparameter search.
param = {
    "learning_rate": 0.18444025726334898,
    "max_delta_step": 3.2748796106084117,
    "max_depth": 13,
    "min_child_weight": 3.1946753845027738,
    "num_rounds": 314.1036429221291,
    "reg_alpha": 0.48821021807600673,
    "reg_lambda": 2.6236829011598073,
    "weight": 0.1264521266931227,
}

# "num_rounds" and "weight" are not passed to xgboost inside `param`: the first
# becomes the number of boosting rounds, the second the sample weight of all
# data points with Binding == 0 (all other data points keep weight 1.0).
num_round = param.pop("num_rounds")
weights = np.where(df_train["Binding"].to_numpy() == 0, param.pop("weight"), 1.0)

# Fixed training settings.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              "objective": "binary:logistic"})

#### (iii) Repeating 5-fold CV for best set of hyperparameters

In [55]:
# Repeat the 5-fold CV with the selected hyperparameters; one value per fold.
loss, accuracy, ROC_AUC = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]
    dtrain = xgb.DMatrix(train_X[train_index], label=train_y[train_index],
                         weight=weights[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

    validation_y = train_y[test_index]
    y_valid_score = bst.predict(dvalid)   # raw predictions, used for ROC-AUC
    y_valid_pred = np.round(y_valid_score)  # 0/1 predictions

    # Loss: percentages of wrong labels among the predicted positives and the
    # predicted negatives, combined as in the hyperparameter search.
    false_positive = 100*(1 - np.mean(validation_y[y_valid_pred == 1]))
    false_negative = 100*np.mean(validation_y[y_valid_pred == 0])
    logging.info("False positive rate: " + str(false_positive)+ "; False negative rate: " + str(false_negative))
    loss.append(2*(false_negative**2) + false_positive**1.3)
    accuracy.append(np.mean(y_valid_pred == validation_y))
    ROC_AUC.append(roc_auc_score(validation_y, y_valid_score))

print("Loss values: %s" % loss)
print("Accuracies: %s" % accuracy)
print("ROC-AUC scores: %s" % ROC_AUC)

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "acc_CV_xgboost_ESM1b_ts_GNN.npy"), np.array(accuracy))
np.save(join(results_dir, "loss_CV_xgboost_ESM1b_ts_GNN.npy"), np.array(loss))
np.save(join(results_dir, "ROC_AUC_CV_xgboost_ESM1b_ts_GNN.npy"), np.array(ROC_AUC))

Loss values: [95.85892125174448, 101.18538008374776, 107.3774676337244, 113.8545092756927, 103.56995296940914]
Accuracies: [0.9084791386271871, 0.9079441760601181, 0.9039944545533316, 0.9011028747062014, 0.910336011687363]
ROC-AUC scores: [0.9591609388450508, 0.9585739135344192, 0.9547930983970586, 0.9530322066116591, 0.9565295911826541]


#### (iv) Training and validating the final model
Training the model and validating it on the test set:

In [56]:
# Train on the complete training set, then evaluate on the test set.
dtrain = xgb.DMatrix(np.asarray(train_X), label=np.asarray(train_y), weight=weights,
                     feature_names=feature_names)
dtest = xgb.DMatrix(np.asarray(test_X), label=np.asarray(test_y),
                    feature_names=feature_names)
bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Predict once: the raw predictions are used for ROC-AUC and are saved to disk,
# their rounded values (0/1) are used for accuracy and MCC.
y_test_score = bst.predict(dtest)
y_test_pred = np.round(y_test_score)
acc_test = np.mean(y_test_pred == np.asarray(test_y))
roc_auc = roc_auc_score(np.asarray(test_y), y_test_score)
mcc = matthews_corrcoef(np.asarray(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))

results_dir = join(CURRENT_DIR, ".." ,"data", "training_results")
np.save(join(results_dir, "y_test_pred_xgboost_ESM1b_ts_GNN.npy"), y_test_score)
np.save(join(results_dir, "y_test_true_xgboost_ESM1b_ts_GNN.npy"), test_y)

Accuracy on test set: 0.9063823463853183, ROC-AUC score for test set: 0.9546895380125349, MCC: 0.763088327957358


## 3. Training the best model on the combined training and test set (production mode):

In [57]:
def create_input_and_output_data(df):
    """Build the model inputs and labels for every row of ``df``.

    For each row the fingerprint stored in column "ECFP" is converted element
    by element to integers and concatenated with the enzyme representation
    from column "ESM1b_ts" (fingerprint first).

    Args:
        df: DataFrame with the columns "ESM1b_ts", "ECFP" and "Binding".

    Returns:
        Tuple ``(X, y)`` of two tuples with one entry per row, in row order:
        the concatenated input vectors and the "Binding" values.
    """
    # Collect in lists and convert once at the end; growing a tuple with
    # ``X = X + (item,)`` copies the whole tuple on every iteration.
    inputs = []
    labels = []
    for emb, fingerprint, binding in zip(df["ESM1b_ts"], df["ECFP"], df["Binding"]):
        bits = np.array(list(fingerprint)).astype(int)
        inputs.append(np.concatenate([bits, emb]))
        labels.append(binding)
    return (tuple(inputs), tuple(labels))

# Input matrices and label vectors for the training and the test set.
train_X, train_y = map(np.array, create_input_and_output_data(df = df_train))
test_X, test_y = map(np.array, create_input_and_output_data(df = df_test))

# Column names in the same order as the concatenation: fingerprint bits first,
# enzyme representation second.
feature_names = (["ECFP_" + str(bit) for bit in range(1024)]
                 + ["ESM1b_ts_" + str(dim) for dim in range(1280)])

In [58]:
# Production mode: append the test set to the training data.
# Guarded so that re-running this cell does not append the test set a second
# time: train_X has exactly one row per df_train entry only before the merge.
if len(train_X) == len(df_train):
    train_X = np.concatenate([train_X, test_X])
    train_y = np.concatenate([train_y, test_y])

In [59]:
# Hard-coded hyperparameters for the production model.
param = {
    "learning_rate": 0.31553117247348733,
    "max_delta_step": 1.7726044219753656,
    "max_depth": 10,
    "min_child_weight": 1.3845040588450772,
    "num_rounds": 342.68325188584106,
    "reg_alpha": 0.531395259755843,
    "reg_lambda": 3.744980563764689,
    "weight": 0.26187490421514203,
}

# "num_rounds" and "weight" are not passed to xgboost inside `param`: the first
# becomes the number of boosting rounds, the second the sample weight of all
# entries of train_y equal to 0 (all other data points keep weight 1.0).
# The weights are derived from train_y, not df_train, so they match the
# training data after the test set has been appended.
num_round = param.pop("num_rounds")
weights = np.where(np.asarray(train_y) == 0, param.pop("weight"), 1.0)

# Fixed training settings.
param.update({"tree_method": "gpu_hist",
              "sampling_method": "gradient_based",
              "objective": "binary:logistic"})

In [60]:
# Train the production model on the merged training + test data.
# NOTE(review): the test set was appended to train_X / train_y in an earlier
# cell, so the "test set" metrics printed below are computed on data the model
# was trained on and are not a held-out performance estimate.
dtrain = xgb.DMatrix(np.asarray(train_X), label=np.asarray(train_y), weight=weights,
                     feature_names=feature_names)
dtest = xgb.DMatrix(np.asarray(test_X), label=np.asarray(test_y),
                    feature_names=feature_names)
bst = xgb.train(param, dtrain, int(num_round), verbose_eval=1)

# Predict once: raw predictions for ROC-AUC, rounded (0/1) for accuracy and MCC.
y_test_score = bst.predict(dtest)
y_test_pred = np.round(y_test_score)
acc_test = np.mean(y_test_pred == np.asarray(test_y))
roc_auc = roc_auc_score(np.asarray(test_y), y_test_score)
mcc = matthews_corrcoef(np.asarray(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))


Accuracy on test set: 0.995690615944721, ROC-AUC score for test set: 0.999973562453436, MCC: 0.9890341165154867


In [61]:
# Metrics of the production model on its own training data.
# Predict once: raw predictions for ROC-AUC, rounded (0/1) for accuracy and MCC.
y_train_score = bst.predict(dtrain)
y_train_pred = np.round(y_train_score)
acc_train = np.mean(y_train_pred == np.asarray(train_y))
roc_auc = roc_auc_score(np.asarray(train_y), y_train_score)
mcc = matthews_corrcoef(np.asarray(train_y), y_train_pred)

print("Accuracy on train set: %s, ROC-AUC score for train set: %s, MCC: %s"  % (acc_train, roc_auc, mcc))


Accuracy on train set: 0.9952117916840937, ROC-AUC score for train set: 0.9999666505894647, MCC: 0.9878260302574451


In [62]:
# Persist the production model. The context manager closes the file handle
# (and thereby flushes the data) even if pickling fails; the original left
# the handle returned by open() unclosed.
with open(join(CURRENT_DIR, ".." ,"data", "model_weights",
               "xgboost_model_production_mode.dat"), "wb") as model_file:
    pickle.dump(bst, model_file)