# Training a gradient boosting model for enzyme-substrate pair prediction with ESM-1b vectors

### 1. Loading and preprocessing data for model training and evaluation
### 2. Hyperparameter optimization using a 5-fold cross-validation (CV)
### 3. Training and validating the final model

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import sys
import os
import logging
from os.path import join
from sklearn.model_selection import KFold
#from hyperopt import fmin, tpe, hp, Trials, rand
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef

# Add the ./additional_code directory to the import path (the commented-out
# data_preprocessing import below would be resolved from there). The path is
# built with join() so the separator is right on every platform; the previous
# hard-coded '.\\additional_code' only named a real directory on Windows.
sys.path.append(join(".", "additional_code"))
#from data_preprocessing import *

# Working directory of the notebook; the data paths in later cells are built
# relative to it.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\Documents\GitHub\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data for model training and evaluation

### (a) Loading data:

In [2]:
# NOTE(review): read_pickle can execute arbitrary code while loading; only
# point these paths at trusted, project-generated files.

# Test split: keep only rows whose "ESM1b_ts" entry is not the empty string.
df_test = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_test_with_ESM1b_ts.pkl"))
df_test = df_test.loc[df_test["ESM1b_ts"] != ""]
df_test.reset_index(inplace=True, drop=True)

# Mou et al. data set, filtered the same way.
df_Mou = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Mou_data", "Mou_df.pkl"))
# Fixed: the mask must come from df_Mou itself. The previous code built it
# from df_test, so rows were selected by another frame's values (or the
# boolean mask failed to align with df_Mou's index).
df_Mou = df_Mou.loc[df_Mou["ESM1b_ts"] != ""]
df_Mou.reset_index(inplace=True, drop=True)


  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)


Loading new dataset:

In [3]:
def create_input_and_output_data(df):
    """Build the model input matrix and label vector from a dataframe.

    For every row, the elements of ``df["ECFP"]`` are converted to integers
    and concatenated with that row's ``df["ESM1b_ts"]`` vector (fingerprint
    first, embedding second).

    Args:
        df: DataFrame with the columns "ECFP" (an iterable of values that
            can be cast to int, e.g. a bit string), "ESM1b_ts" (a 1-D
            numeric array per row) and "Binding" (the label).

    Returns:
        A tuple ``(X, y)`` of numpy arrays: ``X`` holds one feature vector
        per row, ``y`` the matching "Binding" values, both in row order.
        For an empty dataframe both arrays are empty.
    """
    # Collect rows in lists and convert once at the end; the previous
    # tuple concatenation copied everything on each iteration (quadratic).
    features = [
        np.concatenate([np.array(list(ecfp)).astype(int), emb])
        for ecfp, emb in zip(df["ECFP"], df["ESM1b_ts"])
    ]
    # Going through list() lets numpy re-infer the dtype from the individual
    # values, exactly as the element-by-element collection did before.
    labels = list(df["Binding"].to_numpy())

    return (np.array(features), np.array(labels))

# Column names handed to xgb.DMatrix: 1024 names "ECFP_<i>" followed by
# 1280 names "ESM1b_ts_<i>", in the same order in which
# create_input_and_output_data concatenates the two parts of each row.
feature_names = [
    f"{prefix}_{position}"
    for prefix, width in (("ECFP", 1024), ("ESM1b_ts", 1280))
    for position in range(width)
]


### Adding Mou et al. data to the training set:

In [4]:

from sklearn.model_selection import train_test_split

# Training split: load it, drop rows whose "ESM1b_ts" entry is the empty
# string and renumber the remaining rows from 0.
df_train = pd.read_pickle(
    join(CURRENT_DIR, "..", "data", "splits", "df_train_with_ESM1b_ts.pkl")
)
has_embedding = df_train["ESM1b_ts"] != ""
df_train = df_train.loc[has_embedding]
df_train.reset_index(drop=True, inplace=True)

# Feature matrices and labels for the regular train and test splits.
train_X, train_y = create_input_and_output_data(df=df_train)
test_X, test_y = create_input_and_output_data(df=df_test)

# Mou et al. data: a pair is labelled as binding when its activity exceeds 2.
df_test_new = df_Mou.copy()
df_test_new["Binding"] = [activity > 2 for activity in df_test_new["activity"]]
test_new_X, test_new_y = create_input_and_output_data(df=df_test_new)

#same split as in Mou et al paper:
X_train_Mou, X_test_Mou, y_train_Mou, y_test_Mou = train_test_split(
    test_new_X,
    test_new_y,
    test_size=0.20,
    random_state=888,
)

# Append the Mou training portion after the regular training data.
train_X = np.concatenate([train_X, X_train_Mou])
train_y = np.concatenate([train_y, y_train_Mou])


  result = libops.scalar_compare(x.ravel(), y, op)


In [6]:
# Hyperparameters for the final model. "num_rounds" and "weight" are read
# out below and removed from the dict before it is passed to xgb.train.
param = {'learning_rate': 0.31553117247348733,
         'max_delta_step': 1.7726044219753656,
         'max_depth': 10,
         'min_child_weight': 1.3845040588450772,
         'num_rounds': 342.68325188584106,
         'reg_alpha': 0.531395259755843,
         'reg_lambda': 3.744980563764689,
         'weight': 0.26187490421514203}



num_round = param["num_rounds"]
#param["tree_method"] = "gpu_hist"
#param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'

# Per-sample weights: rows labelled 0 get param["weight"], all others 1.0.
# weights1 covers the regular training rows, weights2 the Mou training rows.
weights1 = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["Binding"]])
weights2 = np.array([param["weight"] if binding == 0 else 1.0 for binding in y_train_Mou])

# Fixed: this line referenced an undefined `weights3` and ended in a stray
# `2`, so the cell could not run. The order matches train_X / train_y
# (regular training rows first, Mou training rows second).
weights = np.concatenate([weights1, weights2])


del param["num_rounds"]
del param["weight"]

dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
            feature_names= feature_names)
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                    feature_names= feature_names)

# Held-out 20% of the Mou data.
dtest_new = xgb.DMatrix(np.array(X_test_Mou), label = np.array(y_test_Mou), feature_names= feature_names)

# Both sets are evaluated and logged after every boosting round.
evallist = [(dtest_new, 'eval'), (dtrain, 'train')]

# num_round is a float in the dict, so it is truncated to an integer here.
bst = xgb.train(param,  dtrain, int(num_round),evallist, verbose_eval=1)

# Metrics on the regular test split; predictions are rounded to class labels.
y_test_pred = np.round(bst.predict(dtest))
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))

print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))


# Metrics on the held-out Mou split (the message below also says "test set").
y_test_new_pred = np.round(bst.predict(dtest_new))
acc_test_new = np.mean(y_test_new_pred == np.array(y_test_Mou))
roc_auc_new = roc_auc_score(np.array(y_test_Mou), bst.predict(dtest_new))
mcc = matthews_corrcoef(np.array(y_test_Mou), y_test_new_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test_new, roc_auc_new, mcc))


[0]	eval-logloss:0.69533	train-logloss:0.60050
[1]	eval-logloss:0.68843	train-logloss:0.53780
[2]	eval-logloss:0.69631	train-logloss:0.50153
[3]	eval-logloss:0.69656	train-logloss:0.47159
[4]	eval-logloss:0.66151	train-logloss:0.44987
[5]	eval-logloss:0.66181	train-logloss:0.43055
[6]	eval-logloss:0.64271	train-logloss:0.41778
[7]	eval-logloss:0.65235	train-logloss:0.40439
[8]	eval-logloss:0.61257	train-logloss:0.38855
[9]	eval-logloss:0.59625	train-logloss:0.37997
[10]	eval-logloss:0.55848	train-logloss:0.36813
[11]	eval-logloss:0.54859	train-logloss:0.36277
[12]	eval-logloss:0.54421	train-logloss:0.35815
[13]	eval-logloss:0.53491	train-logloss:0.35172
[14]	eval-logloss:0.53478	train-logloss:0.34750
[15]	eval-logloss:0.52499	train-logloss:0.34362
[16]	eval-logloss:0.51831	train-logloss:0.33721
[17]	eval-logloss:0.51682	train-logloss:0.33202
[18]	eval-logloss:0.51769	train-logloss:0.32736
[19]	eval-logloss:0.51469	train-logloss:0.31924
[20]	eval-logloss:0.51767	train-logloss:0.31460
[2

[164]	eval-logloss:0.34977	train-logloss:0.09392
[165]	eval-logloss:0.34954	train-logloss:0.09295
[166]	eval-logloss:0.34388	train-logloss:0.09255
[167]	eval-logloss:0.34417	train-logloss:0.09213
[168]	eval-logloss:0.34417	train-logloss:0.09174
[169]	eval-logloss:0.34357	train-logloss:0.09148
[170]	eval-logloss:0.34277	train-logloss:0.09103
[171]	eval-logloss:0.34068	train-logloss:0.09071
[172]	eval-logloss:0.33902	train-logloss:0.09032
[173]	eval-logloss:0.33729	train-logloss:0.08982
[174]	eval-logloss:0.33502	train-logloss:0.08938
[175]	eval-logloss:0.33413	train-logloss:0.08892
[176]	eval-logloss:0.33417	train-logloss:0.08755
[177]	eval-logloss:0.33417	train-logloss:0.08705
[178]	eval-logloss:0.33422	train-logloss:0.08654
[179]	eval-logloss:0.33422	train-logloss:0.08618
[180]	eval-logloss:0.33254	train-logloss:0.08574
[181]	eval-logloss:0.33254	train-logloss:0.08534
[182]	eval-logloss:0.33486	train-logloss:0.08503
[183]	eval-logloss:0.34113	train-logloss:0.08442
[184]	eval-logloss:0

[332]	eval-logloss:0.30960	train-logloss:0.04404
[333]	eval-logloss:0.30854	train-logloss:0.04392
[334]	eval-logloss:0.30917	train-logloss:0.04368
[335]	eval-logloss:0.30907	train-logloss:0.04360
[336]	eval-logloss:0.31012	train-logloss:0.04344
[337]	eval-logloss:0.31007	train-logloss:0.04330
[338]	eval-logloss:0.31245	train-logloss:0.04311
[339]	eval-logloss:0.31142	train-logloss:0.04290
[340]	eval-logloss:0.30861	train-logloss:0.04274
[341]	eval-logloss:0.30925	train-logloss:0.04262
Accuracy on test set: 0.9049895428742157, ROC-AUC score for test set: 0.9497074664177353
Accuracy on test set: 0.875, ROC-AUC score for test set: 0.9392361111111112, MCC: 0.75


### Without adding Mou et al. data to the training set:

In [None]:

from sklearn.model_selection import train_test_split

# Reload the training split, drop rows whose "ESM1b_ts" entry is the empty
# string and renumber the remaining rows from 0.
df_train = pd.read_pickle(
    join(CURRENT_DIR, "..", "data", "splits", "df_train_with_ESM1b_ts.pkl")
)
has_embedding = df_train["ESM1b_ts"] != ""
df_train = df_train.loc[has_embedding]
df_train.reset_index(drop=True, inplace=True)

# Feature matrices and labels for the regular train and test splits.
train_X, train_y = create_input_and_output_data(df=df_train)
test_X, test_y = create_input_and_output_data(df=df_test)

# Mou et al. data: a pair is labelled as binding when its activity exceeds 2.
df_test_new = df_Mou.copy()
df_test_new["Binding"] = [activity > 2 for activity in df_test_new["activity"]]
test_new_X, test_new_y = create_input_and_output_data(df=df_test_new)

#same split as in Mou et al paper:
# (in this cell the Mou training portion is NOT appended to train_X / train_y)
X_train_Mou, X_test_Mou, y_train_Mou, y_test_Mou = train_test_split(
    test_new_X,
    test_new_y,
    test_size=0.20,
    random_state=888,
)

# Hyperparameters; "num_rounds" and "weight" are taken out of the dict below
# before it is handed to xgb.train.
param = {
    'learning_rate': 0.31553117247348733,
    'max_delta_step': 1.7726044219753656,
    'max_depth': 10,
    'min_child_weight': 1.3845040588450772,
    'num_rounds': 342.68325188584106,
    'reg_alpha': 0.531395259755843,
    'reg_lambda': 3.744980563764689,
    'weight': 0.26187490421514203,
}

num_round = param.pop("num_rounds")
#param["tree_method"] = "gpu_hist"
#param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'

# Per-sample weights: rows labelled 0 get the "weight" value, all others 1.0.
weights = np.where(np.array(train_y) == 0, param.pop("weight"), 1.0)

dtrain = xgb.DMatrix(
    np.array(train_X),
    label=np.array(train_y),
    weight=weights,
    feature_names=feature_names,
)
dtest = xgb.DMatrix(
    np.array(test_X),
    label=np.array(test_y),
    feature_names=feature_names,
)
# Held-out 20% of the Mou data.
dtest_new = xgb.DMatrix(
    np.array(X_test_Mou),
    label=np.array(y_test_Mou),
    feature_names=feature_names,
)

# Both sets are evaluated and logged after every boosting round.
evallist = [(dtest_new, 'eval'), (dtrain, 'train')]

# num_round is stored as a float, so it is truncated to an integer here.
bst = xgb.train(
    param,
    dtrain,
    num_boost_round=int(num_round),
    evals=evallist,
    verbose_eval=1,
)

# Metrics on the regular test split; predictions are rounded to class labels.
y_test_pred = np.round(bst.predict(dtest))
acc_test = (y_test_pred == np.array(test_y)).mean()
roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))

print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))

# Metrics on the held-out Mou split.
y_test_new_pred = np.round(bst.predict(dtest_new))
acc_test_new = (y_test_new_pred == np.array(y_test_Mou)).mean()
roc_auc_new = roc_auc_score(np.array(y_test_Mou), bst.predict(dtest_new))
mcc = matthews_corrcoef(np.array(y_test_Mou), y_test_new_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test_new, roc_auc_new, mcc))


  result = libops.scalar_compare(x.ravel(), y, op)


[0]	eval-logloss:0.72657	train-logloss:0.60033
[1]	eval-logloss:0.72145	train-logloss:0.54160
[2]	eval-logloss:0.70856	train-logloss:0.50341
[3]	eval-logloss:0.72505	train-logloss:0.47235
[4]	eval-logloss:0.74472	train-logloss:0.45163
[5]	eval-logloss:0.74446	train-logloss:0.42985
[6]	eval-logloss:0.74310	train-logloss:0.41524
[7]	eval-logloss:0.78860	train-logloss:0.39985
[8]	eval-logloss:0.78448	train-logloss:0.38306
[9]	eval-logloss:0.81698	train-logloss:0.37547
[10]	eval-logloss:0.80540	train-logloss:0.36375
[11]	eval-logloss:0.80235	train-logloss:0.35612
[12]	eval-logloss:0.79059	train-logloss:0.34820
[13]	eval-logloss:0.78559	train-logloss:0.34451
[14]	eval-logloss:0.79603	train-logloss:0.33858
[15]	eval-logloss:0.82471	train-logloss:0.33341
[16]	eval-logloss:0.89267	train-logloss:0.32915
[17]	eval-logloss:0.91536	train-logloss:0.32404
[18]	eval-logloss:0.90621	train-logloss:0.31899
[19]	eval-logloss:0.90562	train-logloss:0.31530
[20]	eval-logloss:0.89434	train-logloss:0.31040
[2

[164]	eval-logloss:1.46034	train-logloss:0.09464
[165]	eval-logloss:1.46955	train-logloss:0.09414
[166]	eval-logloss:1.48177	train-logloss:0.09364
[167]	eval-logloss:1.48618	train-logloss:0.09301
[168]	eval-logloss:1.48571	train-logloss:0.09244
[169]	eval-logloss:1.49730	train-logloss:0.09177
[170]	eval-logloss:1.49102	train-logloss:0.09139
[171]	eval-logloss:1.47989	train-logloss:0.09072
[172]	eval-logloss:1.48629	train-logloss:0.09010
[173]	eval-logloss:1.49598	train-logloss:0.08942
[174]	eval-logloss:1.50819	train-logloss:0.08903
[175]	eval-logloss:1.49070	train-logloss:0.08856
[176]	eval-logloss:1.49049	train-logloss:0.08816
[177]	eval-logloss:1.49424	train-logloss:0.08751
[178]	eval-logloss:1.52745	train-logloss:0.08679
[179]	eval-logloss:1.52871	train-logloss:0.08637
[180]	eval-logloss:1.52662	train-logloss:0.08547
[181]	eval-logloss:1.53573	train-logloss:0.08508
[182]	eval-logloss:1.53593	train-logloss:0.08477
[183]	eval-logloss:1.55252	train-logloss:0.08441
[184]	eval-logloss:1