# Training a gradient boosting model for enzyme-substrate pair prediction with ESM-1b vectors

### 1. Loading and preprocessing data for model training and evaluation
### 2. Hyperparameter optimization using 5-fold cross-validation (CV)
### 3. Training and validating the final model

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import sys
import os
import logging
from os.path import join
from sklearn.model_selection import KFold
#from hyperopt import fmin, tpe, hp, Trials, rand
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Make the helper modules in ./additional_code importable. The path is built
# with os.path.join so the separator is correct on every platform (the former
# literal '.\\additional_code' only resolved on Windows).
sys.path.append(join(".", "additional_code"))
#from data_preprocessing import *

# All data paths in this notebook are resolved relative to the working directory.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\projects\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data for model training and evaluation

### (a) Loading data:

In [2]:
def _has_embedding(value):
    """Return False only when `value` is the empty string, True otherwise."""
    return not (isinstance(value, str) and value == "")


def _load_pairs(*path_parts):
    """Load a pickled DataFrame located under CURRENT_DIR/.. and filter it.

    Rows whose "ESM1b_ts" entry is the empty string are dropped (presumably
    the marker for a missing embedding - confirm against the preprocessing
    code) and the index is renumbered from 0.

    The entries are tested one by one with `isinstance` instead of
    `frame["ESM1b_ts"] != ""`: that vectorised comparison compares non-string
    entries with a str, which triggers NumPy's "elementwise comparison
    failed" warning and may stop working in newer NumPy releases.
    """
    # NOTE: pd.read_pickle can execute arbitrary code; only load trusted project files.
    frame = pd.read_pickle(join(CURRENT_DIR, "..", *path_parts))
    keep = frame["ESM1b_ts"].map(_has_embedding).astype(bool)
    return frame.loc[keep].reset_index(drop=True)


df_test = _load_pairs("data", "splits", "df_test_with_ESM1b_ts.pkl")
df_Mou = _load_pairs("data", "Min_data", "Min_df.pkl")
df_Berry = _load_pairs("data", "Min_data", "Min_validation_Berry_df.pkl")
df_Oat = _load_pairs("data", "Min_data", "Min_validation_Oat_df.pkl")

  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)


In [3]:
# Mean of the "Binding" column for the Mou, Berry and Oat data sets (in that order).
tuple(np.mean(frame["Binding"]) for frame in (df_Mou, df_Berry, df_Oat))

(0.17983842641376888, 0.3131578947368421, 0.30451127819548873)

Loading new dataset:

In [4]:
def create_input_and_output_data(df):
    """Build the feature matrix and label vector from a DataFrame of pairs.

    For every row, the elements of the "ECFP" entry are converted to integers
    and concatenated with the row's "ESM1b_ts" entry (fingerprint first,
    embedding second). The "Binding" entry of the row becomes its label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "ECFP", "ESM1b_ts" and "Binding".

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` with one row of ``X`` and one entry of ``y`` per row of
        ``df``, in row order.
    """
    features = []
    labels = []

    # Walk the column values positionally. Unlike label look-ups through
    # df.index this also works when the index is not unique, and appending to
    # lists avoids re-copying a growing tuple for every row.
    rows = zip(df["ECFP"].to_numpy(),
               df["ESM1b_ts"].to_numpy(),
               df["Binding"].to_numpy())
    for ecfp, emb, binding in rows:
        bits = np.array(list(ecfp)).astype(int)
        features.append(np.concatenate([bits, emb]))
        labels.append(binding)

    return (np.array(features), np.array(labels))

# Column labels for the feature matrix: 1024 "ECFP_<i>" names followed by
# 1280 "ESM1b_ts_<i>" names, i.e. the same order in which
# create_input_and_output_data concatenates fingerprint and embedding.
feature_names = ["ECFP_{}".format(bit) for bit in range(1024)]
feature_names.extend("ESM1b_ts_{}".format(dim) for dim in range(1280))

#import matplotlib.pyplot as plt

### Adding Yang et al. data to the training set:

In [5]:

# ---- Data -------------------------------------------------------------------
df_train = pd.read_pickle(join(CURRENT_DIR, "..", "data",
                               "splits", "df_train_with_ESM1b_ts.pkl"))
# Drop rows whose "ESM1b_ts" entry is the empty string. Entries are tested one
# by one because `df_train["ESM1b_ts"] != ""` compares non-string entries with
# a str, which triggers NumPy's "elementwise comparison failed" warning and may
# stop working in newer NumPy releases.
has_embedding = df_train["ESM1b_ts"].map(
    lambda emb: not (isinstance(emb, str) and emb == "")).astype(bool)
df_train = df_train.loc[has_embedding].reset_index(drop=True)

train_X, train_y = create_input_and_output_data(df=df_train)
test_X, test_y = create_input_and_output_data(df=df_test)

df_test_new = df_Mou.copy()
test_new_X, test_new_y = create_input_and_output_data(df=df_test_new)

# NOTE(review): the df_Mou pairs are appended to the training data here and are
# scored again below under "All enzymes:", so that score is computed on
# samples the model was trained on, not on held-out data.
X_train_Mou = test_new_X
y_train_Mou = test_new_y

train_X = np.concatenate([train_X, X_train_Mou])
train_y = np.concatenate([train_y, y_train_Mou])

Berry_X, Berry_y = create_input_and_output_data(df=df_Berry)
Oat_X, Oat_y = create_input_and_output_data(df=df_Oat)

# ---- Model configuration ----------------------------------------------------
param = {'learning_rate': 0.31553117247348733,
         'max_delta_step': 1.7726044219753656,
         'max_depth': 10,
         'min_child_weight': 1.3845040588450772,
         'num_rounds': 342.68325188584106,
         'reg_alpha': 0.531395259755843,
         'reg_lambda': 3.744980563764689,
         'weight': 0.26187490421514203}

# "num_rounds" and "weight" are consumed here and removed from `param`, so only
# the remaining entries are handed to xgb.train as booster parameters.
num_round = param.pop("num_rounds")
negative_weight = param.pop("weight")
# NOTE(review): XGBoost >= 2.0 deprecates "gpu_hist" in favour of
# tree_method="hist" together with device="cuda" - confirm against the
# installed version before changing.
param["tree_method"] = "gpu_hist"
param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'

# Samples with label 0 get the tuned weight, all other samples weight 1.0.
weights = np.where(train_y == 0, negative_weight, 1.0)

dtrain = xgb.DMatrix(train_X, weight=weights, label=train_y,
                     feature_names=feature_names)
dtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
dtest_new = xgb.DMatrix(test_new_X, label=test_new_y, feature_names=feature_names)
dBerry = xgb.DMatrix(Berry_X, label=Berry_y, feature_names=feature_names)
dOat = xgb.DMatrix(Oat_X, label=Oat_y, feature_names=feature_names)

# ---- Training ---------------------------------------------------------------
# The Oat set is only monitored during training; no early stopping is requested.
evallist = [(dOat, 'eval'), (dtrain, 'train')]

bst = xgb.train(param, dtrain, int(num_round), evallist, verbose_eval=1)

# ---- Evaluation -------------------------------------------------------------
# Each data set is predicted once; the raw scores feed the ROC-AUC and their
# rounded values feed the accuracy.
test_scores = bst.predict(dtest)
y_test_pred = np.round(test_scores)
acc_test = np.mean(y_test_pred == test_y)
roc_auc = roc_auc_score(test_y, test_scores)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))

test_new_scores = bst.predict(dtest_new)
y_test_new_pred = np.round(test_new_scores)
acc_test_new = np.mean(y_test_new_pred == test_new_y)
roc_auc_new = roc_auc_score(test_new_y, test_new_scores)

print("All enzymes:")
print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))

Berry_scores = bst.predict(dBerry)
y_test_Berry_pred = np.round(Berry_scores)
acc_test_Berry = np.mean(y_test_Berry_pred == Berry_y)
roc_auc_Berry = roc_auc_score(Berry_y, Berry_scores)
print("Accuracy on Berry validation set: %s, ROC-AUC score for test set: %s"  % (acc_test_Berry, roc_auc_Berry))

Oat_scores = bst.predict(dOat)
y_test_Oat_pred = np.round(Oat_scores)
acc_test_Oat = np.mean(y_test_Oat_pred == Oat_y)
roc_auc_Oat = roc_auc_score(Oat_y, Oat_scores)
print("Accuracy on Oat validation set: %s, ROC-AUC score for test set: %s"  % (acc_test_Oat, roc_auc_Oat))

  result = libops.scalar_compare(x.ravel(), y, op)


[0]	eval-logloss:0.64568	train-logloss:0.60286
[1]	eval-logloss:0.61630	train-logloss:0.54505
[2]	eval-logloss:0.60495	train-logloss:0.50130
[3]	eval-logloss:0.60057	train-logloss:0.47120
[4]	eval-logloss:0.59920	train-logloss:0.44867
[5]	eval-logloss:0.59698	train-logloss:0.43298
[6]	eval-logloss:0.58226	train-logloss:0.42292
[7]	eval-logloss:0.58323	train-logloss:0.41463
[8]	eval-logloss:0.59773	train-logloss:0.40060
[9]	eval-logloss:0.60508	train-logloss:0.39339
[10]	eval-logloss:0.60520	train-logloss:0.38366
[11]	eval-logloss:0.60211	train-logloss:0.37483
[12]	eval-logloss:0.59695	train-logloss:0.37063
[13]	eval-logloss:0.60000	train-logloss:0.36133
[14]	eval-logloss:0.59444	train-logloss:0.35452
[15]	eval-logloss:0.58879	train-logloss:0.35034
[16]	eval-logloss:0.59021	train-logloss:0.34154
[17]	eval-logloss:0.59142	train-logloss:0.33598
[18]	eval-logloss:0.59351	train-logloss:0.32917
[19]	eval-logloss:0.59167	train-logloss:0.32501
[20]	eval-logloss:0.59093	train-logloss:0.31764
[2

[164]	eval-logloss:0.65879	train-logloss:0.09434
[165]	eval-logloss:0.66387	train-logloss:0.09380
[166]	eval-logloss:0.66316	train-logloss:0.09317
[167]	eval-logloss:0.66649	train-logloss:0.09269
[168]	eval-logloss:0.66834	train-logloss:0.09199
[169]	eval-logloss:0.66765	train-logloss:0.09142
[170]	eval-logloss:0.66833	train-logloss:0.09095
[171]	eval-logloss:0.66954	train-logloss:0.09043
[172]	eval-logloss:0.66822	train-logloss:0.08982
[173]	eval-logloss:0.67053	train-logloss:0.08948
[174]	eval-logloss:0.67389	train-logloss:0.08920
[175]	eval-logloss:0.67388	train-logloss:0.08878
[176]	eval-logloss:0.67424	train-logloss:0.08840
[177]	eval-logloss:0.67439	train-logloss:0.08794
[178]	eval-logloss:0.67385	train-logloss:0.08745
[179]	eval-logloss:0.67389	train-logloss:0.08703
[180]	eval-logloss:0.67362	train-logloss:0.08661
[181]	eval-logloss:0.67334	train-logloss:0.08626
[182]	eval-logloss:0.67316	train-logloss:0.08578
[183]	eval-logloss:0.67305	train-logloss:0.08544
[184]	eval-logloss:0

[332]	eval-logloss:0.74462	train-logloss:0.04549
[333]	eval-logloss:0.74543	train-logloss:0.04535
[334]	eval-logloss:0.74745	train-logloss:0.04519
[335]	eval-logloss:0.74772	train-logloss:0.04503
[336]	eval-logloss:0.74894	train-logloss:0.04495
[337]	eval-logloss:0.74886	train-logloss:0.04475
[338]	eval-logloss:0.74919	train-logloss:0.04467
[339]	eval-logloss:0.74972	train-logloss:0.04454
[340]	eval-logloss:0.75028	train-logloss:0.04433
[341]	eval-logloss:0.75118	train-logloss:0.04423
Accuracy on test set: 0.9030475052285629, ROC-AUC score for test set: 0.9472670686419001
All enzymes:
Accuracy on test set: 0.9957850368809273, ROC-AUC score for test set: 1.0
Accuracy on Berry validation set: 0.781578947368421, ROC-AUC score for test set: 0.8380501625937732
Accuracy on Oat validation set: 0.7819548872180451, ROC-AUC score for test set: 0.8029362696029363


In [6]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient of the rounded predictions, reported as
# (Oat, Berry).
tuple(
    matthews_corrcoef(labels, np.round(bst.predict(dmat)))
    for labels, dmat in ((Oat_y, dOat), (Berry_y, dBerry))
)

(0.5169972652560235, 0.4844586955185631)

### Training without adding the Yang et al. data to the training set:

In [7]:

# ---- Data -------------------------------------------------------------------
df_train = pd.read_pickle(join(CURRENT_DIR, "..", "data",
                               "splits", "df_train_with_ESM1b_ts.pkl"))
# Drop rows whose "ESM1b_ts" entry is the empty string. Entries are tested one
# by one because `df_train["ESM1b_ts"] != ""` compares non-string entries with
# a str, which triggers NumPy's "elementwise comparison failed" warning and may
# stop working in newer NumPy releases.
has_embedding = df_train["ESM1b_ts"].map(
    lambda emb: not (isinstance(emb, str) and emb == "")).astype(bool)
df_train = df_train.loc[has_embedding].reset_index(drop=True)

train_X, train_y = create_input_and_output_data(df=df_train)
test_X, test_y = create_input_and_output_data(df=df_test)

df_test_new = df_Mou.copy()
test_new_X, test_new_y = create_input_and_output_data(df=df_test_new)

# In this run the df_Mou pairs are NOT appended to the training data; the two
# aliases below are assigned but not used further in this cell.
X_train_Mou = test_new_X
y_train_Mou = test_new_y

Berry_X, Berry_y = create_input_and_output_data(df=df_Berry)
Oat_X, Oat_y = create_input_and_output_data(df=df_Oat)

# ---- Model configuration ----------------------------------------------------
param = {'learning_rate': 0.31553117247348733,
         'max_delta_step': 1.7726044219753656,
         'max_depth': 10,
         'min_child_weight': 1.3845040588450772,
         'num_rounds': 342.68325188584106,
         'reg_alpha': 0.531395259755843,
         'reg_lambda': 3.744980563764689,
         'weight': 0.26187490421514203}

# "num_rounds" and "weight" are consumed here and removed from `param`, so only
# the remaining entries are handed to xgb.train as booster parameters.
num_round = param.pop("num_rounds")
negative_weight = param.pop("weight")
# NOTE(review): XGBoost >= 2.0 deprecates "gpu_hist" in favour of
# tree_method="hist" together with device="cuda" - confirm against the
# installed version before changing.
param["tree_method"] = "gpu_hist"
param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'

# Samples with label 0 get the tuned weight, all other samples weight 1.0.
weights = np.where(train_y == 0, negative_weight, 1.0)

dtrain = xgb.DMatrix(train_X, weight=weights, label=train_y,
                     feature_names=feature_names)
dtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
dtest_new = xgb.DMatrix(test_new_X, label=test_new_y, feature_names=feature_names)
dBerry = xgb.DMatrix(Berry_X, label=Berry_y, feature_names=feature_names)
dOat = xgb.DMatrix(Oat_X, label=Oat_y, feature_names=feature_names)

# ---- Training ---------------------------------------------------------------
# The Oat set is only monitored during training; no early stopping is requested.
evallist = [(dOat, 'eval'), (dtrain, 'train')]

bst = xgb.train(param, dtrain, int(num_round), evallist, verbose_eval=1)

# ---- Evaluation -------------------------------------------------------------
# Each data set is predicted once; the raw scores feed the ROC-AUC and their
# rounded values feed the accuracy.
test_scores = bst.predict(dtest)
y_test_pred = np.round(test_scores)
acc_test = np.mean(y_test_pred == test_y)
roc_auc = roc_auc_score(test_y, test_scores)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))

test_new_scores = bst.predict(dtest_new)
y_test_new_pred = np.round(test_new_scores)
acc_test_new = np.mean(y_test_new_pred == test_new_y)
roc_auc_new = roc_auc_score(test_new_y, test_new_scores)

print("All enzymes:")
print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))

Berry_scores = bst.predict(dBerry)
y_test_Berry_pred = np.round(Berry_scores)
acc_test_Berry = np.mean(y_test_Berry_pred == Berry_y)
roc_auc_Berry = roc_auc_score(Berry_y, Berry_scores)
print("Accuracy on Berry validation set: %s, ROC-AUC score for test set: %s"  % (acc_test_Berry, roc_auc_Berry))

Oat_scores = bst.predict(dOat)
y_test_Oat_pred = np.round(Oat_scores)
acc_test_Oat = np.mean(y_test_Oat_pred == Oat_y)
roc_auc_Oat = roc_auc_score(Oat_y, Oat_scores)
print("Accuracy on Oat validation set: %s, ROC-AUC score for test set: %s"  % (acc_test_Oat, roc_auc_Oat))

  result = libops.scalar_compare(x.ravel(), y, op)


[0]	eval-logloss:0.71850	train-logloss:0.60039
[1]	eval-logloss:0.72579	train-logloss:0.53749
[2]	eval-logloss:0.74438	train-logloss:0.50110
[3]	eval-logloss:0.73915	train-logloss:0.46993
[4]	eval-logloss:0.74773	train-logloss:0.44709
[5]	eval-logloss:0.73408	train-logloss:0.42411
[6]	eval-logloss:0.72672	train-logloss:0.41035
[7]	eval-logloss:0.72813	train-logloss:0.39732
[8]	eval-logloss:0.72528	train-logloss:0.38801
[9]	eval-logloss:0.72715	train-logloss:0.37413
[10]	eval-logloss:0.74048	train-logloss:0.36245
[11]	eval-logloss:0.73173	train-logloss:0.35791
[12]	eval-logloss:0.72918	train-logloss:0.34602
[13]	eval-logloss:0.73268	train-logloss:0.33752
[14]	eval-logloss:0.71653	train-logloss:0.33311
[15]	eval-logloss:0.70905	train-logloss:0.32405
[16]	eval-logloss:0.70941	train-logloss:0.32001
[17]	eval-logloss:0.71234	train-logloss:0.31638
[18]	eval-logloss:0.71050	train-logloss:0.31020
[19]	eval-logloss:0.69440	train-logloss:0.30532
[20]	eval-logloss:0.69363	train-logloss:0.30052
[2

[164]	eval-logloss:0.69821	train-logloss:0.08786
[165]	eval-logloss:0.69782	train-logloss:0.08750
[166]	eval-logloss:0.69797	train-logloss:0.08707
[167]	eval-logloss:0.70249	train-logloss:0.08658
[168]	eval-logloss:0.70677	train-logloss:0.08617
[169]	eval-logloss:0.70782	train-logloss:0.08555
[170]	eval-logloss:0.71241	train-logloss:0.08453
[171]	eval-logloss:0.71293	train-logloss:0.08410
[172]	eval-logloss:0.71016	train-logloss:0.08380
[173]	eval-logloss:0.70948	train-logloss:0.08350
[174]	eval-logloss:0.71331	train-logloss:0.08326
[175]	eval-logloss:0.71472	train-logloss:0.08274
[176]	eval-logloss:0.71597	train-logloss:0.08218
[177]	eval-logloss:0.71648	train-logloss:0.08165
[178]	eval-logloss:0.71549	train-logloss:0.08118
[179]	eval-logloss:0.71477	train-logloss:0.08075
[180]	eval-logloss:0.71448	train-logloss:0.08044
[181]	eval-logloss:0.70504	train-logloss:0.07954
[182]	eval-logloss:0.70544	train-logloss:0.07922
[183]	eval-logloss:0.70581	train-logloss:0.07888
[184]	eval-logloss:0

[332]	eval-logloss:0.77896	train-logloss:0.04277
[333]	eval-logloss:0.78326	train-logloss:0.04266
[334]	eval-logloss:0.78384	train-logloss:0.04255
[335]	eval-logloss:0.78561	train-logloss:0.04234
[336]	eval-logloss:0.78621	train-logloss:0.04221
[337]	eval-logloss:0.78672	train-logloss:0.04208
[338]	eval-logloss:0.79092	train-logloss:0.04196
[339]	eval-logloss:0.79200	train-logloss:0.04179
[340]	eval-logloss:0.79240	train-logloss:0.04167
[341]	eval-logloss:0.79287	train-logloss:0.04150
Accuracy on test set: 0.9060352554526442, ROC-AUC score for test set: 0.9502409054481795
All enzymes:
Accuracy on test set: 0.7024938531787847, ROC-AUC score for test set: 0.4906107802462527
Accuracy on Berry validation set: 0.6578947368421053, ROC-AUC score for test set: 0.5614475675327602
Accuracy on Oat validation set: 0.6804511278195489, ROC-AUC score for test set: 0.587721054387721


In [8]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient of the rounded predictions, reported as
# (Oat, Berry).
tuple(
    matthews_corrcoef(labels, np.round(bst.predict(dmat)))
    for labels, dmat in ((Oat_y, dOat), (Berry_y, dBerry))
)

(0.12029878923247857, 0.0060553002732513465)