# Training gradient boosting model for enzyme-substrate pair prediction with ESM-1b-vectors

### 1. Loading and preprocessing data for model training and evaluation
### 2. Hyperparameter optimization using a 5-fold cross-validation (CV)
### 3. Training and validating the final model

In [4]:
import pandas as pd
import numpy as np
import random
import pickle
import sys
import os
import logging
from os.path import join
from sklearn.model_selection import KFold
#from hyperopt import fmin, tpe, hp, Trials, rand
import xgboost as xgb
from sklearn.metrics import roc_auc_score

sys.path.append('.\\additional_code')
#from data_preprocessing import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\projects\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data for model training and evaluation

### (a) Loading data:

In [5]:
df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_EC1_1_3_15_with_ESM1b_ts_GNN_V2.pkl"))
df_train = df_train.loc[df_train["ESM1b_ts"] != ""]
df_train = df_train.loc[df_train["GNN rep"] != ""]

df_train.reset_index(inplace = True, drop = True)

df_test  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_test_with_EC1_1_3_15_with_ESM1b_ts_GNN_V2.pkl"))
df_test = df_test.loc[df_test["ESM1b_ts"] != ""]
df_test = df_test.loc[df_test["GNN rep"] != ""]
df_test.reset_index(inplace = True, drop = True)

  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)


Loading new dataset:

In [6]:
df_test_new = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_EC1_1_3_15_with_enzyme_reps_GNN_V2.pkl"))

Randomly sample 10 enzymes that will be part of the training set:

In [8]:
def create_input_and_output_data(df):
    X = ();
    y = ();
    
    for ind in df.index:
        emb = df["ESM1b_ts"][ind]
        ecfp = np.array(list(df["ECFP"][ind])).astype(int)
                
        X = X +(np.concatenate([ecfp, emb]), );
        y = y + (df["Binding"][ind], );

    return(np.array(X),np.array(y))

feature_names =  ["ECFP_" + str(i) for i in range(1024)]
feature_names = feature_names + ["ESM1b_ts_" + str(i) for i in range(1280)]

#import matplotlib.pyplot as plt

In [None]:
new_enzymes =  list(set(df_test_new["Uniprot ID"]))
random.seed(2)
random.shuffle(new_enzymes)
enzyme_folds = []

#enzyme_folds = [new_enzymes[0:5], new_enzymes[5:10], new_enzymes[10:15], new_enzymes[15:]]
enzyme_folds =[["A0A011QK89", "A0A087D1R1", "R0EVG9",'B1HZY7', 'C2K1F0',"A4YVE0"],
                ["A0A0U6K8E5", "C9Y9E7","A4YVE0","E6SCX5","B7N6P4", 'B8MKR3'],
                ['Q5WIP4', 'A0A087RXW1', "A9QH69", 'B7RR92' , "E6SCX5"],
                ["A0A077SBA9", "D4MUV9", "C4VMW0", "S2DJ52", "D4N087"]]

for i in range(4):
    test_enzymes = enzyme_folds[i]

    df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data",
                                   "splits", "df_train_with_EC1_1_3_15_with_ESM1b_ts_GNN_V2.pkl"))
    df_train = df_train.loc[df_train["ESM1b_ts"] != ""]
    df_train.reset_index(inplace = True, drop = True)


    df_train = df_train.loc[~df_train["Uniprot ID"].isin(test_enzymes)]
    
    
    train_X, train_y =  create_input_and_output_data(df = df_train)
    test_X, test_y =  create_input_and_output_data(df = df_test)
    test_new_X, test_new_y =  create_input_and_output_data(df = df_test_new)
    
    
    param = {'learning_rate': 0.21593894516312712,
             'max_delta_step': 2.3197168781549893,
             'max_depth': 12,
              'min_child_weight': 0.6315847279633628,
               'num_rounds': 396.91340133372364, 
               'reg_alpha': 0.30340726745680807, 
               'reg_lambda': 1.1575318518353965, 
               'weight': 0.11364941242322603}
    
    param =  {'learning_rate': 0.2450500663744065,
         'max_delta_step': 2.382647656857187,
         'max_depth': 11,
         'min_child_weight': 1.222014993565574, 
         'num_rounds': 379.3863424395678,
         'reg_alpha': 1.7242896864948025,
         'reg_lambda': 2.845463948389928,
         'weight': 0.10896532373464474}

    num_round = param["num_rounds"]
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["Binding"]])
    #weights = [weights[i]*5 if dtype =="engqvist" else weights[i] for i, dtype in enumerate(df_train["type"])]

    del param["num_rounds"]
    del param["weight"]
    
    dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
    dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                        feature_names= feature_names)

    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    y_test_pred = np.round(bst.predict(dtest))
    acc_test = np.mean(y_test_pred == np.array(test_y))
    roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))

    print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))
    
    dtest_new = xgb.DMatrix(np.array(test_new_X), label = np.array(test_new_y),
                    feature_names= feature_names)

    y_test_new_pred = np.round(bst.predict(dtest_new))
    acc_test_new = np.mean(y_test_new_pred == np.array(test_new_y))
    roc_auc_new = roc_auc_score(np.array(test_new_y), bst.predict(dtest_new))

    print("All enzymes:")
    print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))
    
    df_test_new_test = df_test_new.loc[df_test_new["Uniprot ID"].isin(test_enzymes)]
    test_new_X2, test_new_y2 =  create_input_and_output_data(df = df_test_new_test)
    test_new_X2  = np.array(test_new_X2)
    test_new_y2  = np.array(test_new_y2)

    dtest_new = xgb.DMatrix(np.array(test_new_X2), label = np.array(test_new_y2),
                        feature_names= feature_names)
    y_test_new_pred = np.round(bst.predict(dtest_new))
    acc_test_new = np.mean(y_test_new_pred == np.array(test_new_y2))
    roc_auc_new = roc_auc_score(np.array(test_new_y2), bst.predict(dtest_new))

    print("Enzymes not in training set:")
    print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))
    for i, enz in enumerate(test_enzymes):
        acc = np.mean(y_test_new_pred[i*6: (i+1)*6] == np.array(test_new_y2)[i*6: (i+1)*6])
        print(enz, acc)
    #plt.hist(bst.predict(dtest_new), bins = 20, rwidth=0.9)
    #plt.show()

  result = libops.scalar_compare(x.ravel(), y, op)


In [5]:
df_train = df_train.loc[~df_train["Uniprot ID"].isin(test_enzymes)]

In [6]:
def create_input_and_output_data(df):
    X = ();
    y = ();
    
    for ind in df.index:
        emb = df["ESM1b_ts"][ind]
        ecfp = np.array(list(df["GNN rep"][ind])).astype(int)
                
        X = X +(np.concatenate([ecfp, emb]), );
        y = y + (df["Binding"][ind], );

    return(X,y)

train_X, train_y =  create_input_and_output_data(df = df_train)
test_X, test_y =  create_input_and_output_data(df = df_test)
test_new_X, test_new_y =  create_input_and_output_data(df = df_test_new)


feature_names =  ["GNN rep_" + str(i) for i in range(50)]
feature_names = feature_names + ["ESM1b_ts_" + str(i) for i in range(1280)]

train_X = np.array(train_X)
test_X  = np.array(test_X)
test_new_X  = np.array(test_new_X)

train_y = np.array(train_y)
test_y  = np.array(test_y)
test_new_y  = np.array(test_new_y)

#### (iv) 3. Training and validating the final model
Training the model and validating it on the test set:

In [20]:
weight_new_data_points = 1

param = {'learning_rate': 0.21593894516312712,
         'max_delta_step': 2.3197168781549893,
         'max_depth': 12,
          'min_child_weight': 0.6315847279633628,
           'num_rounds': 396.91340133372364, 
           'reg_alpha': 0.30340726745680807, 
           'reg_lambda': 1.1575318518353965, 
           'weight': 0.11364941242322603}

num_round = param["num_rounds"]
param["tree_method"] = "gpu_hist"
param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'
weights = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["Binding"]])
weights = [weights[i]*weight_new_data_points if dtype =="engqvist" else weights[i] for i, dtype in enumerate(df_train["type"])]

del param["num_rounds"]
del param["weight"]

dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                    feature_names= feature_names)

bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
y_test_pred = np.round(bst.predict(dtest))
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))

print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))

#np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_pred_xgboost_ESM1b_ts_ECFP.npy"), bst.predict(dtest))
#np.save(join(CURRENT_DIR, ".." ,"data", "training_results", "y_test_true_xgboost_ESM1b_ts_ECFP.npy"), test_y)

Accuracy on test set: 0.9187125748502994, ROC-AUC score for test set: 0.955980597458338


Validation of new test set:

In [21]:
dtest_new = xgb.DMatrix(np.array(test_new_X), label = np.array(test_new_y),
                    feature_names= feature_names)

y_test_new_pred = np.round(bst.predict(dtest_new))
acc_test_new = np.mean(y_test_new_pred == np.array(test_new_y))
roc_auc_new = roc_auc_score(np.array(test_new_y), bst.predict(dtest_new))

print("All enzymes:")
print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))

All enzymes:
Accuracy on test set: 0.5, ROC-AUC score for test set: 0.3832532253984316


In [22]:
df_test_new_test = df_test_new.loc[df_test_new["Uniprot ID"].isin(test_enzymes)]
test_new_X2, test_new_y2 =  create_input_and_output_data(df = df_test_new_test)
test_new_X2  = np.array(test_new_X2)
test_new_y2  = np.array(test_new_y2)

dtest_new = xgb.DMatrix(np.array(test_new_X2), label = np.array(test_new_y2),
                    feature_names= feature_names)
y_test_new_pred = np.round(bst.predict(dtest_new))
acc_test_new = np.mean(y_test_new_pred == np.array(test_new_y2))
roc_auc_new = roc_auc_score(np.array(test_new_y2), bst.predict(dtest_new))

print("Enzymes not in training set:")
print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))

Enzymes not in training set:
Accuracy on test set: 0.5303030303030303, ROC-AUC score for test set: 0.40645773979107314


In [12]:

for weight_new in range(1,15):
    weight_new_data_points = weight_new
    print("Weight for new data points: %s" % weight_new_data_points)
    param = {'learning_rate': 0.21593894516312712,
             'max_delta_step': 2.3197168781549893,
             'max_depth': 12,
              'min_child_weight': 0.6315847279633628,
               'num_rounds': 396.91340133372364, 
               'reg_alpha': 0.30340726745680807, 
               'reg_lambda': 1.1575318518353965, 
               'weight': 0.11364941242322603}

    num_round = param["num_rounds"]
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    param['objective'] = 'binary:logistic'
    weights = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["Binding"]])
    weights = [weights[i]*weight_new_data_points if dtype =="engqvist" else weights[i] for i, dtype in enumerate(df_train["type"])]

    del param["num_rounds"]
    del param["weight"]

    dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                    feature_names= feature_names)
    dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                        feature_names= feature_names)

    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    y_test_pred = np.round(bst.predict(dtest))
    acc_test = np.mean(y_test_pred == np.array(test_y))
    roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))

    print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))



    dtest_new = xgb.DMatrix(np.array(test_new_X), label = np.array(test_new_y),
                        feature_names= feature_names)

    y_test_new_pred = np.round(bst.predict(dtest_new))
    acc_test_new = np.mean(y_test_new_pred == np.array(test_new_y))
    roc_auc_new = roc_auc_score(np.array(test_new_y), bst.predict(dtest_new))
    print("All enzymes:")
    print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))


    df_test_new_test = df_test_new.loc[df_test_new["Uniprot ID"].isin(test_enzymes)]
    test_new_X, test_new_y =  create_input_and_output_data(df = df_test_new_test)
    test_new_X  = np.array(test_new_X)
    test_new_y  = np.array(test_new_y)
    dtest_new = xgb.DMatrix(np.array(test_new_X), label = np.array(test_new_y),
                        feature_names= feature_names)
    y_test_new_pred = np.round(bst.predict(dtest_new))
    acc_test_new = np.mean(y_test_new_pred == np.array(test_new_y))
    roc_auc = roc_auc_score(np.array(test_new_y), bst.predict(dtest_new))

    print("Enzymes not in training set:")
    print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test_new, roc_auc_new))

Weight for new data points: 1
Accuracy on test set: 0.918562874251497, ROC-AUC score for test set: 0.955903955078284
All enzymes:
Accuracy on test set: 0.5454545454545454, ROC-AUC score for test set: 0.5351851851851852
Enzymes not in training set:
Accuracy on test set: 0.5454545454545454, ROC-AUC score for test set: 0.5351851851851852
Weight for new data points: 2
Accuracy on test set: 0.9187874251497006, ROC-AUC score for test set: 0.9565065878578927
All enzymes:
Accuracy on test set: 0.5606060606060606, ROC-AUC score for test set: 0.42500000000000004
Enzymes not in training set:
Accuracy on test set: 0.5606060606060606, ROC-AUC score for test set: 0.42500000000000004
Weight for new data points: 3
Accuracy on test set: 0.9193113772455089, ROC-AUC score for test set: 0.9567346270417203
All enzymes:
Accuracy on test set: 0.5606060606060606, ROC-AUC score for test set: 0.5037037037037038
Enzymes not in training set:
Accuracy on test set: 0.5606060606060606, ROC-AUC score for test set: 0.

KeyboardInterrupt: 