In [1]:
import numpy as np
import pickle
import pandas as pd
import os
from os.path import join
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import r2_score
from scipy import stats
import xgboost as xgb

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib as mpl
# Activate the project plotting style first, so the colour cycle read back
# below is the one defined by that style sheet.
plt.style.use('CCB_plot_style_0v4.mplstyle')
high_contrast = ['#004488', '#DDAA33', '#BB5566', '#000000']
c_styles = mpl.rcParams['axes.prop_cycle'].by_key()['color']


Bad key text.latex.preview in file CCB_plot_style_0v4.mplstyle, line 55 ('text.latex.preview  : False')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.2/matplotlibrc.template
or from the matplotlib source distribution

Bad key mathtext.fallback_to_cm in file CCB_plot_style_0v4.mplstyle, line 63 ('mathtext.fallback_to_cm : True ## When True, use symbols from the Computer Modern fonts')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.2/matplotlibrc.template
or from the matplotlib source distribution


## Loading training and test data:

In [2]:
# Read the pickled train/test DataFrames from the shared splits directory.
splits_dir = join("..", "..", "data", "kcat_data", "splits")
data_train = pd.read_pickle(join(splits_dir, "train_df_kcat.pkl"))
data_test = pd.read_pickle(join(splits_dir, "test_df_kcat.pkl"))

# The target column is stored as "geomean_kcat"; every later cell reads it
# under the name "log10_kcat".
for frame in (data_train, data_test):
    frame.rename(columns={"geomean_kcat": "log10_kcat"}, inplace=True)
len(data_train), len(data_test)

(3421, 850)

In [3]:
# Per-fold index arrays; later cells use entry i of each list to slice the
# training matrix into the fit / validation parts of CV fold i.
# NOTE: allow_pickle=True lets np.load unpickle objects, so only load trusted files.
cv_dir = join("..", "..", "data", "kcat_data", "splits")
train_indices = list(np.load(join(cv_dir, "CV_train_indices.npy"), allow_pickle=True))
test_indices = list(np.load(join(cv_dir, "CV_test_indices.npy"), allow_pickle=True))

## 1. Training a model with only sequence information (ESM-1b):

#### (a) Creating input matrices:

In [4]:
# Stack the "ESM1b" column into a feature matrix and use "log10_kcat" as the target
# (assumes every "ESM1b" entry is an equal-length vector -- TODO confirm).
train_ESM1b = np.array(data_train["ESM1b"].tolist())
train_Y = np.array(data_train["log10_kcat"].tolist())
train_X = train_ESM1b

test_ESM1b = np.array(data_test["ESM1b"].tolist())
test_Y = np.array(data_test["log10_kcat"].tolist())
test_X = test_ESM1b

#### (b) Hyperparameter optimization:

In [5]:
# Disabled hyperparameter search, kept for reference: the whole cell is a single
# triple-quoted string, so none of it runs (the trailing ";" suppresses the
# notebook echo of the string). The text describes a random search
# (hyperopt `rand.suggest`, 200 evaluations) configured for GPU training
# ("gpu_hist") whose objective is the negative mean R2 over the 5 CV folds.
# NOTE(review): the space below samples num_rounds from 20-200, while the
# parameter cell that follows hard-codes num_rounds ~ 297; presumably the space
# was changed after this snapshot was taken -- confirm.
'''def cross_validation_mse_gradient_boosting(param):
    num_round = param["num_rounds"]
    del param["num_rounds"]
    param["max_depth"] = int(np.round(param["max_depth"]))
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    
    MSE = []
    R2 = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(train_X[train_index], label = train_Y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
        y_valid_pred = bst.predict(dvalid)
        MSE.append(np.mean(abs(np.reshape(train_Y[test_index], (-1)) - y_valid_pred)**2))
        R2.append(r2_score(np.reshape(train_Y[test_index], (-1)),  y_valid_pred))
    return(-np.mean(R2))


from hyperopt import fmin, tpe, rand, hp, Trials

space_gradient_boosting = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 1),
    "max_depth": hp.uniform("max_depth", 4,12),
    #"subsample": hp.uniform("subsample", 0.7, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 20, 200)}


trials = Trials()
best = fmin(fn = cross_validation_mse_gradient_boosting, space = space_gradient_boosting,
            algo=rand.suggest, max_evals = 200, trials=trials)''';

#### (c) Training and validating model:

In [6]:
# Fixed XGBoost settings for the ESM-1b model (presumably the outcome of the
# search in (b) -- confirm).
param = {
    'learning_rate': 0.051447544749765035,
    'max_delta_step': 2.956459783615207,
    'max_depth': 5.034202474908222,
    'min_child_weight': 7.457989829577018,
    'num_rounds': 297.50601395689256,
    'reg_alpha': 1.0858835704466614,
    'reg_lambda': 1.1385559144302175,
}

# The number of boosting rounds is handed to xgb.train as its own argument,
# so it is taken out of the booster-parameter dict.
num_round = param.pop("num_rounds")
# The stored depth is a float; round it to the nearest integer.
param["max_depth"] = int(np.round(param["max_depth"]))

In [7]:
# 5-fold cross-validation on the training data: fit on each fold's training
# rows, predict its held-out rows, and collect MSE / R2 / Pearson r per fold.
MSE, R2, Pearson = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(train_X[train_index], label=train_Y[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
    y_valid_pred = bst.predict(dvalid)

    y_valid_true = np.reshape(train_Y[test_index], (-1))
    MSE.append(np.mean(abs(y_valid_true - y_valid_pred)**2))
    R2.append(r2_score(y_valid_true, y_valid_pred))
    Pearson.append(stats.pearsonr(y_valid_true, y_valid_pred)[0])

for fold_scores in (Pearson, MSE, R2):
    print(fold_scores)

# Persist the per-fold scores, one .npy file per metric.
results_dir = join("..", "..", "data", "training_results")
np.save(join(results_dir, "Pearson_CV_xgboost_ESM1b.npy"), np.array(Pearson))
np.save(join(results_dir, "MSE_CV_xgboost_ESM1b.npy"), np.array(MSE))
np.save(join(results_dir, "R2_CV_xgboost_ESM1b.npy"), np.array(R2))

[0.6156521084065655, 0.5927689215737512, 0.5450373027741603, 0.6635468722921142, 0.5465250301214235]
[0.8251322874089426, 0.8206497271502332, 0.9401031382230501, 0.9269849644397952, 1.061793264104686]
[0.3720653400563064, 0.34881598167570627, 0.2894002515317964, 0.4219034383575203, 0.2977699878192147]


In [8]:
# Fit on the complete training set and score once on the test set.
dtrain = xgb.DMatrix(train_X, label = train_Y)
dtest = xgb.DMatrix(test_X)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

y_test_pred = bst.predict(dtest)
y_test_true = np.reshape(test_Y, (-1))
MSE_dif_fp_test = np.mean(abs(y_test_true - y_test_pred)**2)
R2_dif_fp_test = r2_score(y_test_true, y_test_pred)
# stats.pearsonr returns (r, p-value); only r is printed.
Pearson = stats.pearsonr(y_test_true, y_test_pred)

print(np.round(Pearson[0],3) ,np.round(MSE_dif_fp_test,3), np.round(R2_dif_fp_test,3))

# Save the predictions computed above instead of running bst.predict a second time.
np.save(join("..", "..", "data", "training_results", "y_test_pred_xgboost_ESM1b.npy"), y_test_pred)
np.save(join("..", "..", "data", "training_results", "y_test_true_xgboost_ESM1b.npy"), test_Y)

0.59 0.939 0.345


## 2. Training a model with only sequence information (ESM-1b_ts):

#### (a) Creating input matrices:

In [9]:
# Same layout as section 1, but the features come from the "ESM1b_ts" column.
train_X = train_ESM1b = np.array(data_train["ESM1b_ts"].tolist())
train_Y = np.array(data_train["log10_kcat"].tolist())

test_X = test_ESM1b = np.array(data_test["ESM1b_ts"].tolist())
test_Y = np.array(data_test["log10_kcat"].tolist())

#### (b) Hyperparameter optimization:

In [10]:
# Disabled hyperparameter search, kept for reference: the whole cell is a single
# triple-quoted string, so none of it runs (the trailing ";" suppresses the
# notebook echo of the string). The text describes a random search
# (hyperopt `rand.suggest`, 200 evaluations) configured for GPU training
# ("gpu_hist") whose objective is the negative mean R2 over the 5 CV folds.
# NOTE(review): the space below samples num_rounds from 20-200, while the
# parameter cell that follows hard-codes num_rounds ~ 313; presumably the space
# was changed after this snapshot was taken -- confirm.
'''def cross_validation_mse_gradient_boosting(param):
    num_round = param["num_rounds"]
    del param["num_rounds"]
    param["max_depth"] = int(np.round(param["max_depth"]))
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    
    MSE = []
    R2 = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(train_X[train_index], label = train_Y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
        y_valid_pred = bst.predict(dvalid)
        MSE.append(np.mean(abs(np.reshape(train_Y[test_index], (-1)) - y_valid_pred)**2))
        R2.append(r2_score(np.reshape(train_Y[test_index], (-1)),  y_valid_pred))
    return(-np.mean(R2))


from hyperopt import fmin, tpe, rand, hp, Trials

space_gradient_boosting = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 1),
    "max_depth": hp.uniform("max_depth", 4,12),
    #"subsample": hp.uniform("subsample", 0.7, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 20, 200)}


trials = Trials()
best = fmin(fn = cross_validation_mse_gradient_boosting, space = space_gradient_boosting,
            algo=rand.suggest, max_evals = 200, trials=trials)''';

#### (c) Training and validating model:

In [11]:
# Fixed XGBoost settings for the ESM-1b_ts model (presumably the outcome of the
# search in (b) -- confirm).
param = {
    'learning_rate': 0.2831145406836757,
    'max_delta_step': 0.07686715986169101,
    'max_depth': 4.96836783761305,
    'min_child_weight': 6.905400087083855,
    'num_rounds': 313.1498988074061,
    'reg_alpha': 1.717314107718892,
    'reg_lambda': 2.470354543039016,
}

# Boosting rounds are a separate xgb.train argument, so remove them from the dict.
num_round = param.pop("num_rounds")
# The stored depth is a float; round it to the nearest integer.
param["max_depth"] = int(np.round(param["max_depth"]))

In [12]:
# 5-fold cross-validation on the training data: fit on each fold's training
# rows, predict its held-out rows, and collect MSE / R2 / Pearson r per fold.
MSE, R2, Pearson = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(train_X[train_index], label=train_Y[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
    y_valid_pred = bst.predict(dvalid)

    y_valid_true = np.reshape(train_Y[test_index], (-1))
    MSE.append(np.mean(abs(y_valid_true - y_valid_pred)**2))
    R2.append(r2_score(y_valid_true, y_valid_pred))
    Pearson.append(stats.pearsonr(y_valid_true, y_valid_pred)[0])

for fold_scores in (Pearson, MSE, R2):
    print(fold_scores)

# Persist the per-fold scores, one .npy file per metric.
results_dir = join("..", "..", "data", "training_results")
np.save(join(results_dir, "Pearson_CV_xgboost_ESM1b_ts.npy"), np.array(Pearson))
np.save(join(results_dir, "MSE_CV_xgboost_ESM1b_ts.npy"), np.array(MSE))
np.save(join(results_dir, "R2_CV_xgboost_ESM1b_ts.npy"), np.array(R2))

[0.6089419355661412, 0.5895091873624034, 0.5917286045189503, 0.642803324015263, 0.5233641744058668]
[0.830309790220053, 0.8254878887860261, 0.8642504226487573, 0.9497311132104741, 1.1100478926967097]
[0.3681252040118652, 0.3449769094978131, 0.34673536553812156, 0.4077182348220084, 0.26585619671739524]


In [13]:
# Fit on the complete training set and score once on the test set.
dtrain = xgb.DMatrix(train_X, label = train_Y)
dtest = xgb.DMatrix(test_X)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

y_test_pred = bst.predict(dtest)
y_test_true = np.reshape(test_Y, (-1))
MSE_dif_fp_test = np.mean(abs(y_test_true - y_test_pred)**2)
R2_dif_fp_test = r2_score(y_test_true, y_test_pred)
# stats.pearsonr returns (r, p-value); only r is printed.
Pearson = stats.pearsonr(y_test_true, y_test_pred)

print(np.round(Pearson[0],3) ,np.round(MSE_dif_fp_test,3), np.round(R2_dif_fp_test,3))

# Save the predictions computed above instead of running bst.predict a second time.
np.save(join("..", "..", "data", "training_results", "y_test_pred_xgboost_ESM1b_ts.npy"), y_test_pred)
np.save(join("..", "..", "data", "training_results", "y_test_true_xgboost_ESM1b_ts.npy"), test_Y)

0.608 0.905 0.369


#### (d) Training model with test and train data for production mode:

In [14]:
# Production mode: the final sequence-only model is fitted on train + test rows.
train_ESM1b = np.array(list(data_train["ESM1b_ts"]))
train_Y = np.array(list(data_train["log10_kcat"]))

test_ESM1b = np.array(list(data_test["ESM1b_ts"]))
test_Y = np.array(list(data_test["log10_kcat"]))
# Set test_X explicitly so the training cell that follows does not depend on
# whatever test_X was left behind by an earlier section.
test_X = test_ESM1b

# Stack test rows under the training rows (features and targets in the same order).
train_X = np.concatenate([train_ESM1b, test_ESM1b])
train_Y = np.concatenate([train_Y, test_Y])

In [15]:
# Same hyperparameters as the ESM-1b_ts model validated above.
param = {'learning_rate': 0.2831145406836757,
         'max_delta_step': 0.07686715986169101,
         'max_depth': 4.96836783761305,
         'min_child_weight': 6.905400087083855,
         'num_rounds': 313.1498988074061,
         'reg_alpha': 1.717314107718892,
         'reg_lambda': 2.470354543039016}

# Boosting rounds are a separate xgb.train argument; max_depth must be an int.
num_round = param["num_rounds"]
param["max_depth"] = int(np.round(param["max_depth"]))

del param["num_rounds"]

dtrain = xgb.DMatrix(train_X, label = train_Y)
dtest = xgb.DMatrix(test_X)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

# NOTE: in this section the test rows were concatenated into train_X / train_Y,
# so the numbers printed below measure fit on seen data, not generalisation.
y_test_pred = bst.predict(dtest)
MSE_dif_fp_test = np.mean(abs(np.reshape(test_Y, (-1)) - y_test_pred)**2)
R2_dif_fp_test = r2_score(np.reshape(test_Y, (-1)), y_test_pred)
Pearson = stats.pearsonr(np.reshape(test_Y, (-1)), y_test_pred)

print(Pearson, MSE_dif_fp_test, R2_dif_fp_test)

# Use a context manager so the file handle is flushed and closed deterministically.
model_path = join("..", "..", "data", "training_results", "saved_models",
                  "xgboost_sequence_only_train_and_test.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump(bst, model_file)

(0.9839122286376513, 0.0) 0.05006636413883383 0.9650757482138885


## 3. Training a model with only reaction information (difference fingerprint):

#### (a) Creating input matrices:

In [16]:
# Features: the "difference_fp" column stacked into a matrix; target: "log10_kcat".
train_X = np.array(data_train["difference_fp"].tolist())
test_X = np.array(data_test["difference_fp"].tolist())

train_Y = np.array(data_train["log10_kcat"].tolist())
test_Y = np.array(data_test["log10_kcat"].tolist())

#### (b) Hyperparameter optimization:

In [17]:
# Disabled hyperparameter search, kept for reference: the whole cell is a single
# triple-quoted string, so none of it runs (the trailing ";" suppresses the
# notebook echo of the string). The text describes a random search
# (hyperopt `rand.suggest`, 200 evaluations) configured for GPU training
# ("gpu_hist") whose objective is the negative mean R2 over the 5 CV folds.
# NOTE(review): the space below samples num_rounds from 20-200, while the
# parameter cell that follows hard-codes num_rounds ~ 362; presumably the space
# was changed after this snapshot was taken -- confirm.
'''def cross_validation_mse_gradient_boosting(param):
    num_round = param["num_rounds"]
    del param["num_rounds"]
    param["max_depth"] = int(np.round(param["max_depth"]))
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    
    MSE = []
    R2 = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(train_X[train_index], label = train_Y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
        y_valid_pred = bst.predict(dvalid)
        MSE.append(np.mean(abs(np.reshape(train_Y[test_index], (-1)) - y_valid_pred)**2))
        R2.append(r2_score(np.reshape(train_Y[test_index], (-1)),  y_valid_pred))
    return(-np.mean(R2))


from hyperopt import fmin, tpe, rand, hp, Trials

space_gradient_boosting = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 1),
    "max_depth": hp.uniform("max_depth", 4,12),
    #"subsample": hp.uniform("subsample", 0.7, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 20, 200)}


trials = Trials()
best = fmin(fn = cross_validation_mse_gradient_boosting, space = space_gradient_boosting,
            algo=rand.suggest, max_evals = 200, trials=trials)''';

#### (c) Training and validating model:

In [18]:
# Fixed XGBoost settings for the difference-fingerprint model (presumably the
# outcome of the search in (b) -- confirm).
param = {
    'learning_rate': 0.14154883958006167,
    'max_delta_step': 0.02234358170535966,
    'max_depth': 10.869653004093198,
    'min_child_weight': 1.7936882442746056,
    'num_rounds': 361.6168542774665,
    'reg_alpha': 4.825525325323308,
    'reg_lambda': 2.74944090578774,
}

# Boosting rounds are a separate xgb.train argument, so remove them from the dict.
num_round = param.pop("num_rounds")
# The stored depth is a float; round it to the nearest integer.
param["max_depth"] = int(np.round(param["max_depth"]))

In [19]:
# 5-fold cross-validation on the training data: fit on each fold's training
# rows, predict its held-out rows, and collect MSE / R2 / Pearson r per fold.
MSE, R2, Pearson = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(train_X[train_index], label=train_Y[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
    y_valid_pred = bst.predict(dvalid)

    y_valid_true = np.reshape(train_Y[test_index], (-1))
    MSE.append(np.mean(abs(y_valid_true - y_valid_pred)**2))
    R2.append(r2_score(y_valid_true, y_valid_pred))
    Pearson.append(stats.pearsonr(y_valid_true, y_valid_pred)[0])

for fold_scores in (Pearson, MSE, R2):
    print(fold_scores)

# Persist the per-fold scores, one .npy file per metric.
results_dir = join("..", "..", "data", "training_results")
np.save(join(results_dir, "Pearson_CV_xgboost_diff_fp.npy"), np.array(Pearson))
np.save(join(results_dir, "MSE_CV_xgboost_diff_fp.npy"), np.array(MSE))
np.save(join(results_dir, "R2_CV_xgboost_diff_fp.npy"), np.array(R2))

[0.5933818458131599, 0.5290768248822284, 0.542607402759881, 0.5889341652634299, 0.5248561000592911]
[0.8733503195891379, 0.9112072460648096, 0.9525515259896388, 1.0924558329257095, 1.1054594918609029]
[0.3353708922662406, 0.27695876037247324, 0.27999083584524465, 0.31871067494359473, 0.2688907919476974]


In [20]:
# Fit on the complete training set and score once on the test set.
dtrain = xgb.DMatrix(train_X, label = train_Y)
dtest = xgb.DMatrix(test_X)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

y_test_pred = bst.predict(dtest)
y_test_true = np.reshape(test_Y, (-1))
MSE_dif_fp_test = np.mean(abs(y_test_true - y_test_pred)**2)
R2_dif_fp_test = r2_score(y_test_true, y_test_pred)
# stats.pearsonr returns (r, p-value); only r is printed.
Pearson = stats.pearsonr(y_test_true, y_test_pred)

print(np.round(Pearson[0],3) ,np.round(MSE_dif_fp_test,3), np.round(R2_dif_fp_test,3))


# Save the predictions computed above instead of running bst.predict a second time.
np.save(join("..", "..", "data", "training_results", "y_test_pred_xgboost_diff_fp.npy"), y_test_pred)
np.save(join("..", "..", "data", "training_results", "y_test_true_xgboost_diff_fp.npy"), test_Y)

0.6 0.948 0.339


## 4. Training a model with only reaction information (structural fingerprint):

#### (a) Creating input matrices:

In [21]:
# Each "structural_fp" entry is split into its elements and cast to int, giving
# one integer row per sample (presumably the entries are bit strings such as
# "0101..." -- confirm).
# Built with a single comprehension over the column values: growing a tuple by
# concatenation inside a loop copies it on every step (quadratic), and the
# label lookup `[ind]` would return several rows if the index were not unique.
train_X = np.array([np.array(list(fp)).astype(int) for fp in data_train["structural_fp"]])
train_Y = np.array(list(data_train["log10_kcat"]))


test_X = np.array([np.array(list(fp)).astype(int) for fp in data_test["structural_fp"]])
test_Y = np.array(list(data_test["log10_kcat"]))

#### (b) Hyperparameter optimization:

In [22]:
# Disabled hyperparameter search, kept for reference: the whole cell is a single
# triple-quoted string, so none of it runs (the trailing ";" suppresses the
# notebook echo of the string). The text describes a random search
# (hyperopt `rand.suggest`, 200 evaluations) configured for GPU training
# ("gpu_hist") whose objective is the negative mean R2 over the 5 CV folds.
# NOTE(review): the space below samples num_rounds from 20-200, while the
# parameter cell that follows hard-codes num_rounds ~ 489; presumably the space
# was changed after this snapshot was taken -- confirm.
'''def cross_validation_mse_gradient_boosting(param):
    num_round = param["num_rounds"]
    del param["num_rounds"]
    param["max_depth"] = int(np.round(param["max_depth"]))
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    
    MSE = []
    R2 = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(train_X[train_index], label = train_Y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
        y_valid_pred = bst.predict(dvalid)
        MSE.append(np.mean(abs(np.reshape(train_Y[test_index], (-1)) - y_valid_pred)**2))
        R2.append(r2_score(np.reshape(train_Y[test_index], (-1)),  y_valid_pred))
    return(-np.mean(R2))


from hyperopt import fmin, tpe, rand, hp, Trials

space_gradient_boosting = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 1),
    "max_depth": hp.uniform("max_depth", 4,12),
    #"subsample": hp.uniform("subsample", 0.7, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 20, 200)}


trials = Trials()
best = fmin(fn = cross_validation_mse_gradient_boosting, space = space_gradient_boosting,
            algo=rand.suggest, max_evals = 200, trials=trials)''';

#### (c) Training and validating model:

In [23]:
# Fixed XGBoost settings for the structural-fingerprint model (presumably the
# outcome of the search in (b) -- confirm).
param = {
    'learning_rate': 0.01126910440903659,
    'max_delta_step': 0.5777120839605732,
    'max_depth': 5.486901609313889,
    'min_child_weight': 6.14467742389769,
    'num_rounds': 488.943459090126,
    'reg_alpha': 4.629840853377147,
    'reg_lambda': 2.1047561335691745,
}

# Boosting rounds are a separate xgb.train argument, so remove them from the dict.
num_round = param.pop("num_rounds")
# The stored depth is a float; round it to the nearest integer.
param["max_depth"] = int(np.round(param["max_depth"]))

In [24]:
# 5-fold cross-validation on the training data: fit on each fold's training
# rows, predict its held-out rows, and collect MSE / R2 / Pearson r per fold.
MSE, R2, Pearson = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(train_X[train_index], label=train_Y[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
    y_valid_pred = bst.predict(dvalid)

    y_valid_true = np.reshape(train_Y[test_index], (-1))
    MSE.append(np.mean(abs(y_valid_true - y_valid_pred)**2))
    R2.append(r2_score(y_valid_true, y_valid_pred))
    Pearson.append(stats.pearsonr(y_valid_true, y_valid_pred)[0])

for fold_scores in (Pearson, MSE, R2):
    print(fold_scores)

# Persist the per-fold scores, one .npy file per metric.
results_dir = join("..", "..", "data", "training_results")
np.save(join(results_dir, "Pearson_CV_xgboost_str_fp.npy"), np.array(Pearson))
np.save(join(results_dir, "MSE_CV_xgboost_str_fp.npy"), np.array(MSE))
np.save(join(results_dir, "R2_CV_xgboost_str_fp.npy"), np.array(R2))

[0.5536292775258077, 0.532314381623744, 0.4889091394899996, 0.61040565169482, 0.5039256159780713]
[0.917613544024189, 0.9056644419444148, 1.015607786275439, 1.064522657133946, 1.1289411204418558]
[0.3016860962550283, 0.2813569650394473, 0.23232823280029657, 0.3361306693344771, 0.2533609285723336]


In [25]:
# Fit on the complete training set and score once on the test set.
dtrain = xgb.DMatrix(train_X, label = train_Y)
dtest = xgb.DMatrix(test_X)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

y_test_pred = bst.predict(dtest)
y_test_true = np.reshape(test_Y, (-1))
MSE_dif_fp_test = np.mean(abs(y_test_true - y_test_pred)**2)
R2_dif_fp_test = r2_score(y_test_true, y_test_pred)
# stats.pearsonr returns (r, p-value); only r is printed.
Pearson = stats.pearsonr(y_test_true, y_test_pred)

print(np.round(Pearson[0],3) ,np.round(MSE_dif_fp_test,3), np.round(R2_dif_fp_test,3))


# Save the predictions computed above instead of running bst.predict a second time.
np.save(join("..", "..", "data", "training_results", "y_test_pred_xgboost_str_fp.npy"), y_test_pred)
np.save(join("..", "..", "data", "training_results", "y_test_true_xgboost_str_fp.npy"), test_Y)

0.561 0.994 0.307


## 5. Training a model with enzyme and reaction information (ESM1b_ts/diff_fp):

#### (a) Creating input matrices:

In [26]:
# Feature rows are the "difference_fp" values followed by the "ESM1b_ts" values
# (column-wise concatenation); the target is "log10_kcat".
train_X = np.concatenate([np.array(data_train["difference_fp"].tolist()),
                          np.array(data_train["ESM1b_ts"].tolist())], axis=1)
train_Y = np.array(data_train["log10_kcat"].tolist())

test_X = np.concatenate([np.array(data_test["difference_fp"].tolist()),
                         np.array(data_test["ESM1b_ts"].tolist())], axis=1)
test_Y = np.array(data_test["log10_kcat"].tolist())

#### (b) Hyperparameter optimization:

In [27]:
# Disabled hyperparameter search, kept for reference: the whole cell is a single
# triple-quoted string, so none of it runs (the trailing ";" suppresses the
# notebook echo of the string). The text describes a random search
# (hyperopt `rand.suggest`, 200 evaluations) configured for GPU training
# ("gpu_hist") whose objective is the negative mean R2 over the 5 CV folds.
# NOTE(review): the space below samples num_rounds from 20-200 and reg_alpha
# from 0-5, while the parameter cell that follows hard-codes num_rounds ~ 260
# and reg_alpha ~ 7.33; presumably the space was changed after this snapshot
# was taken -- confirm.
'''def cross_validation_mse_gradient_boosting(param):
    num_round = param["num_rounds"]
    del param["num_rounds"]
    param["max_depth"] = int(np.round(param["max_depth"]))
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    
    MSE = []
    R2 = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(train_X[train_index], label = train_Y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
        y_valid_pred = bst.predict(dvalid)
        MSE.append(np.mean(abs(np.reshape(train_Y[test_index], (-1)) - y_valid_pred)**2))
        R2.append(r2_score(np.reshape(train_Y[test_index], (-1)),  y_valid_pred))
    return(-np.mean(R2))


from hyperopt import fmin, tpe, rand, hp, Trials

space_gradient_boosting = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 1),
    "max_depth": hp.uniform("max_depth", 4,12),
    #"subsample": hp.uniform("subsample", 0.7, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 20, 200)}


trials = Trials()
best = fmin(fn = cross_validation_mse_gradient_boosting, space = space_gradient_boosting,
            algo=rand.suggest, max_evals = 200, trials=trials)''';

#### (c) Training and validating model:

In [28]:
# Fixed XGBoost settings for the combined ESM-1b_ts + difference-fingerprint
# model (presumably the outcome of the search in (b) -- confirm).
param = {
    'learning_rate': 0.5727401435817077,
    'max_delta_step': 0.022572978136953803,
    'max_depth': 9.734956573895278,
    'min_child_weight': 2.026404280518698,
    'num_rounds': 259.69265795096726,
    'reg_alpha': 7.333074414515098,
    'reg_lambda': 0.8545111451043885,
}

# Boosting rounds are a separate xgb.train argument, so remove them from the dict.
num_round = param.pop("num_rounds")
# The stored depth is a float; round it to the nearest integer.
param["max_depth"] = int(np.round(param["max_depth"]))

In [31]:
# 5-fold cross-validation on the training data: fit on each fold's training
# rows, predict its held-out rows, and collect MSE / R2 / Pearson r per fold.
MSE, R2, Pearson = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(train_X[train_index], label=train_Y[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
    y_valid_pred = bst.predict(dvalid)

    y_valid_true = np.reshape(train_Y[test_index], (-1))
    MSE.append(np.mean(abs(y_valid_true - y_valid_pred)**2))
    R2.append(r2_score(y_valid_true, y_valid_pred))
    Pearson.append(stats.pearsonr(y_valid_true, y_valid_pred)[0])

for fold_scores in (Pearson, MSE, R2):
    print(fold_scores)

# Persist the per-fold scores, one .npy file per metric.
results_dir = join("..", "..", "data", "training_results")
np.save(join(results_dir, "Pearson_CV_xgboost_ESM1b_ts_diff_fp.npy"), np.array(Pearson))
np.save(join(results_dir, "MSE_CV_xgboost_ESM1b_ts_diff_fp.npy"), np.array(MSE))
np.save(join(results_dir, "R2_CV_xgboost_ESM1b_ts_diff_fp.npy"), np.array(R2))

[0.6445379171956692, 0.5663538267118359, 0.5813292687854973, 0.6603671022176677, 0.5437710214323066]
[0.7778549566615786, 0.8621287188783706, 0.8811802952099859, 0.9212592601438562, 1.0674061317478523]
[0.40804390380771904, 0.31590247958588724, 0.33393851092241733, 0.4254741650612315, 0.2940578488872041]


In [33]:
# Fit on the complete training set and score once on the test set.
dtrain = xgb.DMatrix(train_X, label = train_Y)
# The test matrix is only used for prediction, so it carries no label
# (same as the other sections of this notebook).
dtest = xgb.DMatrix(test_X)


bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

y_test_pred = bst.predict(dtest)
y_test_true = np.reshape(test_Y, (-1))
MSE_dif_fp_test = np.mean(abs(y_test_true - y_test_pred)**2)
R2_dif_fp_test = r2_score(y_test_true, y_test_pred)
# stats.pearsonr returns (r, p-value); only r is printed.
Pearson = stats.pearsonr(y_test_true, y_test_pred)

print(np.round(Pearson[0],3) ,np.round(MSE_dif_fp_test,3), np.round(R2_dif_fp_test,3))


# Save the predictions computed above instead of running bst.predict a second time.
np.save(join("..", "..", "data", "training_results", "y_test_pred_xgboost_ESM1b_ts_diff_fp.npy"), y_test_pred)
np.save(join("..", "..", "data", "training_results", "y_test_true_xgboost_ESM1b_ts_diff_fp.npy"), test_Y)

0.629 0.868 0.394


#### (d) Training model with test and train data for production mode:

In [34]:
# Production mode: build the train and test feature blocks separately
# ("difference_fp" columns followed by "ESM1b_ts" columns), then stack the test
# rows under the training rows so the final model is fitted on all samples.
train_rows_X = np.concatenate([np.array(data_train["difference_fp"].tolist()),
                               np.array(data_train["ESM1b_ts"].tolist())], axis=1)
train_rows_Y = np.array(data_train["log10_kcat"].tolist())

test_X = np.concatenate([np.array(data_test["difference_fp"].tolist()),
                         np.array(data_test["ESM1b_ts"].tolist())], axis=1)
test_Y = np.array(data_test["log10_kcat"].tolist())

train_X = np.concatenate([train_rows_X, test_X])
train_Y = np.concatenate([train_rows_Y, test_Y])

In [35]:
# Same hyperparameters as the ESM-1b_ts + difference-fingerprint model validated above.
param = {'learning_rate': 0.5727401435817077,
         'max_delta_step': 0.022572978136953803,
         'max_depth': 9.734956573895278,
         'min_child_weight': 2.026404280518698,
         'num_rounds': 259.69265795096726,
         'reg_alpha': 7.333074414515098,
         'reg_lambda': 0.8545111451043885}

# Boosting rounds are a separate xgb.train argument; max_depth must be an int.
num_round = param["num_rounds"]
param["max_depth"] = int(np.round(param["max_depth"]))

del param["num_rounds"]

dtrain = xgb.DMatrix(train_X, label = train_Y)
dtest = xgb.DMatrix(test_X)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

# NOTE: in this section the test rows were concatenated into train_X / train_Y,
# so the numbers printed below measure fit on seen data, not generalisation.
y_test_pred = bst.predict(dtest)
MSE_dif_fp_test = np.mean(abs(np.reshape(test_Y, (-1)) - y_test_pred)**2)
R2_dif_fp_test = r2_score(np.reshape(test_Y, (-1)), y_test_pred)
Pearson = stats.pearsonr(np.reshape(test_Y, (-1)), y_test_pred)

print(Pearson, MSE_dif_fp_test, R2_dif_fp_test)

# Use a context manager so the file handle is flushed and closed deterministically.
model_path = join("..", "..", "data", "training_results", "saved_models",
                  "xgboost_train_and_test.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump(bst, model_file)

(0.9910659621749647, 0.0) 0.027904669605058757 0.9805348416235519


## 6. Training a model with enzyme rep., reaction information, and additional features (ESM1b_ts/diff_fp/flux/KM):

Loading the dataset with additional features:

In [36]:
# Replace data_train / data_test with the split files that also carry the
# "KM" and "flux" columns used in this section.
splits_dir = join("..", "..", "data", "kcat_data", "splits")
data_train = pd.read_pickle(join(splits_dir, "train_df_kcat_with_KM_and_flux.pkl"))
data_test = pd.read_pickle(join(splits_dir, "test_df_kcat_with_KM_and_flux.pkl"))

# The target column is stored as "geomean_kcat"; later cells read it as "log10_kcat".
for frame in (data_train, data_test):
    frame.rename(columns={"geomean_kcat": "log10_kcat"}, inplace=True)
len(data_train), len(data_test)

(3421, 850)

In [37]:
# Feature rows: "difference_fp" values, then "ESM1b_ts" values, then the scalar
# "KM" and "flux" values (each reshaped to a single column).
# The placeholder `= ()` assignments were dropped: they were overwritten on the
# very next line and never read.
train_X = np.concatenate([np.array(list(data_train["difference_fp"])),
                          np.array(list(data_train["ESM1b_ts"])),
                          np.reshape(np.array(list(data_train["KM"])), (-1,1)),
                          np.reshape(np.array(list(data_train["flux"])), (-1,1))], axis = 1)
train_Y = np.array(list(data_train["log10_kcat"]))

test_X = np.concatenate([np.array(list(data_test["difference_fp"])),
                         np.array(list(data_test["ESM1b_ts"])),
                         np.reshape(np.array(list(data_test["KM"])), (-1,1)),
                         np.reshape(np.array(list(data_test["flux"])), (-1,1))], axis = 1)
test_Y = np.array(list(data_test["log10_kcat"]))

In [38]:
# Disabled hyperparameter search, kept for reference: the whole cell is a single
# triple-quoted string, so none of it runs (the trailing ";" suppresses the
# notebook echo of the string). The text describes a random search
# (hyperopt `rand.suggest`, 200 evaluations) configured for GPU training
# ("gpu_hist") whose objective is the negative mean R2 over the 5 CV folds.
# NOTE(review): the space below samples num_rounds from 20-200, while the
# parameter cell that follows hard-codes num_rounds ~ 294; presumably the space
# was changed after this snapshot was taken -- confirm.
'''def cross_validation_mse_gradient_boosting(param):
    num_round = param["num_rounds"]
    del param["num_rounds"]
    param["max_depth"] = int(np.round(param["max_depth"]))
    param["tree_method"] = "gpu_hist"
    param["sampling_method"] = "gradient_based"
    
    MSE = []
    R2 = []
    for i in range(5):
        train_index, test_index  = train_indices[i], test_indices[i]
        dtrain = xgb.DMatrix(train_X[train_index], label = train_Y[train_index])
        dvalid = xgb.DMatrix(train_X[test_index])
        bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
        y_valid_pred = bst.predict(dvalid)
        MSE.append(np.mean(abs(np.reshape(train_Y[test_index], (-1)) - y_valid_pred)**2))
        R2.append(r2_score(np.reshape(train_Y[test_index], (-1)),  y_valid_pred))
    return(-np.mean(R2))


from hyperopt import fmin, tpe, rand, hp, Trials

space_gradient_boosting = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 1),
    "max_depth": hp.uniform("max_depth", 4,12),
    #"subsample": hp.uniform("subsample", 0.7, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "max_delta_step": hp.uniform("max_delta_step", 0, 5),
    "min_child_weight": hp.uniform("min_child_weight", 0.1, 15),
    "num_rounds":  hp.uniform("num_rounds", 20, 200)}


trials = Trials()
best = fmin(fn = cross_validation_mse_gradient_boosting, space = space_gradient_boosting,
            algo=rand.suggest, max_evals = 200, trials=trials)''';

In [39]:
# Fixed XGBoost settings for the model with the additional KM / flux features
# (presumably the outcome of the search in the previous cell -- confirm).
param = {
    'learning_rate': 0.15055870206296312,
    'max_delta_step': 0.13821534396910307,
    'max_depth': 5.338955142881738,
    'min_child_weight': 14.84613730467497,
    'num_rounds': 294.13028718637383,
    'reg_alpha': 2.6752278199969153,
    'reg_lambda': 0.6063171152564584,
}

# Boosting rounds are a separate xgb.train argument, so remove them from the dict.
num_round = param.pop("num_rounds")
# The stored depth is a float; round it to the nearest integer.
param["max_depth"] = int(np.round(param["max_depth"]))

In [40]:
# 5-fold cross-validation on the training data: fit on each fold's training
# rows, predict its held-out rows, and collect MSE / R2 / Pearson r per fold.
MSE, R2, Pearson = [], [], []

for i in range(5):
    train_index = train_indices[i]
    test_index = test_indices[i]

    dtrain = xgb.DMatrix(train_X[train_index], label=train_Y[train_index])
    dvalid = xgb.DMatrix(train_X[test_index])
    bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)
    y_valid_pred = bst.predict(dvalid)

    y_valid_true = np.reshape(train_Y[test_index], (-1))
    MSE.append(np.mean(abs(y_valid_true - y_valid_pred)**2))
    R2.append(r2_score(y_valid_true, y_valid_pred))
    Pearson.append(stats.pearsonr(y_valid_true, y_valid_pred)[0])

for fold_scores in (Pearson, MSE, R2):
    print(fold_scores)

# Persist the per-fold scores, one .npy file per metric.
# NOTE(review): the file names say "ESM1b" although this section's features are
# built from the "ESM1b_ts" column; names kept unchanged because other code may
# read these paths -- confirm.
results_dir = join("..", "..", "data", "training_results")
np.save(join(results_dir, "Pearson_CV_xgboost_ESM1b_diff_fp_flux_KM.npy"), np.array(Pearson))
np.save(join(results_dir, "MSE_CV_xgboost_ESM1b_diff_fp_flux_KM.npy"), np.array(MSE))
np.save(join(results_dir, "R2_CV_xgboost_ESM1b_diff_fp_flux_KM.npy"), np.array(R2))

[0.6381726941853013, 0.5782540928851373, 0.5816409524748034, 0.6501114403040922, 0.5359555917016532]
[0.7820629557858503, 0.843033028077367, 0.8823309866269835, 0.9415727118823687, 1.085306625916712]
[0.40484157063101067, 0.33105487439829584, 0.33306873291802874, 0.412806066811956, 0.28221913728185244]


In [41]:
# Fit on the complete training set and score once on the test set.
dtrain = xgb.DMatrix(train_X, label = train_Y)
dtest = xgb.DMatrix(test_X)

bst = xgb.train(param, dtrain, int(num_round), verbose_eval=False)

y_test_pred = bst.predict(dtest)
y_test_true = np.reshape(test_Y, (-1))
MSE_dif_fp_test = np.mean(abs(y_test_true - y_test_pred)**2)
R2_dif_fp_test = r2_score(y_test_true, y_test_pred)
# stats.pearsonr returns (r, p-value); only r is printed.
Pearson = stats.pearsonr(y_test_true, y_test_pred)

print(Pearson[0], MSE_dif_fp_test, R2_dif_fp_test)


# Save the predictions computed above instead of running bst.predict a second time.
np.save(join("..", "..", "data", "training_results", "y_test_pred_xgboost_ESM1b_diff_fp_flux_KM.npy"), y_test_pred)
np.save(join("..", "..", "data", "training_results", "y_test_true_xgboost_ESM1b_diff_fp_flux_KM.npy"), test_Y)

0.6203728110199708 0.8836397180206959 0.3836090051439036
