In [17]:
import copy

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.metrics import r2_score
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import root_mean_squared_error
import src.generate_encodings as ge
import src.prediction_models_es as pm
import tqdm
import os, sys
from joblib import parallel_backend
import ast


In [18]:
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

In [19]:
"""Extract Sequence-Embeddings and Scores """

e_type = "blosum80"

import_data = "../Data/NOD.csv"
embeddings = []
labels = []
with open(import_data, "r") as infile:
    for line in infile.readlines()[1:]:
        line = line[:-1].split(",")
        sequence = line[0]
        label = line[2]
        representation = ge.generate_sequence_encodings(e_type, [sequence])[0]
        embeddings.append(representation)
        labels.append(label)

# print(labels)
# print(embeddings)


In [22]:
import random

"""Train the Model"""
rmse = 999
regressor = pm.ActivityPredictor(model_type="xgboost", x_arr=embeddings, y_arr=labels,
                                 split=(80, 10, 10))
regressor.train(k_folds=5)
while rmse > 0.18:
    with HiddenPrints():
        regressor = pm.ActivityPredictor(model_type="xgboost", x_arr=embeddings, y_arr=labels)
        regressor.train(k_folds=5)
    rmse = regressor.get_performance()[1]

x_train = regressor.get_data(prepared=True)["x_train"]
y_train = regressor.get_data(prepared=True)["y_train"]

x_valr = regressor.get_data(prepared=False)["x_val"]
y_valr = regressor.get_data(prepared=False)["y_val"]


[0]	validation_0-rmse:0.32696
[1]	validation_0-rmse:0.28319
[2]	validation_0-rmse:0.26110
[3]	validation_0-rmse:0.24661
[4]	validation_0-rmse:0.24132
[5]	validation_0-rmse:0.24141
[6]	validation_0-rmse:0.23873
[7]	validation_0-rmse:0.23779
[8]	validation_0-rmse:0.23751
[9]	validation_0-rmse:0.23561
[10]	validation_0-rmse:0.23407
[11]	validation_0-rmse:0.23286
[12]	validation_0-rmse:0.23418
[13]	validation_0-rmse:0.23331
[14]	validation_0-rmse:0.23401
[15]	validation_0-rmse:0.23424
[16]	validation_0-rmse:0.23460
[17]	validation_0-rmse:0.23583
[18]	validation_0-rmse:0.23589
[19]	validation_0-rmse:0.23537
[20]	validation_0-rmse:0.23589
[21]	validation_0-rmse:0.23605
[0]	validation_0-rmse:0.28331
[1]	validation_0-rmse:0.24845
[2]	validation_0-rmse:0.22723
[3]	validation_0-rmse:0.22119
[4]	validation_0-rmse:0.21784
[5]	validation_0-rmse:0.21851
[6]	validation_0-rmse:0.21843
[7]	validation_0-rmse:0.22150
[8]	validation_0-rmse:0.22101
[9]	validation_0-rmse:0.22164
[10]	validation_0-rmse:0.222

In [23]:
y_pred = regressor.predict(x_valr)
rmse = root_mean_squared_error(y_valr, y_pred)
models = regressor.get_model()

best_iteration = 0

xgb_hyperCV = {
    "max_depth": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    "eta": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    "n_estimators": [50, 100, 200, 300, 400, 500],
    "reg_alpha": [0.0, 0.1, 0.5, 1, 1.5],
    "reg_lambda": [0.0, 0.1, 0.5, 1, 1.5],
    # "min_samples_split": [2, 3, 4, 5],
    # "min_samples_leaf": [1, 2, 3, 4, 5],
    # "bootstrap": [False, True]
}

if isinstance(models, list):
    hyped_models = []
    ensemble = regressor.get_model()
    for model in ensemble:
        search = HalvingGridSearchCV(estimator=model, factor=2, param_grid=xgb_hyperCV, cv=5,
                                     scoring="neg_root_mean_squared_error",
                                     n_jobs=20)

        search.fit(x_train, y_train)
        print(search.best_params_)
        hyped_models.append(model)
        regressor.set_model(hyped_models, is_trained=True)
else:
    search = HalvingGridSearchCV(estimator=models, factor=2, param_grid=xgb_hyperCV, cv=5,
                                 scoring="neg_root_mean_squared_error",
                                 n_jobs=20)
    search.fit(x_train, y_train)
    print(search.best_params_)
    regressor.set_model(models, is_trained=True)

    print()

ValueError: 
All the 45000 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45000 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/xgboost/sklearn.py", line 1170, in fit
    self._Booster = train(
                    ^^^^^^
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/xgboost/training.py", line 182, in train
    if cb_container.after_iteration(bst, i, dtrain, evals):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/xgboost/callback.py", line 261, in after_iteration
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/xgboost/callback.py", line 261, in <genexpr>
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/conda/envs/MAP/lib/python3.12/site-packages/xgboost/callback.py", line 446, in after_iteration
    raise ValueError(msg)
ValueError: Must have at least 1 validation dataset for early stopping.


In [16]:
models = regressor.get_model()

if isinstance(models, list):
    for i, model in enumerate(models):
        x_val = regressor.get_data(prepared=True)["x_val"]
        y_pred = model.predict(x_val)
        fold_r2 = r2_score(y_valr, y_pred)
        fold_rmse = root_mean_squared_error(y_valr, y_pred)
        print(i, ". R2:", round(fold_r2, 3))
        print(i, ". RMSE", round(fold_rmse, 3))

y_pred = regressor.predict(x_val)
model_r2 = r2_score(y_valr, y_pred)
model_rmse = root_mean_squared_error(y_valr, y_pred)
print()
print(f"Total Model Performance: {round(model_r2,3)}, {round(model_rmse,3)}")

0 . R2: 0.758
0 . RMSE 0.177
1 . R2: 0.839
1 . RMSE 0.144
2 . R2: 0.749
2 . RMSE 0.18
3 . R2: 0.793
3 . RMSE 0.163
4 . R2: 0.821
4 . RMSE 0.152

Total Model Performance: 0.851, 0.138


In [None]:
"""Apply early stopping on models"""