In [4]:
import random
import numpy as np
import src.prediction_models as pm
from sklearn import metrics
from typing import Literal, Optional
import src.generate_encodings as ge
import optuna
import xgboost as xgb
from optuna import create_study, logging
from optuna.pruners import MedianPruner
import os, sys

In [5]:
class HiddenPrints:
    """Context manager that silences ``print`` by redirecting ``sys.stdout`` to ``os.devnull``.

    On entry the current ``sys.stdout`` is remembered and replaced by a
    write handle on ``os.devnull``; on exit the remembered stream is put
    back and the devnull handle is closed. Exceptions raised inside the
    ``with`` body are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink: code inside the block may rebind
        # sys.stdout, and we must close *our* handle, not whatever is current.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore before closing so sys.stdout never points at a closed file.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [6]:
"""Extract Sequence-Embeddings and Scores """

e_type = "blosum80"
dataset = "GRB2_HUMAN_Faure_2021"

# Build a list of (encoded sequence, label) pairs from the dataset CSV.
# Column 1 is read as the sequence and column 2 as its score.
import_data = f"../Data/Protein_Gym_Datasets/{dataset}.csv"
data = []
with open(import_data, "r") as infile:
    next(infile, None)  # skip the header row without loading the whole file
    for line in infile:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line)
        # Strip only the newline: slicing off the last character would truncate
        # the final field whenever the file does not end with a newline.
        fields = line.rstrip("\n").split(",")
        sequence = fields[1]
        # NOTE(review): the label is kept as the raw CSV string; presumably the
        # predictor converts it to a number downstream - confirm.
        label = fields[2]
        representation = ge.generate_sequence_encodings(e_type, [sequence])[0]
        data.append((representation, label))

In [7]:
# random.shuffle(data)
# x_train = [xy[0] for xy in data[:int(len(data) * 0.80)]]
# y_train = [xy[1] for xy in data[:int(len(data) * 0.80)]]
#
# x_val = [xy[0] for xy in data[int(len(data) * 0.80):int(len(data) * 0.90)]]
# y_val = [xy[1] for xy in data[int(len(data) * 0.80):int(len(data) * 0.90)]]
#
# x_test = [xy[0] for xy in data[int(len(data) * 0.90):]]
# y_test = [xy[1] for xy in data[int(len(data) * 0.90):]]

In [8]:
# """To Do:
# Write a training script for XGBoost (regressor).
# Pass the hyperparameters as command-line arguments, e.g. via sys.argv[].
# Return and print R2 and RMSE.
# Command: python xgboost_train.py <max_depth> <eta> <n_esti> ... --> RMSE = 0.18
# Next: use Optuna to run the script xgboost_train.py.
#
# """
#
# xgb_hyperCV = {
#     "max_depth": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
#     "eta": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
#     "n_estimators": [50, 100, 200, 300, 400, 500],
#     "reg_alpha": [0.0, 0.1, 0.5, 1, 1.5],
#     "reg_lambda": [0.0, 0.1, 0.5, 1, 1.5],
#     # "min_samples_split": [2, 3, 4, 5],
#     # "min_samples_leaf": [1, 2, 3, 4, 5],
#     # "bootstrap": [False, True]
# }

In [9]:
# Regressor type and number of cross-validation folds.
model = "xgboost"
cv_folds = 5

# Separate the (representation, label) pairs into parallel lists.
x_arr = [features for features, _ in data]
y_arr = [target for _, target in data]



In [10]:
def train_with_params(model: Literal["rf", "xgboost", "gxboost_rf", "lightgbm", "svr", "adaboost"], x_arr, y_arr,
                      params: dict,
                      early_stopping: Optional[int] = False, seed: Optional[int] = None,
                      k_folds: Optional[int] = None) -> "tuple[float, float]":
    """Cross-validate one hyperparameter configuration and report its scores.

    Builds a ``pm.ActivityPredictor`` (with ``shuffle_data=False``), trains it
    with k-fold cross-validation while stdout is silenced, then prints and
    returns the performance.

    Args:
        model: Model type forwarded to ``pm.ActivityPredictor`` as ``model_type``.
            NOTE(review): "gxboost_rf" looks like a typo for "xgboost_rf" -
            confirm against the identifiers ``pm`` accepts before changing it.
        x_arr: Input features, forwarded unchanged.
        y_arr: Target values, forwarded unchanged.
        params: Hyperparameters, forwarded unchanged.
        early_stopping: Forwarded unchanged; defaults to ``False``.
        seed: Forwarded unchanged; defaults to ``None``. (The previous default
            expression ``random.seed()`` also evaluated to ``None`` but reseeded
            the global RNG as a side effect when the function was defined.)
        k_folds: Number of cross-validation folds. When ``None`` the
            notebook-level ``cv_folds`` is used.

    Returns:
        The value of ``regressor.get_performance()``, indexed as (R2, RMSE).
    """
    if k_folds is None:
        k_folds = cv_folds  # fall back to the notebook-level setting

    regressor = pm.ActivityPredictor(model_type=model, x_arr=x_arr, y_arr=y_arr, params=params,
                                     early_stopping=early_stopping, shuffle_data=False,
                                     seed=seed)

    # Training output is suppressed; only the summary below is shown.
    with HiddenPrints():
        regressor.train(k_folds=k_folds)
        performance = regressor.get_performance()
    print('------------------------------------------------')
    print(f"R2: {round(performance[0],4)}, RMSE: {round(performance[1],4)}")
    return performance  # performance: (R2, RMSE)


In [11]:
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: sample hyperparameters, cross-validate, return the RMSE.

    Reads the notebook-level ``model``, ``x_arr`` and ``y_arr`` and passes a
    fixed ``seed`` of 42 to ``train_with_params``.

    Args:
        trial: Trial object used to sample the hyperparameters.

    Returns:
        The RMSE (second element of the performance tuple) rounded to 4 decimals.
    """
    seed = 42
    # NOTE(review): the 'eta' range starts at 0, so a zero learning rate can be
    # sampled - consider a small positive lower bound.
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 10),
        'subsample': trial.suggest_float('subsample', 0.2, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),
        'eta': trial.suggest_float('eta', 0, 0.3),
    }
    # params['num_boost_round'] = trial.suggest_int('num_boost_round', 100, 1000)

    # Early-stopping patience is a tenth of the boosting rounds when that
    # parameter is being tuned, otherwise a fixed 10.
    num_boost_round = params.get('num_boost_round')
    early_stopping = 10 if num_boost_round is None else int(num_boost_round / 10)

    results = train_with_params(model, x_arr, y_arr, params, early_stopping=early_stopping, seed=seed)
    return round(results[1], 4)  # results: (R2, RMSE)



In [12]:
run_id = "NOD-Test"
# The recorded log shows roughly 40 s per trial, so the full run takes many
# hours; lower n_trials for a quick check.
n_trials = 1000
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is Optuna's default sampler, so only the seeding changes.
sampler_seed = 42

study = optuna.create_study(
    direction="minimize",
    study_name=f"{model}_{e_type}_{dataset}",
    sampler=optuna.samplers.TPESampler(seed=sampler_seed),
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

print(f"Best hyperparameters for run {run_id}:", study.best_trial.params)


[I 2025-03-14 17:31:56,653] A new study created in memory with name: xgboost_blosum80_GRB2_HUMAN_Faure_2021


  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------------------------
R2: 0.6436, RMSE: 0.2951
[I 2025-03-14 17:32:37,095] Trial 0 finished with value: 0.2951 and parameters: {'max_depth': 6, 'min_child_weight': 2.2471817434203123, 'subsample': 0.46726925890814136, 'colsample_bytree': 0.24724043483606106, 'eta': 0.2115681845371705}. Best is trial 0 with value: 0.2951.
------------------------------------------------
R2: 0.6141, RMSE: 0.3071
[I 2025-03-14 17:33:17,002] Trial 1 finished with value: 0.3071 and parameters: {'max_depth': 7, 'min_child_weight': 7.52978492958201, 'subsample': 0.3113768532382644, 'colsample_bytree': 0.44405771065249766, 'eta': 0.13967962495363148}. Best is trial 0 with value: 0.2951.
------------------------------------------------
R2: 0.6259, RMSE: 0.3023
[I 2025-03-14 17:33:54,435] Trial 2 finished with value: 0.3023 and parameters: {'max_depth': 8, 'min_child_weight': 5.4893303326066745, 'subsample': 0.2992129831470769, 'colsample_bytree': 0.6375390618048797, 'eta': 0.147478174