In [1]:
from processing_utils import DataPrepare
from studu_utils import load_best_global, save_best_global, plot_feature_importance

import polars as pl
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import numpy as np
import matplotlib.pyplot as plt
import optuna
from catboost import CatBoostRegressor, Pool, cv

In [27]:
df: pl.DataFrame = pl.read_parquet(r'C:\Users\310\Desktop\Progects_Py\data\microstructure_price_prediction_data\cross_section\df_from_2024-06-15_to_2024-07-15_with_delta_0-05-00.parquet')
target_var: str = "log_return"
cols_to_exclude: list[str] = []
# File to store the best global results
BEST_GLOBAL_FILE = "best_global_CBR.json"
cat_features = ['currency_pair']
group_id = ... #should be a name of column containing info about crossection ID

In [29]:
data = DataPrepare(df)

In [30]:
data.train_test_split(train_test_ratio=0.7, cols_to_exclude=None)
X_train, y_train, X_test, y_test, = data.X_y_split(target_var=target_var, target_encode=False, to_pandas=True)

Train test ratio is 0.7
Train len for DOGEUSDT is 6048
Test len for DOGEUSDT is 2592
Train len for AVAXUSDT is 6048
Test len for AVAXUSDT is 2592


In [6]:
# Initialize best global results
best_global = load_best_global(BEST_GLOBAL_FILE)

In [35]:
def objective(trial, X_train, y_train, features):
    # Store the features in trial user attributes
    trial.set_user_attr("features", features)

    # Define hyperparameter search space
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 10),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "loss_function": trial.suggest_categorical("loss_function", ["YetiRank", "YetiRankPairwise"]),
        "random_seed": 42,
        "logging_level": "Silent"  # Suppress internal logs during CV
    }

    # Use CatBoost Pool to define training data with group_id for ranking
    train_pool = Pool(data=X_train, label=y_train, group_id=group_id, cat_features=cat_features)


    # Perform cross-validation using CatBoost's cv method
    cv_results = cv(
        pool=train_pool,
        params=params,
        fold_count=2,  # Number of cross-validation folds
        partition_random_seed=42,
        shuffle=True,
        verbose=False
    )

    # Extract the best cross-validated metric (NDCG loss approximated by the chosen ranking loss function)
    mean_ndcg_loss = np.min(cv_results["test-{}-mean".format(params["loss_function"])])

    return mean_ndcg_loss

In [8]:
def optimize_with_dataset(X_train, y_train, study_name=None):
    global best_global

    features = list(X_train.columns)

    if study_name:
        pass
    if features == best_global["features"]:
        study_name = best_global["study_name"]
    else:
        study_name = datetime.now().strftime("study_%Y-%m-%d_%H-%M-%S")
    

    # Create a new study
    study = optuna.create_study(
        study_name=study_name,
        storage="sqlite:///optuna_study_CB.db",
        direction="minimize",
        load_if_exists=True
    )

    # Run optimization
    study.optimize(lambda trial: objective(trial, X_train, y_train, features), n_trials=1)

    # Update the global best result if the current study has a better score
    if study.best_value < best_global["best_value"]:
        best_global.update(
            {
                "study_name": study_name,
                "best_value": study.best_value,
                "best_params": study.best_params,
                "features": features,
            }
        )
        # Save the updated best global results to disk
        save_best_global(best_global, BEST_GLOBAL_FILE)

    print(f"Study: {study_name}")
    print(f"Best Value: {study.best_value}")
    print(f"Best Params: {study.best_params}")

    return study


In [36]:
study = optimize_with_dataset(X_train, y_train)

[I 2024-12-13 20:41:52,786] A new study created in RDB with name: study_2024-12-13_20-41-52
[W 2024-12-13 21:01:47,970] Trial 0 failed with parameters: {'iterations': 1776, 'learning_rate': 0.208793554942058, 'depth': 6, 'l2_leaf_reg': 5.0000970178822515, 'bagging_temperature': 1.5181982342643963, 'colsample_bylevel': 0.8028092640694854, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 63, 'loss_function': 'YetiRank'} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\310\AppData\Local\pypoetry\Cache\virtualenvs\microstructure-price-prediction-p_py7spM-py3.12\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\310\AppData\Local\Temp\ipykernel_16764\3390006290.py", line 23, in <lambda>
    study.optimize(lambda trial: objective(trial, X_train, y_train, features), n_trials=1)
                                 ^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 