In [25]:
from processing_utils import DataPrepare
from studu_utils import load_best_global, save_best_global, plot_feature_importance, permutation_feature_importance, plot_feature_premutation_importance, plot_predictions


import polars as pl
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import numpy as np
import matplotlib.pyplot as plt
import optuna
from catboost import CatBoostRegressor, Pool, cv

In [26]:
df: pl.DataFrame = pl.read_parquet(r'C:\Users\310\Desktop\Progects_Py\data\microstructure_price_prediction_data\dfs\2024-06-29 20-00-00_2024-07-01 00-00-00_delta_0-00-10_return_5_sec.parquet')
target_var: str = "log_return"
cols_to_exclude: list[str] = []
# File to store the best global results
BEST_GLOBAL_FILE = "best_global_CBR.json"
cat_features = ['currency_pair']
group_id =  "cross_section_id" #should be a name of column containing info about crossection ID

In [27]:
data = DataPrepare(df)

In [29]:
data.train_test_split(train_test_ratio=0.7, exclude_columns=None)
X_train, y_train, X_test, y_test, = data.X_y_split(target_var=target_var, target_encode=False, to_pandas=True)

Train test ratio is 0.7
Train len for AVAXUSDT is 5763
Test len for AVAXUSDT is 2470
Train len for DOGEUSDT is 6664
Test len for DOGEUSDT is 2856


In [30]:
# Initialize best global results
best_global = load_best_global(BEST_GLOBAL_FILE)

In [35]:
def objective(trial, X_train, y_train, features):
    # Store the features in trial user attributes
    trial.set_user_attr("features", features)

    # Define hyperparameter search space
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 10),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "loss_function": trial.suggest_categorical("loss_function", ["YetiRank", "YetiRankPairwise"]),
        "random_seed": 42,
        "logging_level": "Silent"  # Suppress internal logs during CV
    }

    # Sort the dataset by group_id
    sorted_data = X_train.sort_values(by=group_id)
    sorted_labels = y_train.loc[sorted_data.index]
    sorted_group_id = sorted_data[group_id].astype('category').cat.codes  # Convert to integer group IDs
    sorted_data = sorted_data.drop(columns=[group_id])

    # Use CatBoost Pool to define training data with group_id for ranking
    train_pool = Pool(
        data=sorted_data,
        label=sorted_labels,
        group_id=sorted_group_id,
        cat_features=cat_features
    )

    # Perform cross-validation using CatBoost's cv method
    cv_results = cv(
        pool=train_pool,
        params=params,
        fold_count=2,  # Number of cross-validation folds
        partition_random_seed=42,
        shuffle=True,
        verbose=False
    )
    print("Available keys in cv_results:", cv_results.keys())

    # Extract the best cross-validated metric
    mean_ndcg_loss = np.min(cv_results["test-NDCG-mean"])  # Use actual metric key from cv_results

    return mean_ndcg_loss

In [32]:
def optimize_with_dataset(X_train, y_train, study_name=None):
    global best_global

    features = list(X_train.columns)

    if study_name:
        pass
    if features == best_global["features"]:
        study_name = best_global["study_name"]
    else:
        study_name = datetime.now().strftime("study_%Y-%m-%d_%H-%M-%S")
    

    # Create a new study
    study = optuna.create_study(
        study_name=study_name,
        storage="sqlite:///optuna_study_CB.db",
        direction="minimize",
        load_if_exists=True
    )

    # Run optimization
    study.optimize(lambda trial: objective(trial, X_train, y_train, features), n_trials=1)

    # Update the global best result if the current study has a better score
    if study.best_value < best_global["best_value"]:
        best_global.update(
            {
                "study_name": study_name,
                "best_value": study.best_value,
                "best_params": study.best_params,
                "features": features,
            }
        )
        # Save the updated best global results to disk
        save_best_global(best_global, BEST_GLOBAL_FILE)

    print(f"Study: {study_name}")
    print(f"Best Value: {study.best_value}")
    print(f"Best Params: {study.best_params}")

    return study


In [37]:
study = optimize_with_dataset(X_train, y_train)

[I 2024-12-19 21:22:42,314] A new study created in RDB with name: study_2024-12-19_21-22-42
[W 2024-12-19 21:25:58,577] Trial 0 failed with parameters: {'iterations': 652, 'learning_rate': 0.030252425054006035, 'depth': 10, 'l2_leaf_reg': 2.3497368394827896, 'bagging_temperature': 3.626643589950932, 'colsample_bylevel': 0.7755915470560397, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 33, 'loss_function': 'YetiRank'} because of the following error: KeyError('test-NDCG-mean').
Traceback (most recent call last):
  File "c:\Users\310\AppData\Local\pypoetry\Cache\virtualenvs\microstructure-price-prediction-p_py7spM-py3.12\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in 

Available keys in cv_results: Index(['iterations', 'test-NDCG:type=Base-mean', 'test-NDCG:type=Base-std'], dtype='object')


KeyError: 'test-NDCG-mean'

In [34]:
print(cv_results.keys())

NameError: name 'cv_results' is not defined