In [34]:
from processing_utils import train_test_split, target_encoding, X_y_split, DataPrepare

import json
import polars as pl
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from optuna.storages import RDBStorage
import matplotlib.pyplot as plt
import plotly
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

from optuna.visualization import plot_optimization_history, plot_parallel_coordinate
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from dash import Dash, html, dcc
import plotly.express as px
import optuna


In [33]:
df: pl.DataFrame = pl.read_parquet(r'C:\Users\310\Desktop\Progects_Py\data\microstructure_price_prediction_data\cross_section\df_cross_section_V0.1_.parquet')
target_var: str = "target_five_step_ahead"
cols_to_exclude = ['trade_time', 'is_buyer_maker', 'date',]
# File to store the best global results
BEST_GLOBAL_FILE = "best_global.json"

In [7]:
data = DataPrepare(df)

In [8]:
data.train_test_split(train_test_ratio=0.01, cols_to_exclude=cols_to_exclude)
X_train, y_train, X_test, y_test, = data.X_y_split(target_var=target_var, target_encode=True)

Train test ratio is 0.01
Train len for AVAX-USDT is 13536
Test len for AVAX-USDT is 1340121
Train len for DOGE-USDT is 32493
Test len for DOGE-USDT is 3216827


In [35]:
# Load global best results from disk if they exist
def load_best_global():
    try:
        with open(BEST_GLOBAL_FILE, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"study_name": None, "best_value": float("inf"), "best_params": None, "features": None}
    
# Save global best results to disk
def save_best_global(best_global):
    with open(BEST_GLOBAL_FILE, "w") as f:
        json.dump(best_global, f, indent=4)

# Initialize best global results
best_global = load_best_global()

In [40]:
def objective(trial, X_train, y_train, features):
    # Store the features in trial user attributes
    trial.set_user_attr("features", features)

    # Hyperparameter space
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_weight_fraction_leaf = trial.suggest_float("min_weight_fraction_leaf", 0.0, 0.5)
    min_impurity_decrease = trial.suggest_float("min_impurity_decrease", 0.0, 0.5)

    # Define the RandomForestRegressor with trial parameters
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        min_impurity_decrease=min_impurity_decrease,
        random_state=42,
        n_jobs=-1,
    )

    # Use K-Fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = -cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kf)

    # Return the mean MSE score
    return np.mean(mse_scores)

# Function to plot feature importance
def plot_feature_importance(model, feature_names):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 6))
    plt.title("Feature Importances")
    plt.bar(range(len(importances)), importances[indices], align="center")
    plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


In [45]:
best_global

{'study_name': 'study_2024-11-24_20-55-03',
 'best_value': 102.65464869162608,
 'best_params': {'n_estimators': 274,
  'max_depth': 35,
  'min_samples_split': 16,
  'min_weight_fraction_leaf': 0.47962742547911763,
  'min_impurity_decrease': 0.46498596544548115},
 'features': ['price',
  'quantity',
  'quote',
  'last_ask',
  'last_bid',
  'target',
  'spread',
  'cum_quote',
  'minute_sin',
  'minute_cos',
  'hour_sin',
  'hour_cos',
  'day_sin',
  'day_cos',
  'week_sin',
  'week_cos',
  'month_sin',
  'month_cos',
  'year_sin',
  'year_cos',
  'target_lag_1',
  'price_lag_1',
  'target_lag_2',
  'price_lag_2',
  'target_lag_3',
  'price_lag_3',
  'target_lag_5',
  'price_lag_5',
  'target_lag_7',
  'price_lag_7',
  'symbol_mean']}

In [48]:
def optimize_with_dataset(X_train, y_train, study_name=None):
    global best_global

    features = list(X_train.columns)

    if study_name:
        pass
    if features == best_global["features"]:
        study_name = best_global["study_name"]
    else:
        study_name = datetime.now().strftime("study_%Y-%m-%d_%H-%M-%S")
    

    # Create a new study
    study = optuna.create_study(
        study_name=study_name,
        storage="sqlite:///optuna_study.db",
        direction="minimize",
        load_if_exists=True
    )

    # Run optimization
    study.optimize(lambda trial: objective(trial, X_train, y_train, features), n_trials=1)

    # Update the global best result if the current study has a better score
    if study.best_value < best_global["best_value"]:
        best_global.update(
            {
                "study_name": study_name,
                "best_value": study.best_value,
                "best_params": study.best_params,
                "features": features,
            }
        )
        # Save the updated best global results to disk
        save_best_global(best_global)

    # Print the best result for this study
    print(f"Study: {study_name}")
    print(f"Best Value: {study.best_value}")
    print(f"Best Params: {study.best_params}")

    # Return the study for further analysis if needed
    return study


In [49]:
optimize_with_dataset(X_train, y_train)

[I 2024-11-24 21:05:06,245] Using an existing study with name 'study_2024-11-24_20-55-03' instead of creating a new one.
[I 2024-11-24 21:06:17,820] Trial 1 finished with value: 0.004790861329879529 and parameters: {'n_estimators': 375, 'max_depth': 35, 'min_samples_split': 6, 'min_weight_fraction_leaf': 0.05844255098555712, 'min_impurity_decrease': 0.2738548387993085}. Best is trial 1 with value: 0.004790861329879529.


Study: study_2024-11-24_20-55-03
Best Value: 0.004790861329879529
Best Params: {'n_estimators': 375, 'max_depth': 35, 'min_samples_split': 6, 'min_weight_fraction_leaf': 0.05844255098555712, 'min_impurity_decrease': 0.2738548387993085}


<optuna.study.study.Study at 0x1e621424080>

In [31]:
optuna.create_study(
    study_name="study_name",
    storage="sqlite:///optuna_study.db",  # Initialize SQLite storage
    direction="minimize",
    load_if_exists=True,
)
print("Study initialized successfully.")
study = optuna.load_study(
    study_name="study_name",
    storage="sqlite:///optuna_study.db"
)
print(f"Study name: {study.study_name}")
print(f"Number of trials: {len(study.trials)}")

study.optimize(objective, n_trials=2, show_progress_bar=True)

# Print the best parameters
print("Best hyperparameters:", study.best_params)
print("Best MSE:", study.best_value)

# Visualize optimization progress
#optuna.visualization.plot_optimization_history(study).show()
#optuna.visualization.plot_parallel_coordinate(study).show()





[I 2024-11-24 20:36:34,019] Using an existing study with name 'study_name' instead of creating a new one.


Study initialized successfully.
Study name: study_name
Number of trials: 8


  0%|          | 0/2 [00:00<?, ?it/s]

[I 2024-11-24 20:36:34,252] Trial 8 finished with value: 2148.826437996637 and parameters: {'x': -46.36622087249118, 'y': -1}. Best is trial 2 with value: 0.004790859827390177.
[I 2024-11-24 20:36:34,350] Trial 9 finished with value: 140.24159110970777 and parameters: {'x': 11.884510554066068, 'y': -1}. Best is trial 2 with value: 0.004790859827390177.
Best hyperparameters: {'n_estimators': 304, 'max_depth': 14, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.14780216944703894, 'min_impurity_decrease': 0.4894995858813643}
Best MSE: 0.004790859827390177
