In [1]:
import os
import joblib
import gc
from pprint import pprint

import pandas as pd
import numpy as np

from optiver_trading_at_the_close.feature_engineering import FE
from optiver_trading_at_the_close.nearest_neightbors_features import NearestNeighborsFeatures
from optiver_trading_at_the_close.column_selector import ColumnSelector
from optiver_trading_at_the_close.memory_reduction import MemoryReduction
from optiver_trading_at_the_close.mean_regressor_ensemble import MeanRegressorEnsemble

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import optuna

## Hard-Coded Variables

In [2]:
# Experiment identity; both values are embedded in the study path below.
VERSION = 'V12'
# Boosting backend to tune. The rest of the notebook branches on
# 'lightgbm', 'catboost' and 'xgboost'.
MODEL = 'lightgbm'

# Input CSV, relative to the notebook's working directory.
DATA_PATH = './../data/train.csv'
# Location of the pickled Optuna study, namespaced by version and backend.
STUDY_PATH = f'./../output/hyperparameter_tunning/{VERSION}/{MODEL}/study.pkl'

## Read Data

In [3]:
# Load the whole CSV at DATA_PATH into a single in-memory DataFrame.
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


## Feature Engineering

In [5]:
# Rows with a missing target are removed before splitting.
df = df.dropna(axis=0, subset=['target'])

# Hold-out split on date_id: ids up to and including 420 form the training
# part, every later id goes to the test part.
in_train = df['date_id'] <= 420
in_test = df['date_id'] > 420

# The 'target' column is left inside X_train / X_test at this point; the
# ColumnSelector step of the pipeline lists it in cols_to_drop.
X_train = df[in_train]
y_train = X_train['target']

X_test = df[in_test]
y_test = X_test['target']

In [6]:
# The full frame is not used by name after the split: drop the reference and
# ask the garbage collector to run (its return value is the cell output).
del df 
gc.collect()

0

In [7]:
# Columns handed to ColumnSelector as cols_to_drop.
dropped_columns = ['time_id', 'row_id', 'date_id', 'target']

# Feature steps followed by a final estimator that is deliberately tiny
# (one LightGBM tree of depth 2). The next cell applies only the steps
# before that estimator.
pipeline = Pipeline(
    steps=[
        ('fe', FE()),
        # Nearest-neighbours features, currently switched off:
        # ('nn_features', NearestNeighborsFeatures(
        #     features_to_use_for_distance_computation=['seconds_in_bucket', 'wap', 'bid_plus_ask_sizes', 'bid_ask_size_imb'],
        #     get_target=True,
        #     features_get=['wap', 'bid_ask_size_imb'],
        #     n_neighbors=[40],
        #     metrics=['l1'],
        #     n_jobs=-1
        # )),
        ('column_selector', ColumnSelector(cols_to_drop=dropped_columns)),
        ('memore_reduction', MemoryReduction()),
        (
            'mean_regressor_ensemble',
            MeanRegressorEnsemble([lgb.LGBMRegressor(n_estimators=1, max_depth=2)]),
        ),
    ]
)

pipeline.fit(X_train, y_train)

100%|██████████| 1/1 [04:13<00:00, 253.13s/it]


In [8]:
# Push both splits through every pipeline step except the last one (the
# estimator), one step at a time, so the tuning code receives the
# transformed features.
# NOTE: X_train / X_test are rebound here, so this cell must run only once.
for _step_name, transformer in pipeline.steps[:-1]:
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)

In [9]:
# Wrap the transformed features in the dataset container of the selected
# backend. Nothing is done for 'lightgbm': its branch of the objective fits
# directly on X_train / y_train.
# NOTE: X_train / X_test are rebound here, so this cell must run exactly once
# after the transform cell.
if MODEL == 'catboost':
    # Declared once so the train and test pools cannot drift apart.
    CAT_FEATURES = ['stock_id', 'imbalance_buy_sell_flag', 'dow', 'dom', 'seconds', 'minute']

    X_train = cb.Pool(X_train, y_train, cat_features=CAT_FEATURES)
    X_test = cb.Pool(X_test, y_test, cat_features=CAT_FEATURES)

elif MODEL == 'xgboost':
    X_train = xgb.QuantileDMatrix(data=X_train, label=y_train)

    # ref= makes the hold-out matrix reuse the quantile cuts of the training
    # matrix. Without it the test data is quantised independently, which the
    # XGBoost documentation warns against for validation/test sets.
    X_test = xgb.QuantileDMatrix(data=X_test, label=y_test, ref=X_train)

## Hyperparameter Tuning

### Objective function

In [10]:
def objective_train_test(trial, X_train, y_train, X_test, y_test):
    """Train one candidate model and score it on the hold-out split.

    The backend is chosen by the module-level ``MODEL`` constant
    ('lightgbm', 'xgboost' or 'catboost'); the hyperparameters are sampled
    from ``trial``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training data. The 'lightgbm' branch passes it to
            ``LGBMRegressor.fit`` together with ``y_train``. The 'xgboost'
            and 'catboost' branches pass it alone to ``xgb.train`` /
            ``CatBoostRegressor.fit``, so it is expected to be a dataset
            object that already carries the labels.
        y_train: Training labels; only read by the 'lightgbm' branch.
        X_test: Hold-out data, passed to ``model.predict``.
        y_test: Hold-out labels the predictions are compared against.

    Returns:
        Mean absolute error of the fitted model on the hold-out split.

    Raises:
        ValueError: If ``MODEL`` is not one of the supported backends.
    """
    if MODEL == 'lightgbm':
        param = {
            'metric': 'mae',
            'boosting_type': 'gbdt',

            'n_estimators': trial.suggest_int('n_estimators', 1000, 1800, step=100),
            'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, step=0.0001),
            'max_depth': trial.suggest_int('max_depth', 3, 12, step=1),
            'min_child_samples': trial.suggest_int('min_child_samples', 100, 10000, step=100),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10, step=0.05),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10, step=0.05),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
            # LightGBM only applies row subsampling when the frequency is
            # non-zero; with the default of 0 the tuned 'subsample' value
            # would have no effect.
            'subsample_freq': 1,

            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }

        model = lgb.LGBMRegressor(**param)
        model.fit(X_train, y_train)

    elif MODEL == 'xgboost':
        # Trained on squared error, scored below with MAE.
        param = {
            'objective': 'reg:squarederror',
            'tree_method': 'hist',
            'booster': 'gbtree',

            'max_depth': trial.suggest_int('max_depth', 2, 12, step=1),
            'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.05, step=0.0001),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10, step=0.1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10, step=0.1),

            'random_state': 42,
            'n_jobs': -1,
        }

        # The native training API takes the number of trees as a separate
        # argument, so it is sampled outside the parameter dict.
        num_boost_round = trial.suggest_int('n_estimators', 100, 2500, step=100)

        model = xgb.train(param, X_train, num_boost_round=num_boost_round)

    elif MODEL == 'catboost':
        param = {
            'loss_function': 'MAE',

            'n_estimators': trial.suggest_int('n_estimators', 100, 2500, step=100),
            'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.15, step=0.0001),
            'max_depth': trial.suggest_int('max_depth', 3, 8, step=1),
            'min_child_samples': trial.suggest_int('min_child_samples', 100, 10000, step=100),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10, step=0.05),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),

            'random_state': 42,
            'thread_count': -1,
            'verbose': False
        }

        model = cb.CatBoostRegressor(**param)
        model.fit(X_train)

    else:
        # Previously an unknown backend fell through every branch and failed
        # later with a NameError on `model`.
        raise ValueError(
            f"Unsupported MODEL {MODEL!r}; expected 'lightgbm', 'xgboost' or 'catboost'."
        )

    return mean_absolute_error(y_test, model.predict(X_test))

### Read/Create study

In [11]:
# Resume the study stored at STUDY_PATH when there is one; otherwise start a
# fresh in-memory study and make sure its output directory exists.
if os.path.exists(STUDY_PATH):
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load study files produced by this notebook.
    study = joblib.load(STUDY_PATH)
else:
    study_dir = os.path.dirname(STUDY_PATH)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if study_dir:
        os.makedirs(study_dir, exist_ok=True)

    study = optuna.create_study(
        direction="minimize",
        # Seeded so a fresh study proposes the same sequence of trials on
        # every run (matches the random_state=42 used for the models).
        sampler=optuna.samplers.TPESampler(seed=42),
    )

[I 2023-11-29 17:05:58,939] A new study created in memory with name: no-name-51ea64e4-0653-4f60-9e58-03811b5fc2f3


### Do hyperparameter tuning

In [28]:
def _checkpoint_study(study, trial):
    """Write the study to STUDY_PATH each time Optuna invokes the callback.

    Trials can take hours, so saving only after ``optimize`` returns would
    lose every finished trial if the kernel is interrupted.
    """
    joblib.dump(study, STUDY_PATH)


# Run trials sequentially on the train/hold-out objective until the
# six-hour budget is used up.
study.optimize(
    lambda trial: objective_train_test(trial, X_train, y_train, X_test, y_test),
    timeout=6*60*60,
    n_jobs=1,
    callbacks=[_checkpoint_study],
)

[I 2023-12-01 08:40:51,360] Trial 14 finished with value: 8.164528164208877 and parameters: {'max_depth': 8, 'learning_rate': 0.0117, 'colsample_bytree': 0.9, 'subsample': 0.8500000000000001, 'reg_alpha': 7.300000000000001, 'reg_lambda': 0.0, 'n_estimators': 1100}. Best is trial 13 with value: 5.74528286033919.


### Save Study

In [29]:
# Persist the study object to STUDY_PATH so a later run can resume from it.
joblib.dump(study, STUDY_PATH)

['./../output/hyperparameter_tunning/V11/xgboost/study.pkl']

### Results

#### All Results

In [30]:
# One row per trial, ordered by objective value with the lowest
# (best, since the study minimises) first; show up to 50 of them.
results_df = study.trials_dataframe()
results_df = results_df.sort_values(by='value')
results_df.head(n=50)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
13,13,5.745283,2023-11-30 16:22:17.267830,2023-12-01 00:08:40.231979,0 days 07:46:22.964149,0.9,0.0011,9,1000,6.5,0.2,0.85,COMPLETE
12,12,5.79011,2023-11-30 15:42:16.638916,2023-11-30 16:22:17.248958,0 days 00:40:00.610042,1.0,0.0029,9,100,6.4,3.9,0.85,COMPLETE
11,11,5.821819,2023-11-30 14:12:26.404709,2023-11-30 15:42:16.621749,0 days 01:29:50.217040,1.0,0.0007,9,100,6.1,3.3,0.85,COMPLETE
10,10,5.831058,2023-11-30 13:14:27.915302,2023-11-30 14:12:26.384437,0 days 00:57:58.469135,1.0,0.0002,9,100,5.9,4.0,0.85,COMPLETE
9,9,5.976493,2023-11-30 11:43:37.928564,2023-11-30 13:14:27.897597,0 days 01:30:49.969033,0.75,0.015,6,500,1.4,8.6,0.65,COMPLETE
4,4,5.982617,2023-11-30 02:15:26.064093,2023-11-30 03:42:18.221779,0 days 01:26:52.157686,0.95,0.0063,6,800,4.3,7.3,0.7,COMPLETE
5,5,6.082872,2023-11-30 03:42:18.224931,2023-11-30 04:11:33.844306,0 days 00:29:15.619375,0.6,0.0374,6,400,3.0,9.8,0.55,COMPLETE
7,7,6.25251,2023-11-30 05:37:13.780039,2023-11-30 05:54:45.749647,0 days 00:17:31.969608,0.75,0.0488,6,200,1.0,7.6,1.0,COMPLETE
3,3,6.877654,2023-11-30 01:22:47.342882,2023-11-30 02:15:26.059075,0 days 00:52:38.716193,0.65,0.0491,3,1600,9.1,8.7,0.5,COMPLETE
1,1,7.386375,2023-11-29 19:05:30.995341,2023-11-29 21:15:10.298134,0 days 02:09:39.302793,0.7,0.027,4,2200,9.8,6.2,1.0,COMPLETE


#### Best Params 

In [31]:
# Hyperparameters of the best trial recorded in the study.
study.best_params

{'max_depth': 9,
 'learning_rate': 0.0011,
 'colsample_bytree': 0.9,
 'subsample': 0.8500000000000001,
 'reg_alpha': 6.5,
 'reg_lambda': 0.2,
 'n_estimators': 1000}

#### Plots

In [32]:
# Optuna's optimization-history plot for this study.
optuna.visualization.plot_optimization_history(study)

In [33]:
# Optuna's slice plot for this study.
optuna.visualization.plot_slice(study)

In [34]:
# Optuna's parallel-coordinate plot for this study.
optuna.visualization.plot_parallel_coordinate(study)

In [35]:
# Optuna's hyperparameter-importance plot for this study.
optuna.visualization.plot_param_importances(study)