In [1]:
import os
import joblib
import uuid

import pandas as pd

from optiver_trading_at_the_close.feature_engineering import FE
from optiver_trading_at_the_close.nearest_neightbors_features import NearestNeighborsFeatures
from optiver_trading_at_the_close.column_selector import ColumnSelector
from optiver_trading_at_the_close.memory_reduction import MemoryReduction
from optiver_trading_at_the_close.mean_regressor_ensemble import MeanRegressorEnsemble

from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
import lightgbm as lgb
import catboost as cb

## Hard-Coded Variables

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
DATA_PATH = './../data/train.csv'

# Tag for this training run; it is embedded in the folder where models are saved.
VERSION = 'V14'

# Model family to train. Must be one of the keys of HYPERPARAMETERS_BY_MODEL.
MODEL = 'lightgbm'

# Hyperparameter sets per supported model family. Each dict in a list is one
# set of keyword arguments for a single regressor, so a list with several
# dicts trains several estimators of the same family.
# Keeping both families here means switching models only requires changing
# MODEL instead of commenting/uncommenting whole blocks.
HYPERPARAMETERS_BY_MODEL = {
    'lightgbm': [
        {
            'objective': 'mae',

            'boosting_type': 'gbdt',
            'max_bin': 256,
            'learning_rate': 0.015,
            'max_depth': 12,
            'n_estimators': 1400,
            'num_leaves': 300,
            'reg_alpha': 0.005,
            'reg_lambda': 0.001,
            'colsample_bytree': 0.6,
            'subsample': 0.875,
            'min_child_samples': 128,

            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }
    ],
    'catboost': [
        {
            'loss_function': 'MAE',

            'n_estimators': 1800,
            'learning_rate': 0.1248,
            'max_depth': 3,
            'min_child_samples': 5500,
            'reg_lambda': 8.45,
            'subsample': 0.7,

            'random_state': 42,
            'thread_count': -1,
            'verbose': False
        }
    ],
}

# Fail fast on a typo in MODEL rather than after the data has been loaded.
if MODEL not in HYPERPARAMETERS_BY_MODEL:
    raise ValueError(
        f'Unknown MODEL {MODEL!r}. Valid models are {sorted(HYPERPARAMETERS_BY_MODEL)}.'
    )

# Hyperparameter sets actually used by this run.
HYPERPARAMETERS = HYPERPARAMETERS_BY_MODEL[MODEL]

# Output folder for the serialized estimators of this version/model combination.
SAVE_MODEL_PATH_FOLDER = f'./../output/models/{VERSION}/{MODEL}/'

## Read Data

In [3]:
# Load the training data from the CSV file at DATA_PATH into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


## Feature Engineering

In [5]:
# Keep only the rows that have a target value; rows with a missing target
# cannot be used for supervised training.
df = df.loc[df['target'].notna()]

# Regression target, row-aligned with the filtered frame.
y = df.loc[:, 'target']

## Fit Model

In [6]:
def create_model(model, hyperparamters):
    """Build an unfitted regressor of the requested family.

    Parameters
    ----------
    model : str
        Name of the model family: 'lightgbm' or 'catboost'.
    hyperparamters : dict
        Keyword arguments forwarded unchanged to the regressor constructor.

    Returns
    -------
    lgb.LGBMRegressor or cb.CatBoostRegressor
        A new estimator configured with the given hyperparameters.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names.
    """
    # Reject unsupported names first so the happy path below stays flat.
    if model not in ('lightgbm', 'catboost'):
        raise ValueError(f'Not valid model. Valid models are \'lightgbm\', \'catboost\'.')

    # Resolve the class lazily so only the requested library is touched.
    regressor_cls = lgb.LGBMRegressor if model == 'lightgbm' else cb.CatBoostRegressor
    return regressor_cls(**hyperparamters)

In [7]:
# One unfitted regressor per hyperparameter set; all of them are handed to the
# ensemble step at the end of the pipeline.
estimators = [create_model(MODEL, params) for params in HYPERPARAMETERS]

# Ordered pipeline steps. `df` is passed in whole (it still contains `target`),
# and 'target' is among the columns listed in cols_to_drop below.
pipeline_steps = [
    ('fe', FE()),
    # Optional nearest-neighbours feature step, currently disabled:
    # ('nn_features', NearestNeighborsFeatures(
    #     features_to_use_for_distance_computation=['seconds_in_bucket', 'wap', 'bid_plus_ask_sizes', 'bid_ask_size_imb'],
    #     get_target=True,
    #     features_get=['wap', 'bid_ask_size_imb'],
    #     n_neighbors=[40],
    #     metrics=['l1'],
    #     n_jobs=-1
    # )),
    ('column_selector', ColumnSelector(cols_to_drop=['time_id', 'row_id', 'date_id', 'target'])),
    ('memore_reduction', MemoryReduction()),
    ('mean_regressor_ensemble', MeanRegressorEnsemble(estimators)),
]
pipeline = Pipeline(steps=pipeline_steps)

# NOTE(review): an earlier variant called fit with
# `mean_regressor_ensemble__fit_estimators=False`; its semantics live in
# MeanRegressorEnsemble - confirm before reusing.

# Extra keyword arguments for fit. The `<step>__<param>` prefix makes sklearn
# route the argument to that step's fit method.
fit_params = {}
if MODEL == 'catboost':
    fit_params['mean_regressor_ensemble__cat_features'] = [
        'stock_id', 'imbalance_buy_sell_flag', 'dow', 'dom', 'seconds', 'minute'
    ]

# Only the two supported model families trigger a fit.
if MODEL in ('lightgbm', 'catboost'):
    pipeline.fit(df, y, **fit_params)

100%|██████████| 1/1 [1:09:08<00:00, 4148.97s/it]


## Save Model

In [8]:
# Create the output folder once, before the loop; exist_ok=True makes re-runs safe.
os.makedirs(SAVE_MODEL_PATH_FOLDER, exist_ok=True)

# Serialize every estimator held by the ensemble step. Each file gets a random
# UUID suffix, so repeated runs add new files instead of overwriting old ones.
for estimator in pipeline['mean_regressor_ensemble'].estimators:
    model_file = os.path.join(SAVE_MODEL_PATH_FOLDER, f'{MODEL}-{uuid.uuid4()}.joblib')
    joblib.dump(estimator, model_file)

In [9]:
# DATA_PATH = './../data/train.csv'

# VERSION = 'V9_use_cv'

# MODEL = 'lightgbm'
# HYPERPARAMETERS = [
#     {
#         'objective': 'mae',
        
#         'boosting_type': 'gbdt',
#         'max_bin': 256,
#         'learning_rate': 0.015,
#         'max_depth': 12,
#         'n_estimators': 1400,
#         'num_leaves': 300,
#         'reg_alpha': 0.005,
#         'reg_lambda': 0.001,
#         'colsample_bytree': 0.6,
#         'subsample': 0.875,
#         'min_child_samples': 128,
        
#         'random_state': 42,
#         'n_jobs':-1,
#         'verbose':-1
#     }
# ]
# SAVE_MODEL_PATH_FOLDER = f'./../output/models/{VERSION}/lightgbm/'
# SAVE_MODEL_BASE_NAME = 'lightgbm'

In [10]:
# estimators = [create_model(MODEL, hyperparameter) for hyperparameter in HYPERPARAMETERS]

# pipeline = Pipeline(steps=[
#     ('fe', FE()),
#     # ('nn_features', NearestNeighborsFeatures(
#     #     features_to_use_for_distance_computation=['seconds_in_bucket', 'wap', 'bid_plus_ask_sizes', 'bid_ask_size_imb'],
#     #     get_target=True,
#     #     features_get=['wap', 'bid_ask_size_imb'],
#     #     n_neighbors=[40],
#     #     metrics=['l1'],
#     #     n_jobs=-1
#     # )),
#     ('column_selector', ColumnSelector(cols_to_drop=['time_id', 'row_id', 'date_id', 'target'])),
#     ('memore_reduction', MemoryReduction()),
#     ('mean_regressor_ensemble', MeanRegressorEnsemble(estimators))
# ])

# # pipeline.fit(X_train, y_train, mean_regressor_ensemble__fit_estimators=False)
# pipeline.fit(df, y, mean_regressor_ensemble__nr_cv_folds=10)

In [11]:
# for estimator in pipeline['mean_regressor_ensemble'].estimators:
#     os.makedirs(SAVE_MODEL_PATH_FOLDER, exist_ok=True)
#     joblib.dump(estimator, os.path.join(SAVE_MODEL_PATH_FOLDER, f'{SAVE_MODEL_BASE_NAME}-{str(uuid.uuid4())}.joblib'))