In [1]:
import gc
import copy
from tqdm import tqdm
import logging

import pandas as pd
import numpy as np

from optiver_trading_at_the_close.feature_engineering import FE
from optiver_trading_at_the_close.column_selector import ColumnSelector
from optiver_trading_at_the_close.memory_reduction import MemoryReduction
from optiver_trading_at_the_close.mean_regressor_ensemble import MeanRegressorEnsemble
from optiver_trading_at_the_close.tabnet_regressor_pandas_wrapper import TabNetRegressorPandasWrapper

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline

import lightgbm as lgb


## Hard-coded Variables

In [2]:
# Relative location of the training CSV that the data-loading cell reads.
DATA_PATH: str = './../data/train.csv'

## Read Data

In [3]:
# Load the whole training table into memory in one go.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [4]:
# Quick look at the first five rows (five is head()'s default).
df.head(n=5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


## Feature Engineering

In [5]:
# Discard rows whose target is missing before splitting.
df = df.dropna(subset=['target'], axis=0)

# Split on date_id: rows up to and including 420 form the training frame,
# rows strictly after 420 form the hold-out frame.
X_train, X_test = (
    df.loc[df['date_id'].le(420)],
    df.loc[df['date_id'].gt(420)],
)

# 'target' is deliberately left inside X_train / X_test at this point; the
# pipeline's ColumnSelector step lists it in cols_to_drop.
y_train, y_test = X_train['target'], X_test['target']

In [6]:
# Only the split frames are used from here on, so drop the name of the
# source frame and ask the garbage collector to reclaim what it can.
del df
gc.collect()

0

In [7]:
# Preprocessing steps, in the order they are applied.
preprocessing_steps = [
    ('fe', FE()),
    # Disabled experiment, left here for reference:
    # ('nn_features', NearestNeighborsFeatures(
    #     features_to_use_for_distance_computation=['seconds_in_bucket', 'wap', 'bid_plus_ask_sizes', 'bid_ask_size_imb'],
    #     get_target=True,
    #     features_get=['wap', 'bid_ask_size_imb'],
    #     n_neighbors=[40],
    #     metrics=['l1'],
    #     n_jobs=-1
    # )),
    ('column_selector', ColumnSelector(cols_to_drop=['time_id', 'row_id', 'date_id', 'target'])),
    ('memore_reduction', MemoryReduction()),
]

# Final estimator: a single depth-2 LightGBM tree wrapped in the ensemble.
# NOTE(review): presumably a cheap stand-in whose only job is to let
# Pipeline.fit() fit the preprocessing steps above -- confirm.
final_estimator = MeanRegressorEnsemble([lgb.LGBMRegressor(n_estimators=1, max_depth=2)])

pipeline = Pipeline(
    steps=preprocessing_steps + [('mean_regressor_ensemble', final_estimator)]
)

pipeline.fit(X_train, y_train)

100%|██████████| 1/1 [02:04<00:00, 124.41s/it]


In [8]:
# Run both frames through every fitted step except the final estimator.
# X_train / X_test are overwritten, so re-running this cell would feed
# already-transformed frames back into the first step.
for _step_name, transformer in pipeline.steps[:-1]:
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)

## TabNet

In [9]:
import os
import joblib
import uuid

class Callback:
    """
    Abstract base class used to build new callbacks.

    Every ``on_*`` hook below is a no-op; subclasses override the ones they
    care about. ``set_params`` and ``set_trainer`` simply store what they
    are given on the instance.
    """

    def __init__(self):
        """The base class has nothing to initialise."""

    def set_params(self, params):
        """Keep ``params`` on the instance as ``self.params``."""
        self.params = params

    def set_trainer(self, model):
        """Keep ``model`` on the instance as ``self.trainer``."""
        self.trainer = model

    def on_epoch_begin(self, epoch, logs=None):
        """Epoch-start hook; does nothing by default."""
        return None

    def on_epoch_end(self, epoch, logs=None):
        """Epoch-end hook; does nothing by default."""
        return None

    def on_batch_begin(self, batch, logs=None):
        """Batch-start hook; does nothing by default."""
        return None

    def on_batch_end(self, batch, logs=None):
        """Batch-end hook; does nothing by default."""
        return None

    def on_train_begin(self, logs=None):
        """Training-start hook; does nothing by default."""
        return None

    def on_train_end(self, logs=None):
        """Training-end hook; does nothing by default."""
        return None

class SaveModelCallback(Callback):
    """Callback that dumps the attached trainer to disk with joblib at the end of every epoch."""

    def __init__(self, folder_path, model_name) -> None:
        """
        Args:
            folder_path: directory that receives the dumps; it is created
                on the first epoch end if it does not exist yet.
            model_name: prefix of every dump's file name.
        """
        self.model_name = model_name
        self.folder_path = folder_path

    def on_epoch_end(self, epoch, logs=None):
        """Write ``self.trainer`` to ``<folder_path>/<model_name>-<epoch>-<uuid4>.joblib``."""
        os.makedirs(self.folder_path, exist_ok=True)
        # A fresh UUID in the name gives every dump a distinct file name.
        file_name = f'{self.model_name}-{epoch}-{str(uuid.uuid4())}.joblib'
        target_path = os.path.join(self.folder_path, file_name)
        joblib.dump(self.trainer, target_path)

In [10]:
# Sanity check: (rows, columns) of the training frame after preprocessing.
X_train.shape

(4577893, 483)

In [21]:
# Constructor settings for the TabNet wrapper. The trailing notes are the
# value ranges recorded alongside each hyperparameter.
tabnet_params = dict(
    cat_variables=['stock_id', 'dow', 'dom'],
    n_d=4,            # noted range: 8 to 64 (the value used here is below it)
    n_a=4,            # kept equal to n_d
    n_steps=5,        # between 3 and 10
    gamma=1.3,        # between 1 and 2
    n_independent=2,  # from 1 to 5
    n_shared=2,       # from 1 to 5
    seed=42,
    verbose=1,
)
model = TabNetRegressorPandasWrapper(**tabnet_params)

# NOTE(review): load_model_path presumably resumes from an earlier dump, so
# that file has to exist on disk for this cell to run -- confirm.
# eval_set is built inline on purpose: keeping the .values arrays in a
# notebook-level variable would hold them in memory after the fit.
model.fit(
    X_train,
    y_train,
    load_model_path='tabnet_test_2/tabnet-9-bdeba228-b64b-4b13-8d4b-185a2a5c008c.joblib',
    max_epochs=100,
    eval_set=[
        (X_train.values, y_train.values.reshape(-1, 1)),
        (X_test.values, y_test.values.reshape(-1, 1)),
    ],
    eval_name=['train', 'valid'],
    eval_metric=['mse', 'mae'],
    patience=50,
    batch_size=262144,
    virtual_batch_size=131072,  # must evenly divide batch_size
    drop_last=False,
    compute_importance=False,
    callbacks=[SaveModelCallback('./tabnet_test_3/', 'tabnet')],
)
# Log line recorded from an earlier run:
# epoch 9  | loss: 85.13423| train_mse: 85.31628| train_mae: 6.3185  | valid_mse: 74.08844| valid_mae: 5.73756 |  3:45:24s

: 