In [1]:
import warnings
from warnings import simplefilter
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

### Data Pre-processing

In [22]:
from itertools import combinations
import gc
def data_preprocessing(df):
    cols = [c for c in df.columns if c not in ['row_id', 'time_id']]
    df = df[cols]

    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']

    for c in combinations(prices, 2):
        df[f'{c[0]}_minus_{c[1]}'] = (df[f'{c[0]}'] - df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_times_{c[1]}'] = (df[f'{c[0]}'] * df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]}-{c[1]})/({c[0]}+{c[1]})')

    for c in combinations(prices, 3):
        max_ = df[list(c)].max(axis=1)
        min_ = df[list(c)].min(axis=1)
        mid_ = df[list(c)].sum(axis=1)-min_-max_
        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_-mid_)/(mid_-min_)

    df.drop(columns=['date_id'], inplace=True)
    gc.collect()
    return df

def data_split_day(df:pd.DataFrame, _min, _max):
    df = df[df['date_id'] < _max]
    df = df[df['date_id'] >= _min]
    return df

def data_split_xy_and_data_preprocess(df:pd.DataFrame):
    x = df.drop(['target'],axis=1)
    x = data_preprocessing(x)
    y = df[['target']]
    return x, y

# Read the dataset from a CSV file using Pandas
df = pd.read_csv("train.csv")
df = df.dropna(subset=['target'])

df_train = data_split_day(df, 0, 399)
df_valid = data_split_day(df, 400, 478)
df_tests = data_split_day(df, 478, 481)

df_train_x, df_train_y = data_split_xy_and_data_preprocess(df_train)
df_valid_x, df_valid_y = data_split_xy_and_data_preprocess(df_valid)
df_tests_x, df_tests_y = data_split_xy_and_data_preprocess(df_tests)

### LGBMRegressor

In [10]:
from lightgbm import LGBMRegressor
lgb_params = {
        "objective": "mae",
        "n_estimators": 500,
        "num_leaves": 256,
        "subsample": 0.6,
        "colsample_bytree": 0.8,
        "learning_rate": 0.1,
        'max_depth': 40,
        "n_jobs": 8,
        "device": "cpu",
        "verbosity": -1,
        "importance_type": "gain",
        "reg_alpha": 0.2,
        "reg_lambda": 3.25
    }
LGB=LGBMRegressor(**lgb_params)

LGB.fit(
    df_train_x,  df_train_y,
    eval_set=[(df_valid_x, df_valid_y)],
    callbacks=[
        lgb.callback.early_stopping(stopping_rounds=100),
        lgb.callback.log_evaluation(period=10),
    ],
)

Training until validation scores don't improve for 100 rounds
[10]	valid_0's l1: 5.9661
[20]	valid_0's l1: 5.94858
[30]	valid_0's l1: 5.94158
[40]	valid_0's l1: 5.93865
[50]	valid_0's l1: 5.93717
[60]	valid_0's l1: 5.93414
[70]	valid_0's l1: 5.93243
[80]	valid_0's l1: 5.93206
[90]	valid_0's l1: 5.93169
[100]	valid_0's l1: 5.93202
[110]	valid_0's l1: 5.93143
[120]	valid_0's l1: 5.93129
[130]	valid_0's l1: 5.93146
[140]	valid_0's l1: 5.93125
[150]	valid_0's l1: 5.93116
[160]	valid_0's l1: 5.93158
[170]	valid_0's l1: 5.93167
[180]	valid_0's l1: 5.93183
[190]	valid_0's l1: 5.93207
[200]	valid_0's l1: 5.93213
[210]	valid_0's l1: 5.93222
[220]	valid_0's l1: 5.93237
[230]	valid_0's l1: 5.93266
[240]	valid_0's l1: 5.9327
[250]	valid_0's l1: 5.93277
Early stopping, best iteration is:
[151]	valid_0's l1: 5.93109


### Evaluation

In [11]:
pred_LGB = LGB.predict(df_train_x)
print("Train Loss : ", mean_absolute_error(df_train_y, pred_LGB))

pred_LGB = LGB.predict(df_valid_x)
print("valid Loss : ", mean_absolute_error(df_valid_y, pred_LGB))

pred_LGB = LGB.predict(df_tests_x)
print("Test Loss : ", mean_absolute_error(df_tests_y, pred_LGB))

Train Loss :  6.274945251444542
valid Loss :  5.93108832774886
Test Loss :  5.228813033954659


### Submission

In [13]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()
counter=0

for (test, revealed_targets, sample_prediction) in iter_test:
    feat = data_preprocessing(test[[ 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size','far_price', 'near_price',
       'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap','row_id']])
    sample_prediction['target'] = LGB.predict(feat)
    env.predict(sample_prediction)
    counter += 1


ModuleNotFoundError: No module named 'optiver2023.competition'