In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import os

import lightgbm as lgb
import xgboost as xgb
import catboost as cbt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

In [None]:
# Function to generate lag features
def add_lagged_features(df, cols, shifts=1, add_first=True):
    """Add lagged and first-in-group copies of ``cols`` per (stock_id, date_id) group.

    For every column ``c`` in ``cols`` the result gains:

    * ``c_shift1`` ... ``c_shift<shifts>``: the value of ``c`` found 1..``shifts``
      rows earlier inside the same (stock_id, date_id) group, using the current
      row order. Slots with no earlier row (and any missing value carried over
      by the shift) are filled with the mean of ``c`` over the whole frame.
    * ``c_first`` (only when ``add_first`` is true): the first non-missing value
      of ``c`` in the group, repeated on every row of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``date_id`` and every column in ``cols``.
        The function does not sort, so rows must already be in the order the
        lags should follow. The frame itself is never modified.
    cols : iterable of str
        Columns to derive lag features from.
    shifts : int, default 1
        Number of lag columns to create per source column.
    add_first : bool, default True
        Whether to also add the ``<col>_first`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, same order, same index) with the new
        columns appended, grouped per source column.
    """
    # Work on a copy: assigning the lag columns directly would silently add
    # them to the caller's frame as a side effect.
    df = df.copy()
    group_keys = ["stock_id", "date_id"]
    for col in cols:
        # Selecting the column yields Series results, so each assignment below
        # is an unambiguous single-column assignment.
        grouped = df.groupby(group_keys)[col]
        fill_value = df[col].mean()
        for lag in range(1, int(shifts) + 1):
            df[f"{col}_shift{lag}"] = grouped.shift(lag).fillna(fill_value)
        if add_first:
            # transform() broadcasts the group's first value back onto its
            # rows, which avoids a full merge and cannot drop or reorder rows.
            df[f"{col}_first"] = grouped.transform("first")
    return df

In [None]:
# Entry point that builds the full feature frame
def generate_feature(df):
    """Return ``df`` enriched with lag features and stripped of identifier columns.

    Five lags plus the first-in-group value are added for each column in
    ``lagged_columns`` via ``add_lagged_features``; afterwards ``row_id``,
    ``time_id`` and ``date_id`` are removed. Every other column is kept,
    including ``target`` when the input has one.

    NOTE(review): ``imbalance_ratio``, ``imbalance_indicator`` and
    ``bid-ask_spread_indicator`` are not created anywhere in the visible
    notebook; presumably the input frame must already carry them - confirm.
    """
    lagged_columns = [
        "imbalance_ratio",
        "imbalance_indicator",
        "reference_price",
        "wap",
        "bid-ask_spread_indicator",
        "bid_size",
        "ask_size",
    ]
    identifier_columns = ("row_id", "time_id", "date_id")

    enriched = add_lagged_features(df, lagged_columns, shifts=5, add_first=True)
    kept = [name for name in enriched.columns if name not in identifier_columns]
    return enriched[kept]

In [None]:
# Turn a raw frame into a model-ready (X, y) pair
def preprocess(df):
    """Build features for ``df`` and split them into inputs and target.

    Rows containing a missing value in any column (features or ``target``)
    are discarded after feature generation.

    Returns
    -------
    (x, y)
        ``x`` is the feature frame without the ``target`` column and ``y`` is
        the ``target`` column of the same rows.
    """
    complete_rows = generate_feature(df).dropna()
    y = complete_rows["target"]
    x = complete_rows.drop(columns=["target"])
    return x, y

In [None]:
# Load the training data into the frame every later cell starts from
train_csv_path = '../input/optiver-trading-at-the-close/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Cross validation
# Folds are built over whole days, so a day never ends up on both sides of a split.
# `df` itself is left untouched: moving `date_id` into the index in place would
# break every later cell that reads `df["date_id"]` on a top-to-bottom run.
n_splits = 5
days = np.sort(df["date_id"].unique())
tscv = TimeSeriesSplit(n_splits)
for fold, (train_index, val_index) in enumerate(tscv.split(days)):
    # Only the last fold is evaluated; drop this guard to run all of them
    if fold != n_splits - 1:
        continue
    train_days, test_days = days[train_index], days[val_index]
    # Boolean selection returns new frames, so there is no in-place edit of a slice
    train_df = df[df["date_id"].isin(train_days)].reset_index(drop=True)
    val_df = df[df["date_id"].isin(test_days)].reset_index(drop=True)

    print(f"Train size: {len(train_df)} Train percentage: {len(train_df)/len(df)}")
    print(f"Val size: {len(val_df)} Val percentage:{len(val_df)/len(df)}")
    print(f"Total size: {(len(train_df) + len(val_df))} Total percentage: {(len(train_df)+ len(val_df)) / len(df)}")

    train_x, train_y = preprocess(train_df)
    val_x, val_y = preprocess(val_df)

In [None]:
# Hold-out split by day: days before `first_val_day` train, the rest validate
first_val_day = 335
train_df = df.loc[df["date_id"] < first_val_day].copy()
val_df = df.loc[df["date_id"] >= first_val_day].copy()

# Sanity check: the two parts should add up to the whole frame
n_total = len(df)
n_train = len(train_df)
n_val = len(val_df)
print(f"Train size: {n_train} Train percentage: {n_train/n_total}")
print(f"Val size: {n_val} Val percentage:{n_val/n_total}")
print(f"Total size: {(n_train + n_val)} Total percentage: {(n_train+ n_val) / n_total}")

# These (X, y) pairs are what the model cells below are fitted and evaluated on
train_x, train_y = preprocess(train_df)
val_x, val_y = preprocess(val_df)

In [None]:
# Fitted models; the submission loop averages the predictions of every entry
models = list()

In [None]:
# LGBM model (L1 objective, i.e. optimises mean absolute error)
lgbm_model = lgb.LGBMRegressor(objective='regression_l1', n_estimators=500)
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose` keyword arguments of fit() were deprecated
# in LightGBM 3.3 and removed in 4.0, where they raise a TypeError.
lgbm_model.fit(
    train_x,
    train_y,
    eval_set=[(val_x, val_y)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=10),
    ],
)
print(lgbm_model.best_score_)
models.append(lgbm_model)

In [None]:
# CatBoost model trained on the MAE objective
cbt_model = cbt.CatBoostRegressor(objective='MAE', iterations=3000)
# Validate on the hold-out set, stop after 100 rounds without improvement,
# and log every 10th iteration
cbt_fit_options = dict(
    eval_set=[(val_x, val_y)],
    early_stopping_rounds=100,
    verbose=10,
)
cbt_model.fit(train_x, train_y, **cbt_fit_options)
models.append(cbt_model)

In [None]:
# Submit
# NOTE(review): optiver2023 is presumably only importable inside the competition
# runtime, which is why it is imported here rather than in the top cell - confirm.
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()
counter = 0
# Batches already served for the day currently being predicted. The lag features
# are computed per (stock_id, date_id) group, so building them on a single batch
# leaves every `_shiftN` column without history (it collapses to the batch mean).
# NOTE(review): assumes each served batch holds one time step of one day - confirm.
day_batches = []
for (test, revealed_targets, sample_prediction) in iter_test:
    test = test.drop(columns=["currently_scored"])

    # Lags never cross days, so forget batches that belong to another day
    batch_dates = test["date_id"].unique()
    day_batches = [batch for batch in day_batches if batch["date_id"].isin(batch_dates).all()]
    day_batches.append(test)

    # Build features on the day's history, then keep only the rows of the
    # current batch: they are the last len(test) rows because the batch was
    # appended last and feature generation keeps the row order.
    history = pd.concat(day_batches, ignore_index=True)
    features = generate_feature(history).tail(len(test))

    # Simple ensemble: unweighted mean of all fitted models
    sample_prediction['target'] = np.mean([model.predict(features) for model in models], 0)
    env.predict(sample_prediction)
    counter += 1