In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
import os
import gc
import time
import warnings
from warnings import simplefilter
from itertools import combinations

import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)



In [2]:
is_offline = False    # Flag for online/offline mode
is_train = True    # Flag for training mode
is_infer = True    # Flag for inference mode
split_day = 435    # Split day for time series data

In [3]:
train = pd.read_csv("train.csv")

In [4]:
train.head(20)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4
5,5,0,0,0.0,0,1.000635,13552875.92,,,0.999779,1962.72,1.000635,5647.65,1.0,6.779432,0,0_0_5
6,6,0,0,969969.4,1,1.000115,3647503.98,,,0.999506,6663.16,1.000283,3810.48,1.0,-2.499819,0,0_0_6
7,7,0,0,9412959.1,1,0.999818,21261245.87,,,0.999741,5139.2,1.00013,2570.6,1.0,-1.959801,0,0_0_7
8,8,0,0,2394875.85,1,0.999916,9473209.08,,,0.999022,52011.6,1.000041,2169.36,1.0,-5.970001,0,0_0_8
9,9,0,0,3039700.65,-1,1.000969,6248958.45,,,0.999354,6191.0,1.000646,6199.0,1.0,7.970333,0,0_0_9


In [6]:
num_rows, num_columns = train.shape
print("Number of Rows: ", num_rows)
print("Number of Columns: ", num_columns)

Number of Rows:  5237980
Number of Columns:  17


In [7]:
train = train.dropna(subset=["target"])

# 🔁 Reset the index of the DataFrame and apply the changes in place
train.reset_index(drop=True, inplace=True)

num_rows, num_columns = train.shape
print("Number of Rows: ", num_rows)
print("Number of Columns: ", num_columns)

Number of Rows:  5237892
Number of Columns:  17


In [8]:
pos_changes = train['reference_price'].diff().clip(lower=1)
neg_changes = train['reference_price'].diff().clip(upper=1)

train['RSI'] = 1 - (100/(1 + pos_changes/neg_changes))

In [9]:
#train['date_id'] = pd.to_datetime(train['date_id'])

In [10]:
# Define the window size for the moving average
window_size = 2
# Compute the moving averages for each stock and the 'wap' column
train['SMA'] = train.groupby('stock_id')['wap'].transform(lambda x: x.rolling(window=window_size,  min_periods=1).mean())

In [11]:
daily_wap_sum = train.groupby('date_id')['wap'].sum()
rows_per_day = train.groupby('date_id').size()

overall_average_wap = daily_wap_sum / rows_per_day

train = train.merge(overall_average_wap.reset_index(name='daily_average_wap'), on='date_id', how='left')

In [23]:
#⚡Optiver 🚀Robust Best ⚡ Single Model features

train["volume"] = train.eval("ask_size + bid_size")
train["mid_price"] = train.eval("(ask_price + bid_price) / 2")
train["liquidity_imbalance"] = train.eval("(bid_size-ask_size)/(bid_size+ask_size)")
train["matched_imbalance"] = train.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
train["size_imbalance"] = train.eval("bid_size / ask_size")
train["price_spread"] = train["ask_price"] - train["bid_price"]
train['market_urgency'] = train['price_spread'] * train['liquidity_imbalance']

In [24]:
train.head(20)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,RSI,SMA,daily_average_wap,volume,mid_price,liquidity_imbalance,matched_imbalance,size_imbalance,price_spread,market_urgency
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,...,,1.0,1.000021,69144.53,0.999919,0.75434,-0.61589,7.141326,0.000214,0.0001614287
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,...,0.991601,1.0,1.000021,23838.13,1.000278,-0.728751,-0.815787,0.156905,0.000764,-0.0005567654
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,...,1.033511,1.0,1.000021,56951.0,0.99985,0.332935,-0.714567,1.99821,0.000895,0.0002979771
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,...,0.939037,1.0,1.000021,481357.3,1.000107,-0.99034,-0.213547,0.004853,0.000215,-0.0002129231
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,...,1.063941,1.0,1.000021,16919.64,0.999705,0.948687,-0.951109,37.976365,0.000622,0.0005900832
5,5,0,0,0.0,0,1.000635,13552875.92,,,0.999779,...,0.889822,1.0,1.000021,7610.37,1.000207,-0.484199,-1.0,0.347529,0.000856,-0.0004144739
6,6,0,0,969969.4,1,1.000115,3647503.98,,,0.999506,...,1.052027,1.0,1.000021,10473.64,0.999895,0.272368,-0.57987,1.748641,0.000777,0.0002116296
7,7,0,0,9412959.1,1,0.999818,21261245.87,,,0.999741,...,1.029709,1.0,1.000021,7709.8,0.999935,0.33316,-0.386262,1.999222,0.000389,0.0001295994
8,8,0,0,2394875.85,1,0.999916,9473209.08,,,0.999022,...,0.990201,1.0,1.000021,54180.96,0.999532,0.919922,-0.596417,23.97555,0.001019,0.0009374002
9,9,0,0,3039700.65,-1,1.000969,6248958.45,,,0.999354,...,0.894811,1.0,1.000021,12390.0,1.0,-0.000646,-0.345503,0.998709,0.001292,-8.342211e-07


In [25]:
train.tail(20)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,RSI,SMA,daily_average_wap,volume,mid_price,liquidity_imbalance,matched_imbalance,size_imbalance,price_spread,market_urgency
5237872,180,480,540,563043.09,-1,0.998141,6717567.0,0.996492,0.996767,0.997866,...,0.785362,0.997853,0.999562,43080.02,0.998004,0.361296,-0.845331,2.131339,0.000275,9.9e-05
5237873,181,480,540,498430.75,1,1.001248,13842170.0,1.001527,1.001304,1.001081,...,0.690262,1.000977,0.999562,61060.32,1.001193,-0.958833,-0.930487,0.021016,0.000223,-0.000214
5237874,182,480,540,2685032.08,1,0.999491,38488390.0,1.00021,1.00021,0.999311,...,1.176009,0.999431,0.999562,344639.36,0.999401,0.219214,-0.869575,1.561522,0.00018,3.9e-05
5237875,183,480,540,571821.88,-1,0.996694,97527970.0,0.996468,0.996468,0.996694,...,1.280485,0.996718,0.999562,103384.11,0.99675,-0.615091,-0.988342,0.238321,0.000113,-7e-05
5237876,184,480,540,283509.38,1,0.99665,15907410.0,0.998372,0.997156,0.996447,...,1.0044,0.996506,0.999562,138955.61,0.996549,-0.084319,-0.964979,0.844476,0.000203,-1.7e-05
5237877,185,480,540,3018493.53,-1,1.000057,27238940.0,0.997937,0.998915,1.000029,...,0.660457,1.000314,0.999562,255106.09,1.000165,-0.769175,-0.800479,0.13047,0.000272,-0.000209
5237878,186,480,540,2106025.88,-1,0.99978,19883460.0,0.998713,0.998713,0.99978,...,1.027708,0.999818,0.999562,691699.59,0.999856,-0.494699,-0.808452,0.338062,0.000153,-7.6e-05
5237879,187,480,540,3791745.66,1,0.999641,33806280.0,1.000641,1.000564,0.999564,...,1.013902,0.999632,0.999562,97820.38,0.999602,-0.508661,-0.798301,0.325679,7.7e-05,-3.9e-05
5237880,188,480,540,18930.38,-1,0.999741,6465049.0,0.999741,0.999741,0.999741,...,0.990001,0.999899,0.999562,334446.86,1.000019,-0.29922,-0.994161,0.539385,0.000556,-0.000166
5237881,189,480,540,956757.96,1,1.000715,37109380.0,1.001167,1.001054,1.000602,...,0.902695,1.000691,0.999562,837960.32,1.000659,0.881619,-0.949732,15.894634,0.000113,0.0001


In [16]:
offline_train = train[train["date_id"] <= split_day]
offline_test = train[train["date_id"] > split_day]
print("Offline mode")
print(f"train : {offline_train.shape}, valid : {offline_test.shape}")

Offline mode
train : (4742893, 20), valid : (494999, 20)
