Here we go, time to get cracking with an aggregate of models.

In [None]:
import os
import gc

import pandas as pd
import polars as pl
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import random

import kaggle_evaluation.jane_street_inference_server

In [None]:
# Fix the random seeds so that score differences between runs come from
# code changes rather than from random noise.
def seed_randomness(seed):
    """Seed both Python's ``random`` module and NumPy's global RNG with ``seed``."""
    # Both generators are independent, so each one has to be seeded.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_randomness(42)

**LOADING DATA**

When training the model, we kept running into issues with the notebook trying to allocate more memory than was available. After some looking around online, we found a function that reduces the memory usage of a pandas dataframe by going through each of its columns and opting for smaller data types wherever possible. The code was adapted from: https://www.machinelearningplus.com/data-manipulation/how-to-reduce-the-memory-size-of-pandas-data-frame/

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Every column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is inspected; its min and max decide the narrowest integer
    (int8..int64) or float (float16, float32, else float64) type that can
    hold the observed range. Bounds are inclusive, so a value exactly equal
    to a dtype's limit (e.g. 127 for int8) still selects that dtype.

    Args:
        df: pandas DataFrame. It is modified in place.

    Returns:
        The same DataFrame object, with downcast columns.

    Note:
        Float columns are chosen by value *range* only. float16 keeps roughly
        three significant decimal digits, so downcasting to it is lossy.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':  # for integers
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:  # for floats
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
data = []
MAX_TIME_ID = 967
# Due to time and memory constraints, we can't look at all of the training data
# Look at the end of the data set because it shows the responder trends more clearly
# (it also has fewer missing values)
for i in range(6, 10):
    train = pl.read_parquet(f"/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet")
    train = train.to_pandas()
    # Feature engineering: create new features based on time_id
    # (shows how far through the day we are)
    # sin and cos are cyclic so they wrap around to make the end of one day the same as the start of the next
    train['sin_time'] = np.sin(2*np.pi*train['time_id']/MAX_TIME_ID)
    train['cos_time'] = np.cos(2*np.pi*train['time_id']/MAX_TIME_ID)
    # The "two_*" features run at twice the base frequency (4*pi). This must
    # match the formula used in `predict`, otherwise the models are trained
    # on a different feature than the one they are served at inference time.
    train['two_sin_time'] = np.sin(4*np.pi*train['time_id']/MAX_TIME_ID)
    train['two_cos_time'] = np.cos(4*np.pi*train['time_id']/MAX_TIME_ID)
    #train['four_sin_time']=np.sin(8*np.pi*train['time_id']/MAX_TIME_ID)
    #train['four_cos_time']=np.cos(8*np.pi*train['time_id']/MAX_TIME_ID)

    # Reduce memory usage
    train = reduce_mem_usage(df=train)
    # Storing data in the list
    data.append(train)

# Concatenate all of the pandas objects into one
train = pd.concat(data)

# Remove the stuff that we don't need anymore
del data
# Note the call parentheses: a bare `gc.collect` only names the function
# and never runs a collection.
gc.collect()

# Here is the ordering for the features that we want to use for training
# (feature_00 ... feature_78, zero-padded to two digits)
training_features = ['symbol_id','sin_time','cos_time','two_sin_time','two_cos_time'] + [f'feature_{i:02d}' for i in range(79)]
train = train[['responder_6'] + training_features]

# Let's see what the start of the training data is looking like
train.head()

**TRAINING THE MODEL**

We will be using several open source libraries that can each train up specific machine learning models. The libraries that we have chosen are xgboost, lightgbm, and catboost.

Here are the parameters for each of them that we have landed on through reading the library documentation and doing iterative hyperparameter tuning:

In [None]:
# Hyperparameters for the three gradient-boosting libraries.
lgb_params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'random_state': 2025,
    'max_depth': 10,
    'learning_rate': 0.1,
    'n_estimators': 120,
    'colsample_bytree': 0.6,
    'colsample_bynode': 0.6,
    'reg_alpha': 0.2,
    'reg_lambda': 5,
    'extra_trees': True,
    'num_leaves': 64,
    'max_bin': 255,
    'device': 'gpu',
    'gpu_use_dp': True,
}
cat_params = {
    'task_type': 'GPU',
    'random_state': 2025,
    'eval_metric': 'RMSE',
    'bagging_temperature': 0.50,
    'iterations': 200,
    'learning_rate': 0.1,
    'max_depth': 12,
    'l2_leaf_reg': 1.25,
    'min_data_in_leaf': 24,
    'random_strength': 0.25,
}
xgb_params = {
    'random_state': 2025,
    'n_estimators': 125,
    'learning_rate': 0.1,
    'max_depth': 10,
    'reg_alpha': 0.08,
    'reg_lambda': 0.8,
    'subsample': 0.95,
    'colsample_bytree': 0.6,
    'min_child_weight': 3,
    'tree_method': 'hist',
    'device': 'cuda',
}

# Build the feature matrix and target once and share them between the three
# fits, instead of re-materialising the same arrays for every model.
X_train = train[training_features].values
y_train = train['responder_6'].values

lgb = LGBMRegressor(**lgb_params)
lgb.fit(X_train, y_train)

cat = CatBoostRegressor(**cat_params)
cat.fit(X_train, y_train)

xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# The arrays are not needed after training; release them before inference.
del X_train, y_train
gc.collect()

The evaluation API requires that you set up a server which will respond to inference requests. We have already defined the server; you just need to write the predict function. When we evaluate your submission on the hidden test set, the client defined in `jane_street_gateway` will run in a different container with direct access to the hidden test set and hand off the data timestep by timestep.



Your code will always have access to the published copies of the files.

In [None]:
# Most recent `lags` frame handed to `predict`; kept so later calls can read it.
lags_ : pl.DataFrame | None = None


# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Return blended `responder_6` predictions for one batch.

    Args:
        test: batch of rows to score; must provide `row_id`, `time_id` and
            every column in `training_features` other than the time features
            computed here.
        lags: previous-day responders, or None. When given it is stored in
            the module-level `lags_` (it is not otherwise used here).

    Returns:
        A Polars frame with `row_id` and `responder_6`, where `responder_6`
        is 0.55*LightGBM + 0.2*CatBoost + 0.25*XGBoost, clipped to just
        inside [-5, 5].
    """
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags_ = lags

    # Output skeleton: row ids plus a placeholder column overwritten below.
    predictions = test.select(
        'row_id',
        pl.lit(0.0).alias('responder_6'),
    )

    frame = test.to_pandas()
    # Cyclic time-of-day features at the base frequency (2*pi) and twice it (4*pi).
    time_features = (
        ('sin_time', np.sin, 2),
        ('cos_time', np.cos, 2),
        ('two_sin_time', np.sin, 4),
        ('two_cos_time', np.cos, 4),
    )
    for column, wave, multiple in time_features:
        frame[column] = wave(multiple*np.pi*frame['time_id']/MAX_TIME_ID)

    # NOTE(review): missing values become -1 here and the models receive a
    # DataFrame, whereas the training cell fits on `.values` without any
    # fillna -- confirm this train/inference difference is intended.
    frame = frame.fillna(-1)
    frame = frame[training_features]

    lgb_out = lgb.predict(frame)
    cat_out = cat.predict(frame)
    xgb_out = xgb.predict(frame)
    blended = 0.55*lgb_out + 0.2*cat_out + 0.25*xgb_out

    # Keep the predictions strictly inside the open interval (-5, 5).
    eps = 1e-10
    blended = np.clip(blended, -5+eps, 5-eps)

    return predictions.with_columns(pl.Series('responder_6', blended.ravel()))

When your notebook is run on the hidden test set, inference_server.serve must be called within 15 minutes of the notebook starting or the gateway will throw an error. If you need more than 15 minutes to load your model you can do so during the very first `predict` call, which does not have the usual 1 minute response deadline.

In [None]:
# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set (and non-empty) we serve requests
# from the gateway; otherwise we run the gateway locally on the published
# test and lags files.
is_competition_rerun = os.getenv('KAGGLE_IS_COMPETITION_RERUN')
if not is_competition_rerun:
    local_inputs = (
        '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
        '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
    )
    inference_server.run_local_gateway(local_inputs)
else:
    inference_server.serve()