In [1]:
%%capture

import polars as pl
import pandas as pd
import numpy as np
import os
import warnings; warnings.filterwarnings(action='ignore')

from gc import collect
from pprint import pprint

from xgboost import XGBRegressor

In [2]:
TARGET = "responder_6"
WEIGHT = 'weight'
FEATURES = [f"feature_{i:02d}" for i in range(79)]

RANDOM_STATE = 42
N_FOLDS = 5

# **DATA LOADING**

Here, we load the data and describe the CV scheme. We don't need to specify the sub-directory paths while importing the datasets; polars knows to import all training components as this is a **hive** dataset. Specifying the train path is enough. Weights parameter is important here — this is a sample weight used in our custom eval-metric.


In [3]:
def load_data(file_path): 
    id_col = pl.int_range(pl.len(), dtype=pl.UInt32).alias("id") # Generate an id column
    all_cols = pl.all() # Select all columns

    # Read the parquet file and select the specified columns
    data = pl.scan_parquet(file_path).select(id_col, all_cols)
    
    all_col_names = data.collect_schema().names()
    
    # Cols to not look for when classifying train and target column names
    cols_of_disinterest = ("weight", "id", "date_id", "time_id", "partition_id")
    target_columns, selected_columns = [], []

    # Factory for loop to classify train and target column names
    for col in all_col_names: 
        if col.startswith("responder"):
            target_columns.append(col)

        elif not col.startswith(cols_of_disinterest):
            selected_columns.append(col)

    # Sample weights for custom metric 
    # weights = train.select(pl.col("weight")).collect().to_series()
    # date_column = train.select(pl.col("date_id")).collect()
    
    data = data.collect()
    
    return data

In [4]:
os.system('mkdir models')
model_path = '/kaggle/input/js_models'

mkdir: cannot create directory 'models': File exists


In [5]:
file_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"

raw_data = load_data(file_path)

dates_to_skip = 500
num_test_dates = 100

# Filter the DataFrame to include only dates greater than or equal to dates_to_skip
raw_data = raw_data.filter(pl.col('date_id') >= dates_to_skip)

# Get unique dates from the DataFrame
dates = raw_data['date_id'].unique()

# Define validation dates as the last `num_test_dates` dates
test_dates = dates[-num_test_dates:]

# Define training dates as all dates except the last `num_test_dates` dates
train_dates = dates[:-num_test_dates]

raw_data.head(3)

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64
7550157,500,0,1,4.372602,0.963079,0.84618,1.939234,1.16474,-3.205123,1.400332,1.898511,0.741687,-0.771141,11,7,76,-0.868997,0.425592,-0.567527,,-0.190049,,-1.571221,-1.437043,1.045398,,2.7043,1.706279,0.158466,0.663829,,,1.811922,-0.342157,-0.454954,,…,,,-0.180584,,-1.756384,1.894943,,1.671203,0.640596,0.202007,-0.301725,-0.227596,-0.232882,-1.177839,-1.555593,-0.839841,0.338598,-0.424715,-1.003515,0.59359,-0.449099,,,-0.299348,-0.242119,-0.409873,-0.217973,-0.788824,-0.48744,-0.964526,-0.626286,-0.334997,-0.268242,-0.310255,-0.004007,0.286226,2
7550158,500,0,2,1.199431,0.999282,0.764294,1.495962,1.022274,-3.490678,2.477362,1.646399,0.750717,-0.850572,81,2,59,-1.189323,-0.369424,-0.490278,,-0.423214,,-1.497059,-1.589559,-0.578691,,-0.466578,-0.832514,0.216229,-0.178512,,,0.522295,-0.337715,-0.75995,,…,,,-0.50654,,-2.575124,2.466314,,-0.942976,-0.517649,0.202007,0.30086,0.180316,0.457092,-1.667572,-1.902079,-1.052028,-0.368874,-0.689406,-0.665303,-0.233074,-0.753893,,,2.39499,1.658236,3.359613,3.601644,-0.291857,-0.329453,0.17402,0.122967,0.603747,0.371612,0.369653,0.823404,0.424229,2
7550159,500,0,3,0.689271,2.156528,0.379673,1.816525,1.40613,-2.788149,1.527157,1.377196,0.809322,-0.592266,4,3,11,-1.142846,0.255657,-0.589891,,0.357676,,-2.12926,-1.50641,-0.194643,,-1.396933,-0.544472,0.455939,1.345349,,,-1.497517,-0.453106,-0.511397,,…,,,-0.107985,,-0.649515,1.734169,,4.556169,1.784292,0.202007,-0.047826,-0.341877,-0.189566,,,-0.711492,0.055067,-0.587364,-1.103039,0.416081,-0.582688,,,1.390904,1.278604,0.061478,0.17966,-0.205636,-0.101051,-0.373885,0.610414,0.830006,-0.512456,0.776823,0.907171,-0.480423,2


In [6]:
test_data = raw_data.filter(pl.col('date_id').is_in(test_dates))

# Prepare validation data for training
X_test = test_data[FEATURES]
y_test = test_data[TARGET]
w_test = test_data[WEIGHT]

In [7]:
def train(model): 
    # Select dates for training based on the fold number
    selected_dates = [date for ii, date in enumerate(train_dates) if ii % N_FOLDS != i]
    
    train_data = raw_data.filter(pl.col('date_id').is_in(selected_dates))
    
    X_train = train_data[FEATURES]
    y_train = train_data[TARGET]
    w_train = train_data[WEIGHT]
    
    # Train XGBoost model with early stopping and verbose logging
    model.fit(X_train, y_train, sample_weight=w_train, 
              eval_set=[(X_test, y_test)], 
              sample_weight_eval_set=[w_test], 
              verbose=10, 
              early_stopping_rounds=100)
    
    # Append the trained model to the list
    models.append(model)

    # Save the trained model to a file
    joblib.dump(model, f'./models/{model_name}_{i}.model')
    
    
# Custom R2 metric for XGBoost
def r2_xgb(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    
    return -r2

In [None]:
xgb_params = {'n_estimators': 2000,
              'learning_rate': 0.1, 
              'max_depth': 6, 
              'tree_method': 'hist', 
              'objective': 'reg:squarederror',
              'eval_metric': r2_xgb,
              'disable_default_eval_metric': True}

xgb = XGBRegressor(**xgb_params)

for i in range(N_FOLDS): 
    train(xgb)

In [None]:
lags_ : pl.DataFrame | None = None

# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 10 minutes of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    
    if lags is not None:
        lags_ = lags

    # Predictions are clipped between -5 and 5.
    predictions = test.select(
        'row_id',
        pl.lit(0.0).clip(-5, 5).alias('responder_6'),
    )

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responder_6'
    assert predictions.columns == ['row_id', 'responder_6']
    # and as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [None]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )