In [34]:
import os

import pandas as pd
import polars as pl

import kaggle_evaluation.jane_street_inference_server

In [35]:
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [36]:
df0 = pd.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet')
display(df0)

# To deal with NaN feature values
df0.fillna(0, inplace=True)

def get_lagged_responders(df):

    # - Shifts responder column values one down to simulate lagged responders
    # * "Top row" will be replaced with NaN values
    lagged = df.groupby('symbol_id')[['date_id', 'time_id']+[f"responder_{i}" for i in range(9)]].shift(1)
    lagged.columns = [f'{i}_lag_1' for i in lagged.columns]
    return pd.concat([df, lagged], axis=1)

df0 = get_lagged_responders(df0)

# Drop rows with NaN values, i.e. the "Top row"s mentioned above that have no lagged responder values
df0.dropna(inplace=True)

# X = df0[[col for col in df0.columns if col != 'responder_6']]
X = df0.drop(columns=['weight']+[f'responder_{i}' for i in range(9)])
y = df0['responder_6']

# display(X.columns)
# display(y.columns)

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8
0,0,0,1,3.889038,,,,,,0.851033,...,-0.281498,0.738489,-0.069556,1.380875,2.005353,0.186018,1.218368,0.775981,0.346999,0.095504
1,0,0,7,1.370613,,,,,,0.676961,...,-0.302441,2.965889,1.190077,-0.523998,3.849921,2.626981,5.000000,0.703665,0.216683,0.778639
2,0,0,9,2.285698,,,,,,1.056285,...,-0.096792,-0.864488,-0.280303,-0.326697,0.375781,1.271291,0.099793,2.109352,0.670881,0.772828
3,0,0,10,0.690606,,,,,,1.139366,...,-0.296244,0.408499,0.223992,2.294888,1.097444,1.225872,1.225376,1.114137,0.775199,-1.379516
4,0,0,14,0.440570,,,,,,0.955200,...,3.418133,-0.373387,-0.502764,-0.348021,-3.928148,-1.591366,-5.000000,-3.572820,-1.089123,-5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1944205,169,848,19,3.438631,,,,,,-0.028087,...,-0.166964,0.983339,-0.669860,0.272615,-3.676842,-1.221126,1.070584,0.465345,0.207483,0.874975
1944206,169,848,30,0.768528,,,,,,-0.022584,...,-0.352810,0.992615,0.961595,1.089402,0.796034,0.488380,1.846634,-0.088542,-0.008324,-0.153451
1944207,169,848,33,1.354696,,,,,,-0.024804,...,-0.239716,1.701618,0.757672,-5.000000,-3.174266,-1.110790,-3.349107,-0.407801,-0.185842,-0.931004
1944208,169,848,34,1.021797,,,,,,-0.016138,...,-0.442859,-2.036891,-0.064228,1.919665,1.827681,0.872019,3.248694,0.254584,0.090288,0.434726


In [37]:
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)

display(mean_squared_error(y_val, y_pred))

0.1173139

The evaluation API requires that you set up a server which will respond to inference requests. We have already defined the server; you just need write the predict function. When we evaluate your submission on the hidden test set the client defined in `jane_street_gateway` will run in a different container with direct access to the hidden test set and hand off the data timestep by timestep.



Your code will always have access to the published copies of the files.

In [38]:
test_parquet = pd.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet')
display(test_parquet)

lags_parquet = pd.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet/date_id=0/part-0.parquet')
display(lags_parquet)

Unnamed: 0,row_id,date_id,time_id,symbol_id,weight,is_scored,feature_00,feature_01,feature_02,feature_03,...,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
0,0,0,0,0,3.169998,True,0.0,0.0,0.0,0.0,...,-0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
1,1,0,0,1,2.165993,True,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,-0.0,,,0.0,0.0,0.0,0.0
2,2,0,0,2,3.06555,True,0.0,-0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
3,3,0,0,3,2.698642,True,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
4,4,0,0,4,1.80333,True,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,-0.0,,,0.0,0.0,0.0,0.0
5,5,0,0,5,2.605776,True,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,0.0,,,0.0,0.0,0.0,0.0
6,6,0,0,6,1.047993,True,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,0.0,,,0.0,0.0,0.0,0.0
7,7,0,0,7,4.231289,True,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,-0.0,,,0.0,0.0,-0.0,-0.0
8,8,0,0,8,2.600524,True,0.0,0.0,0.0,0.0,...,-0.0,-0.0,0.0,0.0,,,0.0,0.0,0.0,0.0
9,9,0,0,9,1.256275,True,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,-0.0,,,0.0,0.0,-0.0,-0.0


Unnamed: 0,date_id,time_id,symbol_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
0,0,0,0,-0.442215,-0.322407,0.143594,-0.92689,-0.782236,-0.036595,-1.305746,-0.795677,-0.143724
1,0,0,1,-0.651829,-1.70784,-0.893942,-1.065488,-1.871338,-0.615652,-1.162801,-1.205924,-1.245934
2,0,0,2,-0.656373,-0.264575,-0.892879,-1.511886,-1.03348,-0.378265,-1.57429,-1.863071,-0.027343
3,0,0,3,-0.188186,-0.19097,-0.70149,0.098453,-1.015506,-0.054984,0.329152,-0.965471,0.576635
4,0,0,4,-0.257462,-0.471325,-0.29742,0.074018,-0.324194,-0.597093,0.219856,-0.276356,-0.90479
5,0,0,5,0.027579,-0.020169,0.640348,-0.948373,-0.374251,-0.24035,-0.913801,-0.548867,-1.283726
6,0,0,6,-0.419646,-0.181228,-0.194079,0.667993,0.936857,0.517728,0.896325,1.068884,1.57929
7,0,0,7,-0.114118,-0.198511,-0.200027,-0.410021,-0.135167,-0.182887,-0.492168,-0.142915,-0.202081
8,0,0,8,-0.374147,0.092127,0.294723,0.402989,2.060188,-0.225042,0.95646,2.185598,-0.435856
9,0,0,9,-0.529529,0.040104,-0.33309,-0.95904,-1.318411,-0.774299,-0.716492,-1.471419,-1.107083


In [44]:
lags_ : pl.DataFrame | None = None


# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    
    test = test.to_pandas()

    
    
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags = lags.to_pandas()
        lags_={}
        for i in range(len(lags)):
            lags_[lags.iloc[i]['symbol_id']]=lags.drop('symbol_id', axis=1).iloc[i]
    # display(lags_)
    
    # lags_ = lags

    # test_data = pd.concat([test, lags], axis=1)
    lagged_data = pd.DataFrame(columns=['date_id_lag_1', 'time_id_lag_1']+[f'responder_{i}_lag_1' for i in range(9)])
    for i in range(len(test)):
        if(lags_ != None and test.iloc[i]['symbol_id'] in lags_):
            lagged_data.loc[len(lagged_data)] = lags_[test.iloc[i]['symbol_id']]
        else:
            lagged_data.loc[len(lagged_data)] = pd.Series([float('nan')] * len(lagged_data.columns))
    
    test_data = pd.concat([test, lagged_data], axis=1)
    test_data.drop(columns=['row_id','is_scored', 'weight'], inplace=True)

    # display(test_data.columns)
    # display(X_train.columns)
    
    preds = model.predict(test_data)
    
    predictions = pd.DataFrame({
        'row_id': range(len(preds)),
        'responder_6': preds
    })

    # display(predictions)

    
    # # Replace this section with your own predictions
    # predictions = test.select(
    #     'row_id',
    #     pl.lit(0.0).alias('responder_6'),
    # )
    
    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

# predict(test_parquet, lags_parquet)
# predict(test_parquet.iloc[0:5], None)

When your notebook is run on the hidden test set, inference_server.serve must be called within 15 minutes of the notebook starting or the gateway will throw an error. If you need more than 15 minutes to load your model you can do so during the very first `predict` call, which does not have the usual 1 minute response deadline.

In [45]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )

Index(['date_id', 'time_id', 'symbol_id', 'feature_00', 'feature_01',
       'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06',
       'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11',
       'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16',
       'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21',
       'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26',
       'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31',
       'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36',
       'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41',
       'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46',
       'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51',
       'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56',
       'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61',
      

Index(['date_id', 'time_id', 'symbol_id', 'feature_00', 'feature_01',
       'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06',
       'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11',
       'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16',
       'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21',
       'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26',
       'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31',
       'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36',
       'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41',
       'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46',
       'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51',
       'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56',
       'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61',
      

Unnamed: 0,row_id,responder_6
0,0,-1.029659
1,1,-0.975659
2,2,-1.491359
3,3,0.179639
4,4,0.250818
5,5,-1.094802
6,6,0.967704
7,7,-0.464549
8,8,1.062172
9,9,-0.811167
