In [1]:
import os

import pandas as pd
import polars as pl

import kaggle_evaluation.jane_street_inference_server

The evaluation API requires that you set up a server which will respond to inference requests. We have already defined the server; you just need to write the predict function. When we evaluate your submission on the hidden test set, the client defined in `jane_street_gateway` will run in a different container with direct access to the hidden test set and will hand off the data timestep by timestep.



Your code will always have access to the published copies of the files.

In [2]:
import pickle

import xgboost as xgb  # `xgb` is not referenced in the cells visible here

# Template guidance for the `predict` function defined below:
#   * its body is meant to be replaced with your own inference code;
#   * it may return either a Pandas or a Polars dataframe (Polars is recommended);
#   * every batch of predictions except the very first must be returned
#     within 1 minute of the batch features being provided.

# Most recent lagged responders handed to `predict`; stays None until a call supplies them.
lags_: pl.DataFrame | None = None

# Model object reused across `predict` calls; stays None until `predict` loads it.
loaded_model = None

# Pickled model artifact, loaded lazily on the first `predict` call.
MODEL_PATH = "/kaggle/input/catboost-linear/other/default/1/saved_model_catboost (1).pkl"

# Key columns used to attach the lagged responders to the test batch.
LAG_JOIN_KEYS = ["date_id", "time_id", "symbol_id"]

# Model input columns, in the order in which the feature matrix is built.
FEATURE_COLUMNS = [
    'responder_3_lag_1', 'responder_8_lag_1', 'responder_7_lag_1', 'responder_4_lag_1', 'responder_5_lag_1',
    'responder_0_lag_1', 'responder_2_lag_1', 'responder_1_lag_1',
    'feature_06', 'feature_60', 'feature_49', 'feature_04', 'feature_07',
    'feature_58', 'feature_59', 'feature_47', 'feature_51', 'feature_36',
    'feature_52', 'feature_68', 'feature_13', 'feature_02', 'feature_05',
    'feature_41', 'feature_01', 'time_id', 'feature_54', 'feature_40',
    'feature_03', 'feature_55', 'feature_08', 'feature_19', 'feature_48',
    'feature_00', 'feature_71', 'feature_66', 'feature_45',
]


def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict `responder_6` for one batch of test rows.

    Args:
        test: Feature rows for the current batch. Must contain `row_id`, the
            join keys in `LAG_JOIN_KEYS` and every non-lag column listed in
            `FEATURE_COLUMNS`.
        lags: Lagged responders, or None. All the responders from the previous
            day are passed in at time_id == 0; they are stored in the global
            `lags_` and reused on later calls where `lags` is None.

    Returns:
        A Polars DataFrame with exactly the columns `row_id` and `responder_6`
        and one row per row of `test`.

    Raises:
        AssertionError: If the result does not have the expected columns or
            row count (for example if the lag join duplicated rows).
    """
    global lags_, loaded_model
    if lags is not None:
        lags_ = lags

    if loaded_model is None:
        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file; only load artifacts from a trusted source.
        with open(MODEL_PATH, "rb") as model_file:
            loaded_model = pickle.load(model_file)

    if lags_ is None:
        # No lags have been supplied yet: add all-null lag columns so the
        # feature selection below still works (they become 0 after fillna).
        lag_columns = [name for name in FEATURE_COLUMNS if name.endswith("_lag_1")]
        combined_data = test.with_columns(
            [pl.lit(None, dtype=pl.Float64).alias(name) for name in lag_columns]
        )
    else:
        combined_data = test.join(lags_, on=LAG_JOIN_KEYS, how="left")

    # Polars DataFrames are immutable: `with_columns` returns a new frame, so
    # the result has to be re-bound for the forward-fill to take effect.
    # NOTE(review): confirm this matches the preprocessing used at training time.
    combined_data = combined_data.with_columns(
        pl.all().fill_null(strategy="forward")
    )

    # The pickled model is fed a Pandas frame; nulls that survive the
    # forward-fill (e.g. leading nulls) are replaced with 0.
    X_pred = combined_data.select(FEATURE_COLUMNS).to_pandas().fillna(0)

    responder_6_values = loaded_model.predict(X_pred)

    submission = combined_data.with_columns(
        pl.Series(name="responder_6", values=responder_6_values)
    ).select('row_id', 'responder_6')

    assert submission.columns == ['row_id', 'responder_6']
    # Confirm has as many rows as the test data.
    assert len(submission) == len(test)

    return submission

When your notebook is run on the hidden test set, inference_server.serve must be called within 15 minutes of the notebook starting or the gateway will throw an error. If you need more than 15 minutes to load your model you can do so during the very first `predict` call, which does not have the usual 1 minute response deadline.

In [3]:
# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Not a competition rerun: drive `predict` through a local gateway using
    # the test.parquet and lags.parquet files from the competition input directory.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    # Competition rerun: start serving inference requests.
    inference_server.serve()

In [4]:
# `os` is already imported in the first cell; re-importing is harmless and lets
# this cell run on its own.
import os
# Preview the submission file if one exists in the working directory
# (presumably written by the local gateway run above — confirm).
if os.path.isfile('submission.parquet'):
    pl_sub = pl.read_parquet('submission.parquet')
    # `display` is the notebook's rich-output helper; it renders the frame as a table.
    display(pl_sub)

row_id,responder_6
i64,f64
0,0.565396
1,0.468717
2,0.614237
3,0.79441
4,0.118273
…,…
34,0.548117
35,0.718404
36,0.183481
37,0.637067
