In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os, gc
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

import kaggle_evaluation.jane_street_inference_server

In [2]:
class CONFIG:
    """Notebook-wide settings: seed, target column, model input columns and model files."""

    # Seed value; not referenced by the other cells visible in this notebook.
    seed = 42

    # Name of the column the models are scored on and asked to predict.
    target_col = "responder_6"

    # Model input columns, in order: two identifier columns, feature_00..feature_78,
    # then responder_0_lag_1..responder_8_lag_1 (90 names in total).
    feature_cols = [
        "symbol_id",
        "time_id",
        *[f"feature_{num:02d}" for num in range(79)],
        *[f"responder_{num}_lag_1" for num in range(9)],
    ]

    # Pickle files to load; each is read as a mapping with a "model" entry.
    model_paths = ["/kaggle/input/jane-street-xgb-single-model/result.pkl"]

In [3]:
# Read the validation split lazily, materialize it, and convert it to pandas for the
# scikit-learn scoring done in the following cells.
# The path is a plain string literal (nothing is interpolated into it) with a single
# leading slash; the trailing slash is kept as written.
valid = pl.scan_parquet(
    "/kaggle/input/jane-street-preprocessing-fin-ai-blue/validation.parquet/"
).collect().to_pandas()

In [4]:
# Collect the fitted estimator stored under "model" in every pickled result file.
# NOTE: pickle.load can execute arbitrary code contained in the file, so
# CONFIG.model_paths must only point at trusted artefacts.
models = []
for model_path in CONFIG.model_paths:
    with open(model_path, mode="rb") as fp:
        result = pickle.load(fp)
    models += [result["model"]]

# Render each loaded estimator. After this loop the name `model` stays bound to the
# last entry of `models`.
for model in models:
    display(model)

In [5]:
# Split the validation frame into model inputs, target and per-row sample weights.
X_valid = valid.loc[:, CONFIG.feature_cols]
y_valid = valid.loc[:, CONFIG.target_col]
w_valid = valid.loc[:, "weight"]

# Show the three shapes as a quick sanity check.
tuple(part.shape for part in (X_valid, y_valid, w_valid))

((3599024, 90), (3599024,), (3599024,))

In [6]:
# Weighted R^2 of the equal-weight ensemble on the validation split.
# Every estimator in `models` contributes, averaged the same way predict() averages
# them (before its clipping step), rather than scoring only whichever single
# estimator the name `model` happened to be bound to by an earlier cell.
y_pred_valid = np.zeros(len(X_valid))
for fitted_model in models:
    y_pred_valid += fitted_model.predict(X_valid) / len(models)
valid_score = r2_score( y_valid, y_pred_valid, sample_weight=w_valid )
valid_score

0.021097901303587796

In [7]:
# Drop the validation frame, the objects derived from it, and the validation
# predictions so their memory can be reclaimed before inference starts.
del valid, X_valid, y_valid, w_valid, y_pred_valid
gc.collect()

0

In [8]:
# Lag frame cached from the most recent call that carried one, already reduced to a
# single row per (date_id, symbol_id). None until lags have been received.
lags_ : pl.DataFrame | None = None


def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch handed over by the inference gateway.

    The lagged-responder features are joined in from ``lags``. When a call
    carries no ``lags``, the frame cached in the module-level ``lags_`` by an
    earlier call is reused; zero-filled lag columns are used only while no lags
    have been received at all.

    Args:
        test: Batch to score. Must contain ``row_id``, ``date_id`` and the
            non-lag columns listed in ``CONFIG.feature_cols``.
        lags: Frame keyed by ``date_id`` / ``symbol_id`` that is expected to
            supply the ``responder_{0..8}_lag_1`` columns, or ``None``.
            NOTE(review): presumably the gateway sends lags only on some calls
            (hence the cache) - confirm against the competition API docs.

    Returns:
        A polars DataFrame with exactly the columns ``row_id`` and
        ``responder_6`` (ensemble mean clipped to [-5, 5]) and one row per row
        of the incoming ``test`` batch.

    Side effects:
        Updates the global ``lags_`` whenever ``lags`` is provided, and prints
        the prediction shape and the prediction frame.
    """
    global lags_
    n_rows = len(test)  # row count of the incoming batch, before any join

    if lags is not None:
        # Reduce to one row per (date_id, symbol_id) once, then cache the result so
        # that later calls without lags can still build the lag features.
        # NOTE(review): an earlier comment on this line said "pick up last record of
        # previous date" while the code takes .first(); confirm which of the two
        # matches how the lag features were built for training.
        lags_ = lags.group_by(["date_id", "symbol_id"], maintain_order=True).first()

    if lags_ is not None:
        # Left join keeps every test row; rows without a matching cached lag get nulls.
        test = test.join(lags_, on=["date_id", "symbol_id"], how="left")
    else:
        # No lags seen yet: fall back to zero-filled lag features.
        test = test.with_columns(
            ( pl.lit(0.0).alias(f'responder_{idx}_lag_1') for idx in range(9) )
        )

    preds = np.zeros((test.shape[0],))
    if test.shape[0] > 0:
        # Convert to pandas once instead of once per model.
        features = test[CONFIG.feature_cols].to_pandas()
        for model in tqdm(models):
            preds += model.predict(features) / len(models)
    print("predict> preds.shape =", preds.shape)

    predictions = \
    test.select('row_id').\
    with_columns(
        pl.Series(
            name   = 'responder_6',
            values = np.clip(preds, a_min = -5, a_max = 5),
            dtype  = pl.Float64,
        )
    )
    print(predictions)

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responder_6'
    assert list(predictions.columns) == ['row_id', 'responder_6']
    # and as many rows as the incoming test batch.
    assert len(predictions) == n_rows

    return predictions

In [9]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# Without the KAGGLE_IS_COMPETITION_RERUN environment variable, exercise predict()
# through the local gateway on the bundled test/lags files; otherwise serve.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    inference_server.serve()

  0%|          | 0/1 [00:00<?, ?it/s]

predict> preds.shape = (39,)
shape: (39, 2)
┌────────┬─────────────┐
│ row_id ┆ responder_6 │
│ ---    ┆ ---         │
│ i64    ┆ f64         │
╞════════╪═════════════╡
│ 0      ┆ 0.070714    │
│ 1      ┆ 0.070177    │
│ 2      ┆ 0.060707    │
│ 3      ┆ 0.036295    │
│ 4      ┆ 0.081751    │
│ …      ┆ …           │
│ 34     ┆ 0.085803    │
│ 35     ┆ 0.029169    │
│ 36     ┆ 0.082519    │
│ 37     ┆ 0.061834    │
│ 38     ┆ 0.110552    │
└────────┴─────────────┘
