In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/linregwithlagged/other/default/1/linreg_model_cv_nonrandom (1) (1).pkl
/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/t

In [2]:
import os

import pandas as pd
import polars as pl

import kaggle_evaluation.jane_street_inference_server

In [3]:
import pickle
import time
import polars as pl
import pandas as pd
from sklearn.linear_model import LinearRegression

# Global variables
lags_: pl.DataFrame | None = None
history_cache = pl.DataFrame()
lags_cache = pl.DataFrame()
loaded_model = None

# Track the maximum date_id encountered so far
max_date_id_encountered = -1

# Feature and responder configurations
features_to_lag = [
    'feature_06', 'feature_60', 'feature_49', 'feature_04', 'feature_07',
    'feature_58', 'feature_59', 'feature_47', 'feature_51', 'feature_36',
    'feature_52', 'feature_68', 'feature_13', 'feature_02', 'feature_05'
]
lag_range = range(1, 11)  # Lags from t-1 to t-10
responder_columns = [f"responder_{i}" for i in range(9)]

# Feature columns for prediction
lagged_feature_cols = [f"{feature}_lag_{lag}" for feature in features_to_lag for lag in lag_range]
lagged_responder_cols = [f"{responder}_lag_1" for responder in responder_columns]
feature_columns = features_to_lag + lagged_feature_cols + lagged_responder_cols + ["time_id"]
max_lag = 11

with open("/kaggle/input/linregwithlagged/other/default/1/linreg_model_cv_nonrandom (1) (1).pkl", "rb") as f:
    loaded_model = pickle.load(f)

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction for the test set."""
    global lags_, loaded_model, history_cache, lags_cache, max_lag

    # Ensure test DataFrame is not None
    if test is None:
        test = pl.DataFrame()

    # If lags DataFrame is provided, store the lagged responder data
    if lags is not None:
        lags_ = lags
        lags_cache = pl.concat([lags_cache, lags])

    # If test DataFrame is empty, create necessary columns with zeros
    if test.is_empty():
        # Create empty DataFrame with required columns
        test = pl.DataFrame({col: [] for col in ['date_id', 'time_id', 'symbol_id', 'row_id'] + features_to_lag})

    # Update history_cache with test data
    history_cache = pl.concat([history_cache, test])

    # Extract the top 3 unique `date_id` values (sorted descending)
    top_3_dates = history_cache.select("date_id").unique().sort("date_id", descending=True).limit(3)

    # Filter history_cache and lags_cache to keep only rows with top 3 `date_id` values
    history_cache = history_cache.filter(pl.col("date_id").is_in(top_3_dates["date_id"]))
    lags_cache = lags_cache.filter(pl.col("date_id").is_in(top_3_dates["date_id"]))

    # Start with filtered_df as a copy of history_cache
    filtered_df = history_cache

    # Get the current `date_id` from the test set
    current_date_id = test["date_id"][0] if not test.is_empty() else None

    # Filter to only the rows for the current date_id
    current_df = filtered_df.filter(pl.col("date_id") == current_date_id)

    # Create lagged features only for the current date_id
    for lag in range(1, max_lag + 1):
        lagged_cols = [
            pl.col(feature).shift(lag).alias(f"{feature}_lag_{lag}")
            for feature in features_to_lag
        ]
        current_df = current_df.with_columns(lagged_cols)

    # Join the lagged columns back into the main filtered_df
    filtered_df = filtered_df.join(
        current_df.select(["date_id", "time_id", "symbol_id"] + [f"{feature}_lag_{lag}" 
                                                                 for feature in features_to_lag 
                                                                 for lag in range(1, max_lag + 1)]),
        on=["date_id", "time_id", "symbol_id"],
        how="left"
    )

    # Proceed with joining filtered_df with test data
    combined_data = test.join(
        filtered_df,
        on=["date_id", "time_id", "symbol_id"],
        how="left"
    )

    # Step 2: Join the result with `lags_` on `date_id` and `symbol_id`, if lags_ is not None
    if lags_ is not None:
        combined_data = combined_data.join(
            lags_,
            on=["date_id", "symbol_id"],
            how="left"
        )

    # Remove columns ending with '_right' if any
    columns_to_keep = [col for col in combined_data.columns if not col.endswith('_right')]
    combined_data = combined_data.select(columns_to_keep)

    # Fill nulls for missing features dynamically
    combined_data = combined_data.fill_null(0)

    # Check if all feature columns exist in the combined data
    missing_columns = set(feature_columns) - set(combined_data.columns)
    if missing_columns:
        for missing_col in missing_columns:
            combined_data = combined_data.with_columns(
                pl.lit(0).alias(missing_col)
            )

    # Prepare input features for prediction
    X_pred = combined_data.select(feature_columns).to_pandas()
    X_pred.fillna(0, inplace=True)

    # Ensure the feature columns are in the same order as during training
    if hasattr(loaded_model, 'feature_names_in_'):
        expected_features = loaded_model.feature_names_in_
        X_pred = X_pred.reindex(columns=expected_features, fill_value=0)
    else:
        X_pred = X_pred[feature_columns]

    # Predict using the Linear Regression model
    predictions = loaded_model.predict(X_pred)

    # Add predictions to the DataFrame
    combined_data = combined_data.with_columns(pl.Series(name="responder_6", values=predictions))

    # Prepare the final predictions DataFrame
    predictions_df = combined_data.select(['row_id', 'responder_6'])

    # Ensure predictions_df is not empty and has the correct columns
    if isinstance(predictions_df, pl.DataFrame):
        assert predictions_df.columns == ['row_id', 'responder_6']
    elif isinstance(predictions_df, pd.DataFrame):
        assert list(predictions_df.columns) == ['row_id', 'responder_6']
    else:
        # If predictions_df is empty, create an empty DataFrame with required columns
        predictions_df = pl.DataFrame({"row_id": [], "responder_6": []})

    # Confirm that predictions match the test data length
    if len(predictions_df) != len(test):
        # If lengths do not match, adjust predictions_df to match test data
        predictions_df = test.select(['row_id']).with_columns(
            pl.lit(0).alias('responder_6')
        )

    return predictions_df


In [4]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )

In [5]:
import os
if os.path.isfile('submission.parquet'):
    pl_sub = pl.read_parquet('submission.parquet')
    display(pl_sub)

row_id,responder_6
i64,f32
0,0.022599
1,0.041672
2,0.008419
3,0.022348
4,0.02917
…,…
34,0.016545
35,0.028216
36,0.013382
37,0.016465
