In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/newmodellinregidksleepy/other/default/1/linreg_model_cv_nonrandom.pkl
/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/tr

In [2]:
import os

import pandas as pd
import polars as pl

import kaggle_evaluation.jane_street_inference_server

In [38]:
import pickle
import polars as pl
import pandas as pd
from sklearn.linear_model import LinearRegression

# Global variables
history_cache = pl.DataFrame()
loaded_model = None

# Feature and responder configurations
features_to_lag = [
    'feature_06', 'feature_60', 'feature_49', 'feature_04', 'feature_07',
    'feature_58', 'feature_59', 'feature_47', 'feature_51', 'feature_36',
    'feature_52', 'feature_68', 'feature_13', 'feature_02', 'feature_05',
]
responder_columns = [f"responder_{i}" for i in range(9)]
lag_range = range(1, 11)  # t-1 to t-10

# Generate all lagged column names
lagged_feature_cols = [f"{feature}_lag_{lag}" for feature in features_to_lag for lag in lag_range]
lagged_responder_cols = [f"{responder}_lag_{lag}" for responder in responder_columns for lag in lag_range]
feature_columns = features_to_lag + lagged_feature_cols + lagged_responder_cols + ["time_id"]

def align_schema(df: pl.DataFrame) -> pl.DataFrame:
    """
    Ensure the DataFrame columns match the schema expected by the model.
    """
    predefined_schema = {
        **{col: pl.Float32 for col in features_to_lag},
        **{col: pl.Float32 for col in responder_columns},
        "row_id": pl.Int32,
        "date_id": pl.Int32,
        "time_id": pl.Int32,
        "symbol_id": pl.Int32,
    }
    for col, dtype in predefined_schema.items():
        if col in df.columns:
            df = df.with_columns(df[col].cast(dtype))
        else:
            df = df.with_columns(pl.lit(0, dtype=dtype).alias(col))
    return df

def update_lags_incremental(test: pl.DataFrame) -> pl.DataFrame:
    """
    Update the history cache and calculate lagged features dynamically.
    """
    global history_cache

    # Align schemas before concatenation
    test = align_schema(test)
    history_cache = align_schema(history_cache)

    # Append the new test data to the history cache
    history_cache = pl.concat([history_cache, test])

    # Create lagged features for the most recent rows
    lagged_data = test.clone()
    for lag in lag_range:
        for feature in features_to_lag:
            lagged_col = f"{feature}_lag_{lag}"
            shifted_col = history_cache[feature].shift(lag)
            aligned_shifted_col = shifted_col.tail(len(test))
            lagged_data = lagged_data.with_columns(aligned_shifted_col.alias(lagged_col))

        for responder in responder_columns:
            lagged_col = f"{responder}_lag_{lag}"
            shifted_col = history_cache[responder].shift(lag)
            aligned_shifted_col = shifted_col.tail(len(test))
            lagged_data = lagged_data.with_columns(aligned_shifted_col.alias(lagged_col))

    return lagged_data

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pd.DataFrame:
    """
    Make predictions for test data, dynamically updating lagged features.
    """
    global loaded_model, history_cache

    # Load the Linear Regression model if not already loaded
    if loaded_model is None:
        with open("/kaggle/input/linwithlaggedfeatures/other/default/1/linreg_model_cv_nonrandom (1).pkl", "rb") as f:
            loaded_model = pickle.load(f)

    # Incorporate lags if provided
    if lags is not None:
        lags = align_schema(lags)
        history_cache = pl.concat([history_cache, lags])

    # Update lagged features dynamically
    test_with_lags = update_lags_incremental(test)

    # Fill missing lagged features dynamically
    missing_columns = set(feature_columns) - set(test_with_lags.columns)
    for missing_col in missing_columns:
        test_with_lags = test_with_lags.with_columns(pl.lit(0).alias(missing_col))

    # Convert to Pandas DataFrame for sklearn compatibility
    X_pred = test_with_lags.select(feature_columns).to_pandas()
    X_pred.fillna(0, inplace=True)

    # Predict using the Linear Regression model
    predictions = loaded_model.predict(X_pred)

    # Add predictions to the DataFrame
    result_df = test_with_lags.with_columns(pl.Series(name="responder_6", values=predictions))

    # Return the result as Polars DataFrame
    return result_df.select(['row_id', 'responder_6'])


In [39]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )

GatewayRuntimeError: (<GatewayRuntimeErrorType.SERVER_RAISED_EXCEPTION: 3>, 'unable to append to a DataFrame of width 37 with a DataFrame of width 94')

In [37]:
import os
if os.path.isfile('submission.parquet'):
    pl_sub = pl.read_parquet('submission.parquet')
    display(pl_sub)