In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the read-only Kaggle input directory and echo
# its full path, one per line, in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet
/kaggle/input/jane-street-real-time-market

In [2]:
import cudf
# Load the cudf.pandas extension for pandas-like GPU acceleration.
# NOTE(review): pandas was already imported in the first cell of this notebook;
# the cudf.pandas documentation asks for the extension to be loaded before
# pandas is imported -- confirm acceleration is actually active here, or move
# this cell to the top of the notebook.
%load_ext cudf.pandas

In [3]:
import pickle
import polars as pl
import pandas as pd
from cuml.linear_model import LinearRegression as cuLinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from glob import glob
import numpy as np

# Walk-forward (expanding-window) linear regression on `responder_6`.
#
# For every training partition the script:
#   1. loads the partition's Parquet file(s) with polars,
#   2. fills nulls and derives one-step lagged responder columns,
#   3. iterates over the partition's date_ids in ascending order, predicting
#      each date with the model fitted on all earlier dates of the SAME
#      partition and then refitting on the history including that date,
#   4. prints cumulative MSE / R^2 after every predicted date,
#   5. pickles the model and prints the head of the actual-vs-predicted table.
#
# NOTE(review): history is rebuilt from scratch for each partition and the
# pickle is overwritten at the same path, so the saved file appears to reflect
# only the last processed partition -- confirm this is intended.

# Initialize variables
saved_model_path = "saved_model_lr_gpu.pkl"  # Path to save the model
model = cuLinearRegression()  # GPU-accelerated Linear Regression

# Loop-invariant configuration (hoisted out of the partition loop).
# Directory containing the Parquet files
data_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"

# List of responder columns
responder_columns = [f"responder_{i}" for i in range(9)]

# Define the selected features and lagged responders
selected_features = [
    'responder_3_lag_1', 'responder_8_lag_1', 'responder_7_lag_1', 'responder_4_lag_1', 'responder_5_lag_1',
    'responder_0_lag_1', 'responder_2_lag_1', 'responder_1_lag_1',
    'feature_06', 'feature_60', 'feature_49', 'feature_04', 'feature_07',
    'feature_58', 'feature_59', 'feature_47', 'feature_51', 'feature_36',
    'feature_52', 'feature_68', 'feature_13', 'feature_02', 'feature_05',
    'feature_41', 'feature_01', 'time_id', 'feature_54', 'feature_40',
    'feature_03', 'feature_55', 'feature_08', 'feature_19', 'feature_48',
    'feature_00', 'feature_71', 'feature_66', 'feature_45'
]

for k in range(0, 10):
    # Collect all Parquet file paths for this partition
    parquet_files = glob(f"{data_path}/partition_id={k}/part-0.parquet")

    print(f"Processing partition {k}")

    # pl.concat() raises on an empty list; skip partitions that are absent
    # instead of aborting the whole run.
    if not parquet_files:
        print(f"No Parquet files found for partition {k}; skipping")
        continue

    # Read and stack the partition's Parquet file(s)
    final_df = pl.concat([pl.read_parquet(file) for file in parquet_files])

    # Print summary
    print(f"Final DataFrame shape: {final_df.shape}")

    # Fill nulls forward, then backward for any leading nulls that remain.
    # NOTE(review): the fill runs over the whole frame in row order (not per
    # symbol) and the backward pass copies values from later rows -- confirm
    # this is acceptable for a forecasting setup.
    final_df = final_df.fill_null(strategy="forward").fill_null(strategy="backward")

    # Add `<responder>_lag_1` for every responder in a single pass: the value
    # from the previous row within the same (time_id, symbol_id) group.
    final_df = final_df.with_columns(
        [
            pl.col(responder)
            .shift(1)
            .over(["time_id", "symbol_id"])
            .alias(f"{responder}_lag_1")
            for responder in responder_columns
        ]
    )

    # Ensure the selected features exist BEFORE selecting them, so a missing
    # column produces this explicit error rather than a polars lookup failure.
    missing_features = [feature for feature in selected_features if feature not in final_df.columns]
    if missing_features:
        raise ValueError(f"Missing features in the DataFrame: {missing_features}")

    # Drop all columns except the selected features, the target and date_id;
    # the first row of each lag group is null after shift(1) and becomes 0.
    final_df = final_df.select(selected_features + ["responder_6", "date_id"])
    final_df = final_df.fill_null(0)

    # Convert Polars DataFrame to pandas DataFrame
    df = final_df.to_pandas()

    # Order rows by date_id once (stable, so the original row order inside
    # each date is kept). Every expanding-window history is then a plain
    # prefix slice of these arrays, with no per-date DataFrame concatenation.
    date_ids = df["date_id"].to_numpy()
    order = np.argsort(date_ids, kind="stable")
    X_all = np.ascontiguousarray(df[selected_features].to_numpy()[order])
    y_all = df["responder_6"].to_numpy()[order]

    # First row index of each date in the sorted arrays, and the matching
    # exclusive end index.
    unique_dates, date_starts = np.unique(date_ids[order], return_index=True)
    date_ends = np.append(date_starts[1:], len(order))

    # Historical data seen so far (empty until the first date is processed)
    historical_X = X_all[:0]
    historical_y = y_all[:0]

    # Predictions for evaluation
    prediction_chunks = []
    predictions = np.empty(0, dtype=y_all.dtype)
    actuals = y_all[:0]

    for current_date, start, end in zip(unique_dates, date_starts, date_ends):
        # Extract data for the current date_id
        X_current = X_all[start:end]
        y_current = y_all[start:end]

        # Predict for the current rows if there's historical data
        if start > 0:
            # assumes cuML returns a host (NumPy) array for NumPy input -- TODO confirm
            pred = model.predict(X_current)
            prediction_chunks.append(np.asarray(pred))

        # Update the historical data and train the model
        historical_X = X_all[:end]
        historical_y = y_all[:end]
        model.fit(historical_X, historical_y)

        # Nothing has been predicted yet on the first date of a partition
        if not prediction_chunks:
            continue

        # Cumulative predictions/actuals from the second date up to this one
        predictions = np.concatenate(prediction_chunks)
        actuals = y_all[date_ends[0]:end]

        # Evaluate the model
        mse = mean_squared_error(actuals, predictions)
        r2 = r2_score(actuals, predictions)

        # Print model details and evaluation metrics
        print(f"Partition {k}, Date{current_date} - Mean Squared Error (MSE): {mse}")
        print(f"Partition {k}, Date{current_date} - R^2 Score: {r2}")

    # Save the model
    with open(saved_model_path, "wb") as f:
        pickle.dump(model, f)

    # Display predictions vs actual values
    results = pd.DataFrame({'Actual': actuals, 'Predicted': predictions})
    print(results.head())


  return init_func(self, *args, **filtered_kwargs)


Processing partition 0
Final DataFrame shape: (1944210, 92)
Partition 0, Date1 - Mean Squared Error (MSE): 0.6865974068641663
Partition 0, Date1 - R^2 Score: -0.040895506665343806
Partition 0, Date2 - Mean Squared Error (MSE): 0.6777842044830322
Partition 0, Date2 - R^2 Score: -0.04448766054959741
Partition 0, Date3 - Mean Squared Error (MSE): 0.7724493145942688
Partition 0, Date3 - R^2 Score: -0.03209504639460814
Partition 0, Date4 - Mean Squared Error (MSE): 0.7002988457679749
Partition 0, Date4 - R^2 Score: -0.02611535069144688
Partition 0, Date5 - Mean Squared Error (MSE): 0.6982638835906982
Partition 0, Date5 - R^2 Score: -0.015366612788089151
Partition 0, Date6 - Mean Squared Error (MSE): 0.7041730880737305
Partition 0, Date6 - R^2 Score: -0.012667377418040582
Partition 0, Date7 - Mean Squared Error (MSE): 0.7096636295318604
Partition 0, Date7 - R^2 Score: -0.009068087030298377
Partition 0, Date8 - Mean Squared Error (MSE): 0.8561847805976868
Partition 0, Date8 - R^2 Score: -0.01