In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/xgboost-model-json/xgb_model_2.json
/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.

In [2]:
import pickle
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from glob import glob
from numba import cuda


# Initialize variables
saved_model_path = "saved_model_lightgbm.pkl"  # Path to save the model
is_first_run = True  # True until the first partition has been trained

# Directory containing the Parquet files (one "partition_id=<k>" folder per partition)
data_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"

# Responder columns; each one gets a "<name>_lag_1" companion column below
responder_columns = [f"responder_{i}" for i in range(9)]

# Model inputs: lagged responders plus a hand-picked subset of raw features
selected_features = [
    'responder_3_lag_1', 'responder_8_lag_1', 'responder_7_lag_1', 'responder_4_lag_1', 'responder_5_lag_1',
    'responder_0_lag_1', 'responder_2_lag_1', 'responder_1_lag_1',
    'feature_06', 'feature_60', 'feature_49', 'feature_04', 'feature_07',
    'feature_58', 'feature_59', 'feature_47', 'feature_51', 'feature_36',
    'feature_52', 'feature_68', 'feature_13', 'feature_02', 'feature_05',
    'feature_41', 'feature_01', 'time_id', 'feature_54', 'feature_40',
    'feature_03', 'feature_55', 'feature_08', 'feature_19', 'feature_48',
    'feature_00', 'feature_71', 'feature_66', 'feature_45'
]


def process_files_on_gpu(files, gpu_id):
    """Read Parquet files and stack them into one Polars DataFrame.

    Args:
        files: Parquet file paths to read.
        gpu_id: CUDA device index made current (via numba) before reading.

    Returns:
        The row-wise concatenation of all files, or an empty ``pl.DataFrame``
        when ``files`` is empty.

    Note:
        ``pl.read_parquet`` returns an ordinary in-memory Polars DataFrame, so
        selecting a CUDA device here does not place the data on the GPU.
        NOTE(review): the per-GPU split is kept only to preserve the existing
        log output; confirm whether it can be dropped.
    """
    # Select the GPU for processing
    cuda.select_device(gpu_id)
    gpu_batches = []
    for file in files:
        print(f"Processing file on GPU {gpu_id}: {file}")
        gpu_batches.append(pl.read_parquet(file))  # Keep all columns, no filtering
        print(f"File {file} processed on GPU {gpu_id}.")

    # Concatenate all batches for this GPU into a single DataFrame
    return pl.concat(gpu_batches) if gpu_batches else pl.DataFrame()


# Get the number of GPUs available (checked once up front)
available_gpus = cuda.gpus
num_gpus = len(available_gpus)

if num_gpus == 0:
    raise RuntimeError("No GPUs available for processing.")

for k in range(0, 10):

    # Collect all Parquet file paths for this partition
    parquet_files = glob(f"{data_path}/partition_id={k}/part-0.parquet")
    if not parquet_files:
        # Fail with a clear message instead of an opaque missing-column error later on
        raise FileNotFoundError(f"No Parquet file found for partition {k} under {data_path}")

    print(f"Number of GPUs available: {num_gpus}")

    # Split files evenly across available GPUs
    files_per_gpu = len(parquet_files) // num_gpus
    gpu_file_splits = [
        parquet_files[i * files_per_gpu : (i + 1) * files_per_gpu]
        for i in range(num_gpus)
    ]

    # Adjust the last split to include any remaining files
    if len(parquet_files) % num_gpus != 0:
        gpu_file_splits[-1].extend(parquet_files[num_gpus * files_per_gpu :])

    # Process every split and combine the results into a single DataFrame
    final_gpu_dfs = [
        process_files_on_gpu(files, gpu_id)
        for gpu_id, files in enumerate(gpu_file_splits)
    ]
    final_df = pl.concat(final_gpu_dfs)

    # Print summary
    print(f"Final DataFrame shape on GPU: {final_df.shape}")

    # Fill nulls with forward fill, then backward fill for leading gaps
    final_df = final_df.fill_null(strategy="forward").fill_null(strategy="backward")

    # Add a one-step lag of every responder within each (time_id, symbol_id) group.
    # NOTE(review): rows sharing (time_id, symbol_id) presumably differ by date, which
    # would make this the previous date's value - confirm against the row ordering.
    final_df = final_df.with_columns(
        [
            pl.col(responder)
            .shift(1)
            .over(["time_id", "symbol_id"])
            .alias(f"{responder}_lag_1")
            for responder in responder_columns
        ]
    )

    # Ensure the features exist *before* selecting them, so this check can actually fire
    missing_features = [feature for feature in selected_features if feature not in final_df.columns]
    if missing_features:
        raise ValueError(f"Missing features in the DataFrame: {missing_features}")

    # Drop all columns except the selected features and the target
    final_df = final_df.select(selected_features + ["responder_6"])

    # The first row of every lag group has no predecessor; use 0 there
    final_df = final_df.fill_null(0)

    # Convert Polars DataFrame to pandas DataFrame
    df = final_df.to_pandas()

    # Define the feature set and target variable
    X = df[selected_features]
    y = df["responder_6"]

    # Split the data into training and testing sets.
    # NOTE(review): this is a random split of time-ordered rows, so the test score
    # may be optimistic; consider a chronological hold-out.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Reload the model trained on the previous partitions (if any). It is handed to
    # fit() as init_model; without that, fit() discards the loaded trees and starts
    # from scratch, so only the last partition would end up in the saved model.
    if is_first_run:
        init_model = None
        is_first_run = False  # After the first initialization
    else:
        # This file was written by the previous iteration of this loop; never point
        # pickle.load at a file from an untrusted source.
        with open(saved_model_path, "rb") as f:
            init_model = pickle.load(f)

    # Train the model, continuing from the previous partitions' trees when available
    model = LGBMRegressor(device="gpu", random_state=42)
    model.fit(X_train, y_train, init_model=init_model)

    # Save the updated model
    with open(saved_model_path, "wb") as f:
        pickle.dump(model, f)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print model details and evaluation metrics
    print(f"Partition {k} - Mean Squared Error (MSE): {mse}")
    print(f"Partition {k} - R^2 Score: {r2}")

    # Display predictions vs actual values
    results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print(results.head())


Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (1944210, 92)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 1555368, number of used features: 32
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 32 dense feature groups (47.47 MB) transferred to GPU in 0.041805 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.001706
Partition 0 - Mean Squared Error (MSE): 0.7297149257331713
Partition 0 - R^2 Score: 0.03912640465639927
           Actual  Predicted
797011  -0.041428   0.081857
770940   1.261587  -0.003163
760952   0.035796  -0.076898
6234     0.265585  -0.014619
1931465  0.106589   0.227639
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (2804247, 92)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9434
[LightGBM] [Info] Number of data points in the tra

### Hyperparameter optimization

In [3]:
import pickle
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from glob import glob
from numba import cuda

# Initialize variables
saved_model_path = "saved_model_lightgbm_tuned.pkl"  # Path to save the model
is_first_run = True  # True until the grid search on the first partition has run
optimal_params = None  # Filled in by the grid search on the first partition
previous_model = None  # Model fitted on the preceding partition, used to continue training

# Define hyperparameter grid for tuning
param_grid = {
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
}

# Directory containing the Parquet files (one "partition_id=<k>" folder per partition)
data_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"

# Responder columns; each one gets a "<name>_lag_1" companion column below
responder_columns = [f"responder_{i}" for i in range(9)]

# Model inputs: lagged responders plus a hand-picked subset of raw features
selected_features = [
    'responder_3_lag_1', 'responder_8_lag_1', 'responder_7_lag_1', 'responder_4_lag_1', 'responder_5_lag_1',
    'responder_0_lag_1', 'responder_2_lag_1', 'responder_1_lag_1',
    'feature_06', 'feature_60', 'feature_49', 'feature_04', 'feature_07',
    'feature_58', 'feature_59', 'feature_47', 'feature_51', 'feature_36',
    'feature_52', 'feature_68', 'feature_13', 'feature_02', 'feature_05',
    'feature_41', 'feature_01', 'time_id', 'feature_54', 'feature_40',
    'feature_03', 'feature_55', 'feature_08', 'feature_19', 'feature_48',
    'feature_00', 'feature_71', 'feature_66', 'feature_45'
]


def process_files_on_gpu(files, gpu_id):
    """Read Parquet files and stack them into one Polars DataFrame.

    Args:
        files: Parquet file paths to read.
        gpu_id: CUDA device index made current (via numba) before reading.

    Returns:
        The row-wise concatenation of all files, or an empty ``pl.DataFrame``
        when ``files`` is empty.

    Note:
        ``pl.read_parquet`` returns an ordinary in-memory Polars DataFrame, so
        selecting a CUDA device here does not place the data on the GPU.
        NOTE(review): the per-GPU split is kept only to preserve the existing
        log output; confirm whether it can be dropped.
    """
    # Select the GPU for processing
    cuda.select_device(gpu_id)
    gpu_batches = []
    for file in files:
        print(f"Processing file on GPU {gpu_id}: {file}")
        gpu_batches.append(pl.read_parquet(file))  # Keep all columns, no filtering
        print(f"File {file} processed on GPU {gpu_id}.")

    # Concatenate all batches for this GPU into a single DataFrame
    return pl.concat(gpu_batches) if gpu_batches else pl.DataFrame()


# Get the number of GPUs available (checked once up front)
available_gpus = cuda.gpus
num_gpus = len(available_gpus)

if num_gpus == 0:
    raise RuntimeError("No GPUs available for processing.")

for k in range(0, 10):

    # Collect all Parquet file paths for this partition
    parquet_files = glob(f"{data_path}/partition_id={k}/part-0.parquet")
    if not parquet_files:
        # Fail with a clear message instead of an opaque missing-column error later on
        raise FileNotFoundError(f"No Parquet file found for partition {k} under {data_path}")

    print(f"Number of GPUs available: {num_gpus}")

    # Split files evenly across available GPUs
    files_per_gpu = len(parquet_files) // num_gpus
    gpu_file_splits = [
        parquet_files[i * files_per_gpu : (i + 1) * files_per_gpu]
        for i in range(num_gpus)
    ]

    # Adjust the last split to include any remaining files
    if len(parquet_files) % num_gpus != 0:
        gpu_file_splits[-1].extend(parquet_files[num_gpus * files_per_gpu :])

    # Process every split and combine the results into a single DataFrame
    final_gpu_dfs = [
        process_files_on_gpu(files, gpu_id)
        for gpu_id, files in enumerate(gpu_file_splits)
    ]
    final_df = pl.concat(final_gpu_dfs)

    # Print summary
    print(f"Final DataFrame shape on GPU: {final_df.shape}")

    # Fill nulls with forward fill, then backward fill for leading gaps
    final_df = final_df.fill_null(strategy="forward").fill_null(strategy="backward")

    # Add a one-step lag of every responder within each (time_id, symbol_id) group.
    # NOTE(review): rows sharing (time_id, symbol_id) presumably differ by date, which
    # would make this the previous date's value - confirm against the row ordering.
    final_df = final_df.with_columns(
        [
            pl.col(responder)
            .shift(1)
            .over(["time_id", "symbol_id"])
            .alias(f"{responder}_lag_1")
            for responder in responder_columns
        ]
    )

    # Ensure the features exist *before* selecting them, so this check can actually fire
    missing_features = [feature for feature in selected_features if feature not in final_df.columns]
    if missing_features:
        raise ValueError(f"Missing features in the DataFrame: {missing_features}")

    # Drop all columns except the selected features and the target
    final_df = final_df.select(selected_features + ["responder_6"])

    # The first row of every lag group has no predecessor; use 0 there
    final_df = final_df.fill_null(0)

    # Convert Polars DataFrame to pandas DataFrame
    df = final_df.to_pandas()

    # Define the feature set and target variable
    X = df[selected_features]
    y = df["responder_6"]

    # Split the data into training and testing sets.
    # NOTE(review): this is a random split of time-ordered rows, so the test score
    # may be optimistic; consider a chronological hold-out.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Perform hyperparameter tuning only during the first run
    if is_first_run:
        grid_search = GridSearchCV(
            estimator=LGBMRegressor(device="gpu", random_state=42),
            param_grid=param_grid,
            scoring="neg_mean_squared_error",
            cv=3,
            refit=False,  # the final fit happens below; skip the redundant refit
            verbose=1
        )
        grid_search.fit(X_train, y_train)
        optimal_params = grid_search.best_params_
        print(f"Optimal Parameters: {optimal_params}")
        is_first_run = False

    # Always build the model from the tuned parameters. Previously the first
    # partition was fitted with the default-parameter estimator instead.
    model = LGBMRegressor(device="gpu", random_state=42, **optimal_params)

    # Train with the optimal parameters, continuing from the previous partition's
    # trees when there is one, so the saved model reflects every partition seen.
    model.fit(X_train, y_train, init_model=previous_model)
    previous_model = model

    # Save the updated model
    with open(saved_model_path, "wb") as f:
        pickle.dump(model, f)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print model details and evaluation metrics
    print(f"Partition {k} - Mean Squared Error (MSE): {mse}")
    print(f"Partition {k} - R^2 Score: {r2}")

    # Display predictions vs actual values
    results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print(results.head())


Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (1944210, 92)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 1036912, number of used features: 32
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 32 dense feature groups (31.64 MB) transferred to GPU in 0.028478 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.001537
[LightGBM] [Info] This is the GPU train