In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet
/kaggle/input/jane-street-real-time-market

In [2]:
import pickle
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from glob import glob
from numba import cuda


# Initialize variables
saved_model_path = "saved_model_xgboost.pkl"  # Path to save the model
is_first_run = True  # To track if it's the first partition

for k in range(0, 10):

    # Directory containing the Parquet files
    data_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
    
    # Collect all Parquet file paths
    parquet_files = glob(f"{data_path}/partition_id={k}/part-0.parquet")
    
    # Get the number of GPUs available
    available_gpus = cuda.gpus
    num_gpus = len(available_gpus)
    
    if num_gpus == 0:
        raise RuntimeError("No GPUs available for processing.")
    
    print(f"Number of GPUs available: {num_gpus}")
    
    # Split files evenly across available GPUs
    files_per_gpu = len(parquet_files) // num_gpus
    gpu_file_splits = [
        parquet_files[i * files_per_gpu : (i + 1) * files_per_gpu]
        for i in range(num_gpus)
    ]
    
    # Adjust the last split to include any remaining files
    if len(parquet_files) % num_gpus != 0:
        gpu_file_splits[-1].extend(parquet_files[num_gpus * files_per_gpu :])
    
    # Function to process files on a specific GPU
    def process_files_on_gpu(files, gpu_id):
        # Select the GPU for processing
        cuda.select_device(gpu_id)
        gpu_batches = []
        for file in files:
            print(f"Processing file on GPU {gpu_id}: {file}")
            
            # Read the file into GPU memory using Polars
            gpu_df = pl.read_parquet(file)
            
            # Keep all columns, no filtering
            gpu_batches.append(gpu_df)
            
            print(f"File {file} processed on GPU {gpu_id}.")
    
        # Concatenate all batches for this GPU into a single DataFrame
        if gpu_batches:
            return pl.concat(gpu_batches)
        else:
            return pl.DataFrame()
    
    # Process files on all available GPUs
    final_gpu_dfs = []
    for gpu_id, files in enumerate(gpu_file_splits):
        gpu_df = process_files_on_gpu(files, gpu_id)
        final_gpu_dfs.append(gpu_df)
    
    # Combine all processed batches across GPUs into a single DataFrame
    final_df = pl.concat(final_gpu_dfs)
    
    # Print summary
    print(f"Final DataFrame shape on GPU: {final_df.shape}")
    
    # Fill NaNs with forward fill
    final_df = final_df.fill_null(strategy="forward").fill_null(strategy="backward")
    
    # List of responder columns
    responder_columns = [f"responder_{i}" for i in range(9)]
    
    # Apply the lagging for each responder column
    for responder in responder_columns:
        lagged_column_name = f"{responder}_lag_1"
        final_df = final_df.with_columns(
            pl.col(responder)
            .shift(1)
            .over(["time_id", "symbol_id"])
            .alias(lagged_column_name)
        )
    
    # Define the selected features and lagged responders
    selected_features = [
        'responder_3_lag_1', 'responder_8_lag_1', 'responder_7_lag_1', 'responder_4_lag_1', 'responder_5_lag_1',
        'responder_0_lag_1', 'responder_2_lag_1', 'responder_1_lag_1', 
        'feature_06', 'feature_60', 'feature_49', 'feature_04', 'feature_07', 
        'feature_58', 'feature_59', 'feature_47', 'feature_51', 'feature_36', 
        'feature_52', 'feature_68', 'feature_13', 'feature_02', 'feature_05', 
        'feature_41', 'feature_01', 'time_id', 'feature_54', 'feature_40', 
        'feature_03', 'feature_55', 'feature_08', 'feature_19', 'feature_48', 
        'feature_00', 'feature_71', 'feature_66', 'feature_45'
    ]
    
    # Drop all columns except the selected features and the target
    final_df = final_df.select(selected_features + ["responder_6"])

    final_df = final_df.fill_null(0)
    # Convert Polars DataFrame to pandas DataFrame
    df = final_df.to_pandas()

    # Ensure these features exist in the DataFrame
    if not all(feature in df.columns for feature in selected_features):
        missing_features = [feature for feature in selected_features if feature not in df.columns]
        raise ValueError(f"Missing features in the DataFrame: {missing_features}")
    
    # Define the feature set and target variable
    X = df[selected_features]
    y = df["responder_6"]
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Load the model if not the first run, else create a new one
    if not is_first_run:
        with open(saved_model_path, "rb") as f:
            model = pickle.load(f)
    else:
        model = XGBRegressor(tree_method="gpu_hist", predictor="gpu_predictor", random_state=42)
        is_first_run = False  # After the first initialization
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Save the updated model
    with open(saved_model_path, "wb") as f:
        pickle.dump(model, f)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print model details and evaluation metrics
    print(f"Partition {k} - Mean Squared Error (MSE): {mse}")
    print(f"Partition {k} - R^2 Score: {r2}")
    
    # Display predictions vs actual values
    results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print(results.head())


Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (1944210, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Partition 0 - Mean Squared Error (MSE): 0.7062031030654907
Partition 0 - R^2 Score: 0.07008616361244224
           Actual  Predicted
797011  -0.041428   0.098133
770940   1.261587   0.050287
760952   0.035796  -0.094505
6234     0.265585  -0.054859
1931465  0.106589   0.266660
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (2804247, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 1 - Mean Squared Error (MSE): 0.768883466720581
Partition 1 - R^2 Score: 0.05308922019924711
           Actual  Predicted
2782573 -1.683401  -0.119252
130953   0.083945  -0.018238
147060   0.808741   0.208013
2127284 -0.118344   0.028621
1642514  1.901179   0.085337
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (3036873, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 2 - Mean Squared Error (MSE): 0.8396878838539124
Partition 2 - R^2 Score: 0.04995824815912331
           Actual  Predicted
2217279 -0.625125  -0.003195
1030393 -0.124242  -0.337703
897674   0.672475  -0.034500
898751  -3.727389   0.010760
1069276  0.230122  -0.079127
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (4016784, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 3 - Mean Squared Error (MSE): 0.9731647968292236
Partition 3 - R^2 Score: 0.044079945155716116
           Actual  Predicted
1690287 -0.002241   0.035366
2682959  2.004758  -0.003419
1873672 -0.228921   0.075045
3263262 -0.875409  -0.041136
2280224  1.002869  -0.018047
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (5022952, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 4 - Mean Squared Error (MSE): 0.8304966688156128
Partition 4 - R^2 Score: 0.06827565530697377
           Actual  Predicted
4146102 -0.087424   0.005858
607231   0.118195  -0.020518
3550372  0.399575  -0.054012
2043701 -0.081758   0.003472
2678921  1.206762   0.100608
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (5348200, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 5 - Mean Squared Error (MSE): 0.7729962468147278
Partition 5 - R^2 Score: 0.04915802326504459
           Actual  Predicted
2609585 -0.064984   0.095745
2223243  0.067053  -0.025598
410356   0.328395   0.088327
2371339 -0.308415  -0.091437
3959117  0.025044   0.043233
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (6203912, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 6 - Mean Squared Error (MSE): 0.7507171630859375
Partition 6 - R^2 Score: 0.040078074568486
           Actual  Predicted
3440608  0.004181  -0.053488
4719333 -0.223833   0.076048
5603015  3.837440  -0.331245
2729380 -0.256071   0.004791
63731    0.696696  -0.027104
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=7/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=7/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (6335560, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 7 - Mean Squared Error (MSE): 0.6597557663917542
Partition 7 - R^2 Score: 0.041148732026732415
           Actual  Predicted
3568581 -0.541344   0.006169
5975732  1.809481   0.073574
6051193 -0.032710   0.039627
402213   0.763295  -0.012490
3302697 -0.035247  -0.111918
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (6140024, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 8 - Mean Squared Error (MSE): 0.7141674160957336
Partition 8 - R^2 Score: 0.051412808757244854
           Actual  Predicted
3230214  0.594553  -0.098681
5775295 -0.357634   0.009935
1462816 -0.667630   0.053848
3629140  0.099862  -0.053964
1812052 -0.071230  -0.056088
Number of GPUs available: 2
Processing file on GPU 1: /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=9/part-0.parquet
File /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=9/part-0.parquet processed on GPU 1.
Final DataFrame shape on GPU: (6274576, 92)



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Partition 9 - Mean Squared Error (MSE): 0.639310896396637
Partition 9 - R^2 Score: 0.03630109919430746
           Actual  Predicted
4746181 -2.192561   0.148878
4976500 -1.429121   0.348629
3337130  1.722727   0.037951
1452878 -0.442353  -0.027031
1525985  0.397046  -0.144801
