In [1]:
import polars as pl
import pandas as pd
import numpy as np
import os
import joblib
import warnings; warnings.filterwarnings(action='ignore')

from gc import collect
from xgboost import XGBRegressor

In [2]:
TRAINING = True

TARGET = "responder_6"
WEIGHT = 'weight'
FEATURES = [f"feature_{i:02d}" for i in range(79)]

RANDOM_STATE = 42
N_FOLDS = 5

In [3]:
def load_data(file_path): 
    id_col = pl.int_range(pl.len(), dtype=pl.UInt32).alias("id") # Generate an id column
    all_cols = pl.all() # Select all columns

    # Read the parquet file and select the specified columns
    data = pl.scan_parquet(file_path).select(id_col, all_cols)
    
    all_col_names = data.collect_schema().names()
    
    # Cols to not look for when classifying train and target column names
    cols_of_disinterest = ("weight", "id", "date_id", "time_id", "partition_id")
    target_columns, selected_columns = [], []

    # Factory for loop to classify train and target column names
    for col in all_col_names: 
        if col.startswith("responder"):
            target_columns.append(col)

        elif not col.startswith(cols_of_disinterest):
            selected_columns.append(col)
            
    data = data.collect()
    
    return data

In [6]:
if TRAINING: 
    file_path = "Kaggle-Mastery/JaneStreet/data/train.parquet"

    raw_data = load_data(file_path)

    dates_to_skip = 500 # Too many missing stocks in the early days
    num_test_dates = 100

    # Filter the DataFrame to include only dates greater than or equal to dates_to_skip
    raw_data = raw_data.filter(pl.col('date_id') >= dates_to_skip)

    # Get unique dates from the DataFrame
    dates = raw_data['date_id'].unique()

    # Define validation dates as the last `num_test_dates` dates
    test_dates = dates[-num_test_dates:]

    # Define training dates as all dates except the last `num_test_dates` dates
    train_dates = dates[:-num_test_dates]

    raw_data.head(3)

FileNotFoundError: No such file or directory (os error 2): ../data/train.parquet

This error occurred with the following context stack:
	[1] 'parquet scan'
	[2] 'select'


In [6]:
# Prepare validation data for training
if TRAINING: 
    test_data = raw_data.filter(pl.col('date_id').is_in(test_dates))

    X_test = test_data[FEATURES]
    y_test = test_data[TARGET]
    w_test = test_data[WEIGHT]

In [7]:
model_path = 'Kaggle-Mastery/JaneStreet/models/xgb_01/'
models = []

def train(model, model_name=None):
    # Not training, load `model_name` instead
    if not TRAINING: 
        models.append(joblib.load(f'{model_path}/{model_name}_{i}.model'))
        
        return 
    
    # Select dates for training based on the fold number
    selected_dates = [date for ii, date in enumerate(train_dates) if ii % N_FOLDS != i]
    
    train_data = raw_data.filter(pl.col('date_id').is_in(train_dates))
    
    X_train = train_data[FEATURES]
    y_train = train_data[TARGET]
    w_train = train_data[WEIGHT]
        
    # Train XGBoost model with verbose logging
    model.fit(X_train, y_train, sample_weight=w_train, 
              eval_set=[(X_test, y_test)], 
              sample_weight_eval_set=[w_test], 
              verbose=50)
    
    # Append the trained model to the list
    models.append(model)
    
    del X_train, y_train, w_train
    collect()

    # Save the trained model to a file
    joblib.dump(model, f'models/{model_name}_{i}.model')
    
    return 

# Custom R2 metric for XGBoost
def r2_xgb(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    
    return -r2 # Must be negative for early stopping to work

In [8]:
xgb_params = {'n_estimators': 2000,
              'learning_rate': 0.03, 
              'max_depth': 6, 
              'tree_method': 'hist', 
              'objective': 'reg:squarederror',
              'early_stopping_rounds': 30,
              'eval_metric': r2_xgb,
              'disable_default_eval_metric': True,
              'device': 'cuda'}

xgb = XGBRegressor(**xgb_params)

for i in range(N_FOLDS):
    train(xgb)

[0]	validation_0-r2_xgb:-0.00073
[50]	validation_0-r2_xgb:-0.00603
[100]	validation_0-r2_xgb:-0.00651
[150]	validation_0-r2_xgb:-0.00667
[200]	validation_0-r2_xgb:-0.00638
[249]	validation_0-r2_xgb:-0.00583
[0]	validation_0-r2_xgb:-0.00073
[50]	validation_0-r2_xgb:-0.00603
[100]	validation_0-r2_xgb:-0.00651
[150]	validation_0-r2_xgb:-0.00667
[200]	validation_0-r2_xgb:-0.00638
[250]	validation_0-r2_xgb:-0.00582
[0]	validation_0-r2_xgb:-0.00073
[50]	validation_0-r2_xgb:-0.00603
[100]	validation_0-r2_xgb:-0.00651
[150]	validation_0-r2_xgb:-0.00667
[200]	validation_0-r2_xgb:-0.00638
[250]	validation_0-r2_xgb:-0.00582
[0]	validation_0-r2_xgb:-0.00073
[50]	validation_0-r2_xgb:-0.00603
[100]	validation_0-r2_xgb:-0.00651
[150]	validation_0-r2_xgb:-0.00667
[200]	validation_0-r2_xgb:-0.00638
[249]	validation_0-r2_xgb:-0.00583
[0]	validation_0-r2_xgb:-0.00073
[50]	validation_0-r2_xgb:-0.00603
[100]	validation_0-r2_xgb:-0.00651
[150]	validation_0-r2_xgb:-0.00667
[200]	validation_0-r2_xgb:-0.00638
