In [1]:
import polars as pl
import pandas as pd
import numpy as np
import os
import joblib
import warnings; warnings.filterwarnings(action='ignore')
import time 

from gc import collect
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping, log_evaluation 

from model_info import get_model, fit_model

In [2]:
TRAINING = True
LGB = True # Training LGB model, switch to pandas

TARGET = "responder_6"
WEIGHT = 'weight'

FEATURES = [f"feature_{i:02d}" for i in range(79)]
FEATURES.append('symbol_id')

RANDOM_STATE = 42
N_FOLDS = 5

In [3]:
def load_data(file_path): 
    id_col = pl.int_range(pl.len(), dtype=pl.UInt32).alias("id") # Generate an id column
    all_cols = pl.all() # Select all columns

    # Read the parquet file and select the specified columns
    data = pl.scan_parquet(file_path).select(id_col, all_cols)
    
    all_col_names = data.collect_schema().names()
    
    # Cols to not look for when classifying train and target column names
    cols_of_disinterest = ("weight", "id", "date_id", "time_id", "partition_id")
    target_columns, selected_columns = [], []

    # Factory for loop to classify train and target column names
    for col in all_col_names: 
        if col.startswith("responder"):
            target_columns.append(col)

        elif not col.startswith(cols_of_disinterest):
            selected_columns.append(col)
            
    data = data.collect()
    
    return data

In [4]:
if TRAINING: 
    file_path = "data/train.parquet"

    raw_data = load_data(file_path)

    dates_to_skip = 500 # Too many missing stocks in the early days
    num_test_dates = 100

    # Filter the DataFrame to include only dates greater than or equal to dates_to_skip
    raw_data = raw_data.filter(pl.col('date_id') >= dates_to_skip)
            
    # Get unique dates from the DataFrame
    dates = raw_data['date_id'].unique()

    # Define validation dates as the last `num_test_dates` dates
    test_dates = dates[-num_test_dates:]

    # Define training dates as all dates except the last `num_test_dates` dates
    train_dates = dates[:-num_test_dates]
    
    # Prepare validation data for training
    test_data = raw_data.filter(pl.col('date_id').is_in(test_dates))

    X_test = test_data[FEATURES]
    y_test = test_data[TARGET]
    w_test = test_data[WEIGHT]
    
    # Training LGB model, must use numpy
    if LGB: 
        X_test = test_data[FEATURES].to_numpy()
        y_test = test_data[TARGET].to_numpy()
        w_test = test_data[WEIGHT].to_numpy()

In [5]:
model_path = 'models/lgb_02'
models, cv_scores = [], []

def train(model, model_name):
    # Not training, load `model_name` instead
    if not TRAINING: 
        models.append(joblib.load(f'{model_path}/{model_name}_{i}.model'))
        
        return 
    
    start_time = time.time()

    # Select dates for training based on the fold number
    selected_dates = [date for ii, date in enumerate(train_dates) if ii % N_FOLDS != i]
    
    train_data = raw_data.filter(pl.col('date_id').is_in(train_dates))
    
    X_train = train_data[FEATURES]
    y_train = train_data[TARGET]
    w_train = train_data[WEIGHT]
    
    if model_name == 'xgb':
        # Train XGBoost model with verbose logging
        fit_model(model_name, model, X_test, y_test, w_train, w_test)
        
        cv_score = model.best_score
    else: 
        # LGBM is incompatible with polars
        X_train = X_train.to_numpy()
        y_train = y_train.to_numpy()
        w_train = w_train.to_numpy()
    
        # Train LightGBM model with early stopping and evaluation logging
        model.fit(X_train, y_train, w_train,  
                  eval_metric=[r2_lgb],
                  eval_set=[(X_test, y_test, w_test)], 
                  callbacks=[
                      early_stopping(30), 
                      log_evaluation(50)
                  ])
        
        cv_score = model.best_score_['valid_0'][r2_lgb.__name__]
    
    end_time = time.time()
    time_elapsed = end_time - start_time 
    
    print(f"FOLD {i} COMPLETE, TIME: {time_elapsed:.3f}")
    
    # Append the trained model to the list
    models.append(model)
    
    del X_train, y_train, w_train
    collect()

    # Save the trained model to a file
    joblib.dump(model, f'{model_path}/{model_name}_{i}.model')
    
    return cv_score

# Custom R2 metric for XGBoost
def r2_xgb(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    
    return -r2 # Must be negative for early stopping to work

# Custom R2 metric for LightGBM
def r2_lgb(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    
    return 'r2', r2, True

In [6]:
model_name = 'lgb'
model = get_model(model_name)

for i in range(1, N_FOLDS+1):
    cv_score = train(model, model_name)
    cv_scores.append(cv_score)
    
mean_cv_score = sum(cv_scores) / len(cv_scores)
print(f"Mean CV Score: {mean_cv_score}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.526039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19482
[LightGBM] [Info] Number of data points in the train set: 35861029, number of used features: 80
[LightGBM] [Info] Start training from score -0.002140
Training until validation scores don't improve for 30 rounds
[50]	valid_0's l2: 0.645387	valid_0's r2: 0.00454162
[100]	valid_0's l2: 0.644597	valid_0's r2: 0.00575897
[150]	valid_0's l2: 0.644253	valid_0's r2: 0.00629026
[200]	valid_0's l2: 0.644059	valid_0's r2: 0.00658946
[250]	valid_0's l2: 0.643921	valid_0's r2: 0.00680253
[300]	valid_0's l2: 0.643844	valid_0's r2: 0.006921
[350]	valid_0's l2: 0.643767	valid_0's r2: 0.00704041
[400]	valid_0's l2: 0.643752	valid_0's r2: 0.0070622
Early stopping, best iteration is:
[385]	valid_0's l2: 0.643724	valid_0's r2: 0.00710665


KeyError: 'r2_lgb'