In [1]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Handle data
import polars as pl
import numpy as np

# Dataclasses
from dataclasses import dataclass, asdict

# Model and evaluation
from powershap import PowerShap
from xgboost import XGBRegressor

# Progress bar
from tqdm import tqdm

# File system
from pathlib import Path
import os

# import kaggle_evaluation.default_inference_server

In [2]:
# ============ PATHS ============
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
    DATA_PATH: Path = Path('/kaggle/input/hull-tactical-market-prediction/')
else:
    DATA_PATH: Path = Path('./')
    
# ============ RETURNS TO SIGNAL CONFIGS ============
MIN_SIGNAL: float = 0.0                         # Minimum value for the daily signal 
MAX_SIGNAL: float = 2.0                         # Maximum value for the daily signal 
SIGNAL_MULTIPLIER: float = 100.0                # Multiplier of the OLS forward returns predictions to signal
    
# ============ FEATURE AUGMENTATION CONFIGS ============
LAGS = [1, 3, 5, 10, 20]
DIFFS = [1, 3, 5, 10, 20]
ROLL_WINDOWS = [5, 10, 20]
ZSCORE_WINDOW = 20

# ============ ROLLING TRAIN CONFIGS ============
WINDOW = 252
DELTA = 1
H = 1
PS_PERIODS = 20

# ============ MODEL CONFIGS ============
N_ESTIMATORS = 500
LEARNING_RATE = 0.05
MAX_DEPTH = 4
MIN_CHILD_WEIGHT = 5
COLSAMPLE_BYTREE = 0.7

In [3]:
@dataclass
class DatasetOutput:
    X_train : pl.DataFrame 
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    
@dataclass
class XGBRegressorParameters:
    n_estimators: int = N_ESTIMATORS
    learning_rate: float = LEARNING_RATE
    max_depth: int = MAX_DEPTH
    min_child_weight: int = MIN_CHILD_WEIGHT
    colsample_bytree: float = COLSAMPLE_BYTREE

In [4]:
xgb_params = XGBRegressorParameters()

In [5]:
def load_trainset() -> pl.DataFrame:
    """
    Loads and preprocesses the training dataset.

    Returns:
        pl.DataFrame: The preprocessed training DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "train.csv")
        .rename({'forward_returns':'target'})
        .with_columns(
            pl.exclude('date_id').cast(pl.Float64, strict=False)
        )
    )

def load_testset() -> pl.DataFrame:
    """
    Loads and preprocesses the testing dataset.

    Returns:
        pl.DataFrame: The preprocessed testing DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "test.csv")
        .rename({'lagged_forward_returns':'target'})
        .with_columns(
            pl.exclude('date_id').cast(pl.Float64, strict=False)
        )
    )

def create_augmented_features(df: pl.DataFrame) -> pl.DataFrame:
    df = df.drop_nulls()
    
    feat_cols: list[str] = [c for c in df.columns if c not in ['date_id', 'target']]
    is_bin = np.array([c.startswith('D') for c in feat_cols])
    feat_cols = np.array(feat_cols)
    D_cols = feat_cols[is_bin]
    cont_cols = feat_cols[~is_bin]
    
    # ---- helpers ----
    def signed_log_expr(c: str) -> pl.Expr:
        x = pl.col(c)
        return pl.when(x >= 0).then((x.abs() + 1).log()).otherwise(-((x.abs() + 1).log()))

    exprs: list[pl.Expr] = []

    # ========= D* (binary / discrete state) =========
    for c in D_cols:
        # lags
        for l in LAGS:
            exprs.append(pl.col(c).shift(l).alias(f"{c}_lag{l}"))

        # rolling means
        for w in ROLL_WINDOWS:
            exprs.append(pl.col(c).rolling_mean(window_size=w).alias(f"{c}_roll_mean{w}"))

        # state duration (run length of consecutive equal values)
        # run_id increments whenever value changes; within each run, count from 1..run_len
        run_id = (pl.col(c) != pl.col(c).shift(1)).cast(pl.Int8).fill_null(1).cum_sum()
        exprs.append(pl.int_range(0, pl.len()).over(run_id).add(1).alias(f"{c}_runlen"))

    # ========= continuous (E/I/M/P/S/...) =========
    slogs: list[pl.Expr] = [signed_log_expr(c).alias(f"{c}_slog") for c in cont_cols]
    for c in cont_cols:
        # lags
        for l in LAGS:
            exprs.append(pl.col(c).shift(l).alias(f"{c}_lag{l}"))

        # diffs
        for d in DIFFS:
            exprs.append((pl.col(c) - pl.col(c).shift(d)).alias(f"{c}_diff{d}"))

        # rolling mean/std
        for w in ROLL_WINDOWS:
            exprs.append(pl.col(c).rolling_mean(window_size=w).alias(f"{c}_roll_mean{w}"))
            exprs.append(pl.col(c).rolling_std(window_size=w).alias(f"{c}_roll_std{w}"))

        # signed log + diff
        for d in DIFFS:
            exprs.append((pl.col(f"{c}_slog") - pl.col(f"{c}_slog").shift(d)).alias(f"{c}_slog_diff{d}"))

        # z-score (window = 20)
        m = pl.col(c).rolling_mean(window_size=ZSCORE_WINDOW)
        sd = pl.col(c).rolling_std(window_size=ZSCORE_WINDOW) + 1e-12
        exprs.append(((pl.col(c) - m) / sd).alias(f"{c}_zscore{ZSCORE_WINDOW}"))

    return df.with_columns(slogs).with_columns(exprs)
    
def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """
    Joins two dataframes by common columns and concatenates them vertically.

    Args:
        train (pl.DataFrame): The training DataFrame.
        test (pl.DataFrame): The testing DataFrame.

    Returns:
        pl.DataFrame: A single DataFrame with vertically stacked data from common columns.
    """
    common_columns: list[str] = [col for col in train.columns if col in test.columns]
    
    return pl.concat([train.select(common_columns), test.select(common_columns)], how="vertical").group_by('date_id', maintain_order=True).agg(pl.all().first())

def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput: 
    """
    Splits the data into features (X) and target (y), and scales the features.

    Args:
        train (pl.DataFrame): The processed training DataFrame.
        test (pl.DataFrame): The processed testing DataFrame.
        features (list[str]): List of features to used in model. 

    Returns:
        DatasetOutput: A dataclass containing the scaled feature sets, target series, and the fitted scaler.
    """
    X_train = train.drop(['date_id','target']).select(features)
    y_train = train.get_column('target')
    X_test = test.drop(['date_id','target']).select(features)
    y_test = test.get_column('target')
    
    return DatasetOutput(
        X_train = X_train,
        y_train = y_train, 
        X_test = X_test, 
        y_test = y_test,
    )

In [6]:
def convert_ret_to_signal(
    ret_arr: np.ndarray,
) -> np.ndarray:
    """
    Converts raw model predictions (returns) into a trading signal.

    Args:
        ret_arr (np.ndarray): The array of predicted returns.

    Returns:
        np.ndarray: The resulting trading signal, clipped between min and max values.
    """
    return np.clip(
        ret_arr * SIGNAL_MULTIPLIER + 1, MIN_SIGNAL, MAX_SIGNAL
    )

In [7]:
train: pl.DataFrame = load_trainset()
test: pl.DataFrame = load_testset()
df: pl.DataFrame = join_train_test_dataframes(train, test)
df: pl.DataFrame = create_augmented_features(df)
train: pl.DataFrame = df.filter(pl.col('date_id').is_in(train.get_column('date_id')))
test: pl.DataFrame = df.filter(pl.col('date_id').is_in(test.get_column('date_id')))
ALL_FEATURES: list[str] = [col for col in test.columns if col not in ['date_id', 'target']]

dataset: DatasetOutput = split_dataset(train=train, test=test, features=ALL_FEATURES) 

X_train: pl.DataFrame = dataset.X_train
X_test: pl.DataFrame = dataset.X_test
y_train: pl.DataFrame = dataset.y_train
y_test: pl.DataFrame = dataset.y_test

In [8]:
# Total time points
T = X_train.height

# Number of iterations
K = int((T - WINDOW) / PS_PERIODS)

# Selected features
feats = set()

for k in tqdm(range(K)):
    t = k * PS_PERIODS
    
    # Polars slicing
    X_train_sub = X_train.slice(t, WINDOW)
    y_train_sub = y_train.slice(t, WINDOW)
    
    if any(X_train_sub.null_count().sum_horizontal() > 0):
        continue

    selector = PowerShap(
        model = XGBRegressor(**asdict(xgb_params)),
        automatic=True,
        verbose=0,
        show_progress=False
    )

    selector.fit(X_train_sub, y_train_sub, verbose=0)
    
    feats = feats.union(selector.get_feature_names_out())

FEATURES: list[str] = list(feats)
FEATURES

100%|██████████| 90/90 [30:37<00:00, 20.42s/it]


['E19_diff3',
 'M5_zscore20',
 'E19_diff10',
 'I9_diff3',
 'S6_slog_diff3',
 'I7_diff1',
 'V4_slog_diff1',
 'V8_roll_std5',
 'I9_diff1',
 'M12_lag1',
 'S12_zscore20',
 'M10_diff1',
 'I7_slog_diff1',
 'P6_lag20',
 'P4_lag10',
 'M6_diff20',
 'M16_lag20',
 'V3_diff1',
 'P3_diff5',
 'P3_roll_std5',
 'M6_lag1',
 'M13_diff1',
 'M11_diff1',
 'M17_diff10',
 'M4',
 'P6_lag3',
 'M18_roll_std20',
 'M16_diff20',
 'P2_diff20',
 'P3_slog_diff20',
 'V4_roll_std5',
 'M6_diff3',
 'V9_diff1',
 'P4_roll_mean5',
 'P3_diff3',
 'D3_lag10',
 'E19_lag10',
 'V13_lag3',
 'M1',
 'P5_diff5',
 'P4_lag20',
 'P8_slog_diff3',
 'E19_diff1',
 'I8_roll_std10',
 'I4_lag20',
 'P12_diff5',
 'P4_lag3',
 'S9_slog_diff5',
 'M9_diff20',
 'S1_diff1',
 'I9_slog_diff5',
 'M3_lag10',
 'S8_diff1',
 'V5',
 'M3_diff5',
 'E19_slog_diff1',
 'S4_lag20',
 'P3_lag1',
 'S10_diff3',
 'M1_roll_mean5',
 'S9_lag5',
 'P3_diff20',
 'V8_roll_std10',
 'P6_diff1',
 'I9_slog_diff3',
 'I4_slog_diff3']

In [12]:
np.save('selected_features.npy', FEATURES)

In [9]:
X_train = X_train.select(FEATURES)

model = XGBRegressor(**asdict(xgb_params))
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,0.5
,booster,'gbtree'
,callbacks,
,colsample_bylevel,1
,colsample_bynode,1
,colsample_bytree,0.7
,early_stopping_rounds,
,enable_categorical,False
,eval_metric,


In [10]:
def predict(test: pl.DataFrame) -> float:
    test = test.rename({'lagged_forward_returns':'target'})
    df: pl.DataFrame = create_augmented_features(test)
    X_test: pl.DataFrame = df.select(FEATURES)
    raw_pred: float = model.predict(X_test[20:])
    return np.r_[np.ones(20), convert_ret_to_signal(raw_pred)]

In [11]:
# inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     inference_server.serve()
# else:
#     inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))