# Imports & Configuration

In [1]:
# CELL 1: Imports & basic config

import os
from pathlib import Path
from dataclasses import dataclass

import numpy as np
import pandas as pd
import polars as pl

from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

# Kaggle evaluation server
from kaggle_evaluation.default_inference_server import DefaultInferenceServer

# Competition input directory; train.csv and test.csv are read from here,
# and the same path is handed to the local inference gateway.
DATA_PATH = Path("/kaggle/input/hull-tactical-market-prediction/")

## Constants & Target Mapping

In [2]:
# CELL 2: Metric + signal mapping

# Inclusive bounds on a position. ScoreMetric rejects anything outside this
# range and convert_ret_to_signal clips into it by default.
MIN_SIGNAL, MAX_SIGNAL = 0.0, 2.0

# KNOB #1: the slope k in  position = clip(1 + k * ret, 0, 2).
# Author's note: 60.645161... was reported to score 6.367 in an earlier run.
SIGNAL_MULTIPLIER = 60.645161290322584


@dataclass(frozen=True)
class RetToSignalParameters:
    """Immutable settings for mapping a predicted return to a position."""

    # Slope applied to the predicted return before clipping.
    signal_multiplier: float
    # Lower / upper clip bounds; default to the module-level limits.
    min_signal: float = MIN_SIGNAL
    max_signal: float = MAX_SIGNAL


# Shared parameter set used when converting predictions to positions.
ret_signal_params = RetToSignalParameters(SIGNAL_MULTIPLIER)


class ParticipantVisibleError(Exception):
    """Error raised by the metric when a submitted position is out of range."""


def ScoreMetric(solution: pd.DataFrame,
                submission: pd.DataFrame,
                row_id_column_name: str) -> float:
    """
    Volatility-adjusted Sharpe ratio of a position series.

    (Per the original author's note: official competition metric, copied
    from the competition starter.)

    Parameters
    ----------
    solution : DataFrame with "forward_returns" and "risk_free_rate" columns.
        It is copied, so the caller's frame is not modified.
    submission : DataFrame with a "prediction" column; its values are taken
        positionally as the per-row position.
    row_id_column_name : accepted for signature compatibility; not read here.

    Returns
    -------
    float
        Annualised Sharpe of the strategy divided by a volatility penalty and
        a return-shortfall penalty, capped at 1_000_000.

    Raises
    ------
    ParticipantVisibleError
        If any position lies outside [MIN_SIGNAL, MAX_SIGNAL].
    ZeroDivisionError
        If the strategy returns have zero standard deviation (or the frame
        is empty).
    """
    frame = solution.copy()
    frame["position"] = submission["prediction"].values

    # Range checks: upper bound first, then lower bound.
    highest = frame["position"].max()
    if highest > MAX_SIGNAL:
        raise ParticipantVisibleError(
            f'Position of {highest} exceeds maximum of {MAX_SIGNAL}'
        )
    lowest = frame["position"].min()
    if lowest < MIN_SIGNAL:
        raise ParticipantVisibleError(
            f'Position of {lowest} below minimum of {MIN_SIGNAL}'
        )

    position = frame["position"]
    risk_free = frame["risk_free_rate"]
    forward = frame["forward_returns"]

    # Blend of cash (risk-free) and market exposure according to position.
    frame["strategy_returns"] = risk_free * (1.0 - position) + forward * position

    n_periods = len(frame)
    periods_per_year = 252
    annualizer = np.sqrt(periods_per_year)

    def per_period_geometric_mean(excess):
        # Compound the excess returns, then take the n-th root.
        return (1.0 + excess).prod() ** (1.0 / n_periods) - 1.0

    # --- strategy statistics ---
    strategy_mean = per_period_geometric_mean(frame["strategy_returns"] - risk_free)
    strategy_std = frame["strategy_returns"].std()
    if strategy_std == 0:
        raise ZeroDivisionError

    raw_sharpe = strategy_mean / strategy_std * annualizer
    strategy_vol_pct = float(strategy_std * annualizer * 100.0)

    # --- market (benchmark) statistics ---
    market_mean = per_period_geometric_mean(forward - risk_free)
    market_std = forward.std()
    market_vol_pct = float(market_std * annualizer * 100.0)

    # Penalise only volatility beyond 1.2x the market's.
    if market_vol_pct > 0:
        excess_vol = max(0.0, strategy_vol_pct / market_vol_pct - 1.2)
    else:
        excess_vol = 0.0
    vol_penalty = 1.0 + excess_vol

    # Penalise (quadratically) any annualised shortfall versus the market.
    shortfall = max(
        0.0,
        (market_mean - strategy_mean) * 100.0 * periods_per_year,
    )
    return_penalty = 1.0 + (shortfall ** 2) / 100.0

    penalised = raw_sharpe / (vol_penalty * return_penalty)
    return min(float(penalised), 1_000_000.0)


def convert_ret_to_signal(ret_arr: np.ndarray,
                          params: RetToSignalParameters) -> np.ndarray:
    """
    Turn predicted returns into positions.

    Applies the affine map 1 + k * ret (k = params.signal_multiplier) and
    clips the result to [params.min_signal, params.max_signal].
    """
    unclipped = 1.0 + params.signal_multiplier * ret_arr
    return np.clip(unclipped, params.min_signal, params.max_signal)

In [3]:
# CELL 3: Train base models + set stacking weights

# 1) Read the full training history and put it in chronological order
train_df = (
    pd.read_csv(DATA_PATH / "train.csv")
    .sort_values("date_id")
    .reset_index(drop=True)
)

# 2) Recency bias: keep only the most recent LAST_N rows
LAST_N = 3000  # KNOB #2 — you can try 2500 / 2800 / 3200 later
train_df = train_df.iloc[-LAST_N:].copy()

# 3) Engineered features (predict() must compute exactly the same ones)
train_df["U1"] = train_df["I2"] - train_df["I1"]
mean_of_i_features = (train_df["I2"] + train_df["I9"] + train_df["I7"]) / 3.0
train_df["U2"] = train_df["M11"] / mean_of_i_features

# 4) Regression target is the market forward excess return
train_df["target"] = train_df["market_forward_excess_returns"]

# 5) Feature list = every non-object column that is not an id / label column
exclude_cols = [
    "row_id",
    "date_id",
    "forward_returns",
    "risk_free_rate",
    "market_forward_excess_returns",
    "target",
]

feature_cols_eval = [
    name
    for name in train_df.columns
    if train_df[name].dtype != "object" and name not in exclude_cols
]

print("Stacking model: using", len(feature_cols_eval), "features.")

# 6) Per-column training medians: used to impute here and again in predict()
feature_medians = {name: train_df[name].median() for name in feature_cols_eval}
for name, median_value in feature_medians.items():
    train_df[name] = train_df[name].fillna(median_value)

X_full = train_df[feature_cols_eval].to_numpy()
y_full = train_df["target"].to_numpy()

# 7) Three deliberately light base models (keeps run time down)

lgb_model = LGBMRegressor(
    n_estimators=600,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=50,
    reg_lambda=1.0,
    reg_alpha=0.0,
    random_state=42,
    n_jobs=-1,
)

ridge_model = RidgeCV(alphas=np.logspace(-4, 3, 20))

rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=8,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
)

# Fit in the fixed order LGBM -> RidgeCV -> RandomForest, announcing each one.
for label, estimator in (
    ("LGBM", lgb_model),
    ("RidgeCV", ridge_model),
    ("RandomForest", rf_model),
):
    print(f"Fitting {label}...")
    estimator.fit(X_full, y_full)

# 8) Hand-set blend weights (KNOBS #3–#5)
W_LGBM = 0.6
W_RIDGE = 0.25
W_RF   = 0.15

summary_lines = [
    f"Trained 3 base models on {len(train_df)} rows "
    f"with {len(feature_cols_eval)} features.",
    f"Stacking weights: LGBM={W_LGBM}, Ridge={W_RIDGE}, RF={W_RF}",
    f"SIGNAL_MULTIPLIER={SIGNAL_MULTIPLIER:.4f}",
]
print("\n".join(summary_lines))

Stacking model: using 96 features.
Fitting LGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21600
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 96
[LightGBM] [Info] Start training from score 0.000197
Fitting RidgeCV...
Fitting RandomForest...
Trained 3 base models on 3000 rows with 96 features.
Stacking weights: LGBM=0.6, Ridge=0.25, RF=0.15
SIGNAL_MULTIPLIER=60.6452


In [4]:
# CELL 4: predict() for Kaggle + inference server

def predict(test_pl: pl.DataFrame) -> float:
    """
    Return the position (within the configured signal bounds) for one test row.

    Per the original author's note, Kaggle calls this with a single-row
    Polars DataFrame from test.csv. Steps:
      - convert to pandas
      - engineer U1, U2 (same formulas as training)
      - align columns to feature_cols_eval; missing columns start as NaN
      - impute NaN and +/-inf with the stored training medians
      - predict with the 3 base models and blend with the fixed weights
      - map the blended return to a position via convert_ret_to_signal

    Only the first row's prediction is returned.
    """
    # 1) Polars -> pandas (single row)
    df = test_pl.to_pandas()

    # 2) Feature engineering (MUST match Cell 3)
    df["U1"] = df["I2"] - df["I1"]
    df["U2"] = df["M11"] / ((df["I2"] + df["I9"] + df["I7"]) / 3.0)

    # 3) Align to the training feature order. reindex() returns a new frame
    #    (so no assignment into a slice / SettingWithCopyWarning) and creates
    #    any absent feature column as NaN, which step 4 then imputes.
    features = df.reindex(columns=feature_cols_eval)

    # 4) Fill NaNs with the stored training medians, column by column
    features = features.fillna(feature_medians)
    X = features.to_numpy(dtype=np.float64)

    # U2's denominator can be zero, which yields +/-inf. fillna() leaves inf
    # untouched and the scikit-learn estimators reject non-finite input, so
    # fall back to the training median for any remaining non-finite cell.
    non_finite = ~np.isfinite(X)
    if non_finite.any():
        median_row = np.array(
            [feature_medians[c] for c in feature_cols_eval],
            dtype=np.float64,
        )
        X = np.where(non_finite, median_row, X)

    # 5) Base model predictions (first row only)
    pred_lgb   = float(lgb_model.predict(X)[0])
    pred_ridge = float(ridge_model.predict(X)[0])
    pred_rf    = float(rf_model.predict(X)[0])

    # 6) Stacked prediction (weighted blend)
    raw_pred = (
        W_LGBM * pred_lgb +
        W_RIDGE * pred_ridge +
        W_RF * pred_rf
    )

    # 7) Convert to a clipped position signal
    signal = convert_ret_to_signal(np.array([raw_pred]), ret_signal_params)[0]
    return float(signal)


# Optional: small local sanity check (skipped when KAGGLE_IS_COMPETITION_RERUN
# is set, i.e. it won't run on the hidden test).
if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    N_SANITY_ROWS = 10
    test_df = pl.read_csv(DATA_PATH / "test.csv")
    # Never index past the end of a short test file, and report the number of
    # rows actually checked (the old message said 5 while the loop ran 10).
    n_checked = min(N_SANITY_ROWS, len(test_df))
    print(f"Sanity check on first {n_checked} test rows (dev only):")
    for i in range(n_checked):
        row = test_df[i:i + 1]  # one-row frame, the shape predict() expects
        print(int(row["date_id"][0]), predict(row))

# Wrap predict() in Kaggle's inference server.
inference_server = DefaultInferenceServer(predict)

if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    # Not a competition rerun: drive predict() through the local gateway,
    # reading data from DATA_PATH.
    inference_server.run_local_gateway((str(DATA_PATH),))
else:
    # Competition rerun: serve predictions to the evaluation gateway.
    inference_server.serve()

Sanity check on first 5 test rows (dev only):
8980 0.8218593401208992
8981 0.7758620305073602
8982 1.189463020888973
8983 1.241175572446061
8984 0.9270511381648872
8985 1.091022961436742
8986 1.081738011997523
8987 1.0980128641111622
8988 1.2317958228927042
8989 0.979972290698451
