In [19]:
# ============================================================
# STEP 2 – LightGBM (LGBMRegressor 기반, Kaggle-safe version)
# Hull Tactical Market Prediction
# ============================================================

import os
import numpy as np
import pandas as pd
import polars as pl
from lightgbm import LGBMRegressor
import kaggle_evaluation.default_inference_server

TRAIN_PATH = "/kaggle/input/hull-tactical-market-prediction/train.csv"
TEST_PATH  = "/kaggle/input/hull-tactical-market-prediction/test.csv"
TARGET_NAME = "market_forward_excess_returns"

IS_RERUN = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))

model = None
feature_cols = []
GLOBAL_ALPHA = 0.5
df_train = None
y_full = None

print("Loading training data...")

# ------------------------------------------------------------
# 1. 유틸 함수
# ------------------------------------------------------------

def make_time_folds(df, n_folds=3, val_window=180):
    dates = np.sort(df["date_id"].unique())
    n_dates = len(dates)

    total_val = n_folds * val_window
    if total_val >= n_dates:
        val_window = max(30, n_dates // (n_folds + 1))
        total_val = n_folds * val_window

    start_val = n_dates - total_val
    folds = []

    for k in range(n_folds):
        val_s = start_val + k * val_window
        val_e = val_s + val_window
        val_dates = dates[val_s:val_e]
        tr_dates  = dates[:val_s]
        folds.append((tr_dates, val_dates))

    return folds


def pred_to_allocation(y_pred, ref_series, alpha):
    mean = np.mean(ref_series)
    std  = np.std(ref_series)
    z = (y_pred - mean) / (std + 1e-9)
    w = 1.0 + alpha * z
    return np.clip(w, 0, 2)


def hull_like_metric(df_val, positions):
    fwd = df_val["forward_returns"].values
    rf  = df_val["risk_free_rate"].values
    mkt = df_val["market_forward_excess_returns"].values

    strat = positions * fwd
    excess = strat - rf

    mu_s = np.mean(excess)
    mu_m = np.mean(mkt)
    vol_s = np.std(excess)
    vol_m = np.std(mkt)

    sharpe_s = mu_s / (vol_s + 1e-9)
    sharpe_m = mu_m / (vol_m + 1e-9)

    vol_ratio = vol_s / (vol_m + 1e-9)

    score = sharpe_s
    if vol_ratio > 1.2:
        score -= (vol_ratio - 1.2)
    if mu_s < mu_m:
        score -= (mu_m - mu_s)

    return score, vol_ratio, sharpe_s, sharpe_m, mu_s, mu_m


# ------------------------------------------------------------
# 2. 모델 학습
# ------------------------------------------------------------

if os.path.exists(TRAIN_PATH):
    train_df = pd.read_csv(TRAIN_PATH).sort_values("date_id")
    df_train = train_df.copy()
    y_full = train_df[TARGET_NAME].values

    # test 샘플로 feature 교집합 추출
    sample_test = pd.read_csv(TEST_PATH, nrows=10)
    common_cols = list(set(train_df.columns) & set(sample_test.columns))
    feature_cols = sorted([c for c in common_cols if c not in ["date_id", "is_scored"]])

    print("Num features:", len(feature_cols))

    X_full = train_df[feature_cols]

    # 모델 설정 (sklearn API)
    base_model = LGBMRegressor(
        n_estimators=600,
        learning_rate=0.01,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_samples=50,
        random_state=42
    )

    # -------------------------
    # CV + alpha tuning (local only)
    # -------------------------
    if not IS_RERUN:
        print("Running CV for alpha tuning...")
        folds = make_time_folds(train_df, n_folds=2, val_window=150)
        alpha_grid = [0.3, 0.4, 0.5, 0.6]

        best_alphas = []

        for fold_id, (tr_dates, va_dates) in enumerate(folds):
            print("\nFold", fold_id)

            tr_mask = train_df["date_id"].isin(tr_dates)
            va_mask = train_df["date_id"].isin(va_dates)

            X_tr, y_tr = X_full[tr_mask], y_full[tr_mask]
            X_va, y_va = X_full[va_mask], y_full[va_mask]

            # 학습
            cv_model = LGBMRegressor(
                n_estimators=600,
                learning_rate=0.01,
                num_leaves=64,
                subsample=0.8,
                colsample_bytree=0.8,
                min_child_samples=50,
                random_state=42
            )
            cv_model.fit(X_tr, y_tr)

            pred_va = cv_model.predict(X_va)
            df_val = train_df[va_mask].copy()

            best_score = -np.inf
            best_alpha = 0.5

            for a in alpha_grid:
                alloc = pred_to_allocation(pred_va, y_full, a)
                score, vr, sharpe_s, sharpe_m, mu_s, mu_m = hull_like_metric(df_val, alloc)

                print(f" alpha={a} score={score:.4f} vol_ratio={vr:.3f}")

                if vr <= 1.2 and score > best_score:
                    best_score = score
                    best_alpha = a

            print(" Best alpha:", best_alpha)
            best_alphas.append(best_alpha)

        GLOBAL_ALPHA = float(np.median(best_alphas))
        print("\nSelected GLOBAL_ALPHA:", GLOBAL_ALPHA)

    else:
        GLOBAL_ALPHA = 0.5
        print("Kaggle rerun detected → alpha fixed to 0.5")

    # -------------------------
    # 최종 모델 학습
    # -------------------------
    print("\nTraining final model...")

    final_model = LGBMRegressor(
        n_estimators=600,
        learning_rate=0.01,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_samples=50,
        random_state=42
    )

    final_model.fit(X_full, y_full)
    model = final_model

    print("Final model ready.")

else:
    print("train.csv not found → fallback")
    model = None
    feature_cols = []
    GLOBAL_ALPHA = 0.5


# ------------------------------------------------------------
# 3. predict() — Kaggle 스트리밍 입력 처리 함수
# ------------------------------------------------------------

def predict(test: pl.DataFrame) -> pl.DataFrame:
    test_pd = test.to_pandas()

    if model is None or len(feature_cols) == 0:
        alloc = np.ones(len(test_pd), dtype="float32")
        return pl.DataFrame({"date_id": test["date_id"], "prediction": alloc})

    # feature 매칭
    X_test = pd.DataFrame(index=test_pd.index)
    for f in feature_cols:
        if f in test_pd.columns:
            X_test[f] = test_pd[f]
        else:
            X_test[f] = 0.0

    X_test = X_test.fillna(0.0)

    pred = model.predict(X_test)
    alloc = pred_to_allocation(pred, y_full, GLOBAL_ALPHA).astype("float32")

    return pl.DataFrame({
        "date_id": test["date_id"],
        "prediction": alloc
    })


# ------------------------------------------------------------
# 4. 서버 실행 (수정 금지)
# ------------------------------------------------------------

inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        ("/kaggle/input/hull-tactical-market-prediction/", )
    )

Loading training data...
Num features: 94
Running CV for alpha tuning...

Fold 0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004696 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21576
[LightGBM] [Info] Number of data points in the train set: 8721, number of used features: 94
[LightGBM] [Info] Start training from score 0.000047
 alpha=0.3 score=0.0177 vol_ratio=1.030
 alpha=0.4 score=0.0169 vol_ratio=1.041
 alpha=0.5 score=0.0162 vol_ratio=1.053
 alpha=0.6 score=0.0154 vol_ratio=1.065
 Best alpha: 0.3

Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21575
[LightGBM] [Info] Number of data points in the train set: 8871, number of used features: 94
[LightGBM] [Info] Start training from score 0.000044
 alpha=0.3 score=0.0729 vol_ratio=1.056
 alpha=0.4