# Step 2 — Simple LightGBM Model
LightGBM을 사용하여 시계열 train/validation split으로 학습하는 기본 모델.

In [None]:
# ============================================================
# STEP 2 — Baseline LightGBM (Leakage-free, Time-based Split)
# ============================================================

import pandas as pd
import numpy as np
import lightgbm as lgb

# Location of the training CSV and the name of the column to predict.
TRAIN_PATH = "/kaggle/input/hull-tactical-market-prediction/train.csv"
TARGET_NAME = "market_forward_excess_returns"

print("=== STEP 2: Simple Baseline LightGBM Model ===")

# ------------------------------------------------------------
# 1. Load & Sort Data
# ------------------------------------------------------------
# Read the whole training table first, then order its rows by
# ascending date_id (the original row index labels are kept).
df = pd.read_csv(TRAIN_PATH)
df = df.sort_values(by="date_id")

# ------------------------------------------------------------
# 2. Leakage-free Feature Selection
#    (columns tied to future returns are NOT used as features)
# ------------------------------------------------------------
# Columns that must never be fed to the model as inputs.
leak_cols = [
    "date_id",
    "forward_returns",   # the realised future return
    "risk_free_rate",
    TARGET_NAME,         # the prediction target itself
]

# Everything else in the table is used as a feature, in file order
# (per the original note: the M*, V*, E*, I*, S*, P*, D*, MOM* families).
is_feature = ~df.columns.isin(leak_cols)
feature_cols = df.columns[is_feature].tolist()

print(f"Total usable features: {len(feature_cols)}")

# ------------------------------------------------------------
# 3. Time-based Train/Validation Split
# ------------------------------------------------------------
# Sorted array of the distinct date_id values present in the data.
dates = np.unique(df["date_id"].to_numpy())

# Hold out the last 180 distinct date_ids for validation; every
# earlier date_id goes to training.
val_days = 180
train_dates, val_dates = dates[:-val_days], dates[-val_days:]

train_df = df[df["date_id"].isin(train_dates)]
val_df = df[df["date_id"].isin(val_dates)]


def _features_and_target(frame):
    """Return (feature columns with NaN replaced by 0, target column)."""
    return frame[feature_cols].fillna(0), frame[TARGET_NAME]


X_tr, y_tr = _features_and_target(train_df)
X_va, y_va = _features_and_target(val_df)

print(f"Train size: {len(X_tr)},  Val size: {len(X_va)}")

# ------------------------------------------------------------
# 4. LightGBM Parameters
# ------------------------------------------------------------
# Configuration dictionary handed to lgb.train(); key order matches
# the original literal.
params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.02,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_data_in_leaf=50,
    verbosity=-1,
)

# ------------------------------------------------------------
# 5. Train Model
# ------------------------------------------------------------
# Wrap the training features and target in a LightGBM Dataset.
dtrain = lgb.Dataset(data=X_tr, label=y_tr)

print("Training LightGBM...")
# Request 1500 boosting rounds; no validation set or early-stopping
# callback is passed to the trainer here.
model = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=1500,
)
print("Training finished.")

# ------------------------------------------------------------
# 6. Validation RMSE
# ------------------------------------------------------------
# Score the held-out rows with the trained model.
pred = model.predict(X_va)

# Root-mean-squared error: square the residuals, average them,
# then take the square root.
squared_error = (pred - y_va).pow(2)
rmse = np.sqrt(squared_error.mean())

print(f"\n=== STEP 2 Validation RMSE: {rmse:.12f} ===")

=== STEP 2: Simple Baseline LightGBM Model ===
Total usable features: 94
Train size: 8841,  Val size: 180
Training LightGBM...
Training finished.

=== STEP 2 Validation RMSE: 0.011852002479 ===
