This notebook contains the code used to train the gradient boosted regression tree (GBRT) model.

In [1]:
import warnings
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import psutil
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor

warnings.filterwarnings("ignore")

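Define the out-of-sample $R^2$ function ($R^2_{oos}$) used to evaluate model performance, following the definition in the paper: $R^2_{oos} = 1 - \sum (y - \hat{y})^2 / \sum y^2$. Note that the denominator is the sum of squared returns without demeaning.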

In [2]:
def r2_oos(y_true, y_pred):
    """Out-of-sample R² with a non-demeaned denominator.

    Computes ``1 - sum((y - y_hat)**2) / sum(y**2)`` over the positions where
    both the realised value and the prediction are not NaN.

    Parameters
    ----------
    y_true : array-like
        Realised values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        The R² value, or ``np.nan`` when no valid pair remains or when the
        sum of squared realised values is not strictly positive.
    """
    # A pair is usable only if neither side is NaN.
    usable = ~(np.isnan(y_true) | np.isnan(y_pred))
    actual = np.asarray(y_true)[usable]
    predicted = np.asarray(y_pred)[usable]

    if len(actual) == 0:
        return np.nan

    residual_ss = np.sum((actual - predicted) ** 2)
    total_ss = np.sum(actual ** 2)
    if not total_ss > 0:
        return np.nan
    return 1 - residual_ss / total_ss

Because this model consumes a lot of memory during runtime, a memory detection function has been added.

In [3]:
def low_memory_mode(threshold_gb=8):
    """Print the currently available RAM and report whether it is scarce.

    Parameters
    ----------
    threshold_gb : float, default 8
        Cut-off in GiB (1024**3 bytes).

    Returns
    -------
    bool
        True when the available memory reported by ``psutil`` is strictly
        below ``threshold_gb``.
    """
    bytes_per_gb = 1024**3
    available_gb = psutil.virtual_memory().available / bytes_per_gb
    print(f"Available memory: {available_gb:.2f} GB")
    is_low = available_gb < threshold_gb
    return is_low

gbrt_rolling_validation() implements an annual rolling validation framework for gradient boosting models following the methodology in Gu, Kelly & Xiu (2020).
It automatically detects system memory and, if available memory is below a threshold (default 8 GB), switches to HistGradientBoostingRegressor for efficient low-memory computation.

This function is optimized for large-scale financial panel data (e.g., monthly firm characteristics and macro variables) and outputs both performance metrics and feature importance visualizations.

In [4]:
def gbrt_rolling_validation(
    data_path="data/processed/features.parquet",
    result_dir="results",
    train_start=1957,
    test_start=1987,
    test_end=2016,
    target="ret_excess_t_plus_1",
    feature_prefixes=("c_", "m_", "sic_"),
    depths=(2, 3, 5),
    learning_rates=(0.05, 0.1),
    n_trees=(100, 300, 500),
    max_features=50,
    max_samples=200000,
    low_mem_threshold=8,
    random_state=42,
):
    """Annual rolling-window validation of a gradient boosting regressor.

    For every test year ``Y`` in ``[test_start, test_end]`` the data is split by
    calendar year into

    * training:   ``train_start`` .. ``Y - 13``
    * validation: ``Y - 12`` .. ``Y - 1``
    * test:       ``Y``

    Every (depth, learning rate, number of trees) combination is fitted on the
    (optionally subsampled) training set and scored on the validation set with
    ``r2_oos``. The best combination is refitted on training + validation and
    evaluated on the test year.

    Parameters
    ----------
    data_path : str
        Parquet file containing a ``month`` column, the ``target`` column and
        the feature columns.
    result_dir : str
        Directory (created if missing) receiving the two result parquet files.
    train_start, test_start, test_end : int
        First training year and first/last test year (inclusive).
    target : str
        Name of the column to predict; rows where it is missing are dropped.
    feature_prefixes : str or iterable of str
        Columns whose name starts with any of these prefixes are used as
        features. Missing feature values are filled with 0.
    depths, learning_rates, n_trees : iterable
        Hyper-parameter grid.
    max_features : int, float, str or None
        Forwarded to ``GradientBoostingRegressor`` (ignored by the histogram
        variant). An integer is capped at the number of selected features.
    max_samples : int
        Upper bound on the number of training rows; larger training sets are
        subsampled without replacement.
    low_mem_threshold : float
        Passed to ``low_memory_mode``; when available memory is below this
        many GB, ``HistGradientBoostingRegressor`` is used instead.
    random_state : int or None, default 42
        Seed for both the training-row subsampling and the estimators, so that
        repeated runs produce the same results.

    Returns
    -------
    (pandas.DataFrame, float)
        One row per evaluated test year (chosen hyper-parameters, validation /
        train / test R² and the raw test targets and predictions), plus the
        pooled out-of-sample R² over all test years. If no year could be
        evaluated, an empty frame and ``np.nan`` are returned.

    Raises
    ------
    ValueError
        If no column matches ``feature_prefixes``.
    """
    os.makedirs(result_dir, exist_ok=True)
    result_path = os.path.join(result_dir, "GBRT_rolling_lowmem.parquet")
    param_path = os.path.join(result_dir, "GBRT_best_params_lowmem.parquet")

    use_hist = low_memory_mode(threshold_gb=low_mem_threshold)

    df = pd.read_parquet(data_path)
    df["month"] = pd.to_datetime(df["month"], errors="coerce")
    df["year"] = df["month"].dt.year

    # str.startswith only accepts a str or a tuple, so coerce lists and the like.
    prefixes = feature_prefixes if isinstance(feature_prefixes, str) else tuple(feature_prefixes)
    feature_cols = [c for c in df.columns if c.startswith(prefixes)]
    if not feature_cols:
        raise ValueError(f"No feature columns start with any of {prefixes!r}")

    df = df.dropna(subset=[target]).reset_index(drop=True)
    df[feature_cols] = df[feature_cols].fillna(0).astype(np.float32)
    df[target] = df[target].astype(np.float32)

    print(f"✅ Total features used: {len(feature_cols)}")

    # Older scikit-learn releases reject an integer max_features larger than
    # the number of features, so cap it; other kinds are passed through.
    if isinstance(max_features, (int, np.integer)) and not isinstance(max_features, bool):
        gbr_max_features = min(int(max_features), len(feature_cols))
    else:
        gbr_max_features = max_features

    # Dedicated, seeded generator: the subsample no longer depends on the
    # global NumPy random state and is reproducible across runs.
    rng = np.random.default_rng(random_state)

    summary_cols = ["year", "depth", "lr", "trees", "val_r2", "train_r2", "test_r2"]
    results, feat_imps = [], []

    for Y in tqdm(range(test_start, test_end + 1), desc="Rolling Years"):
        tr_mask = (df["year"] >= train_start) & (df["year"] <= Y - 13)
        va_mask = (df["year"] >= Y - 12) & (df["year"] <= Y - 1)
        te_mask = (df["year"] == Y)

        X_tr, y_tr = df.loc[tr_mask, feature_cols], df.loc[tr_mask, target]
        X_va, y_va = df.loc[va_mask, feature_cols], df.loc[va_mask, target]
        X_te, y_te = df.loc[te_mask, feature_cols], df.loc[te_mask, target]

        if len(X_tr) == 0 or len(X_va) == 0 or len(X_te) == 0:
            continue

        if len(X_tr) > max_samples:
            idx = rng.choice(len(X_tr), size=max_samples, replace=False)
            X_tr, y_tr = X_tr.iloc[idx], y_tr.iloc[idx]

        # Cast once per fold rather than once per grid point; the casts do not
        # depend on the hyper-parameters.
        X_tr, y_tr = X_tr.astype(np.float32), y_tr.astype(np.float32)
        X_va = X_va.astype(np.float32)
        X_te = X_te.astype(np.float32)

        best_r2, best_params = -np.inf, None

        for depth in depths:
            for lr in learning_rates:
                for n_est in n_trees:
                    if use_hist:
                        model = HistGradientBoostingRegressor(
                            max_depth=depth,
                            learning_rate=lr,
                            max_iter=n_est,
                            random_state=random_state,
                        )
                    else:
                        model = GradientBoostingRegressor(
                            n_estimators=n_est,
                            learning_rate=lr,
                            max_depth=depth,
                            max_features=gbr_max_features,
                            subsample=0.8,
                            random_state=random_state,
                        )

                    model.fit(X_tr, y_tr)
                    yhat_val = model.predict(X_va)
                    r2_val = r2_oos(y_va, yhat_val)

                    if not np.isnan(r2_val) and r2_val > best_r2:
                        best_r2 = r2_val
                        best_params = (depth, lr, n_est, model)

        if best_params is None:
            # Every candidate produced a NaN validation score.
            continue

        depth, lr, n_est, model_final = best_params

        # Refit the winning configuration on training + validation data.
        X_trva = pd.concat([X_tr, X_va])
        y_trva = pd.concat([y_tr, y_va]).astype(np.float32)

        model_final.fit(X_trva, y_trva)
        yhat_train = model_final.predict(X_trva)
        yhat_test = model_final.predict(X_te)

        r2_train = r2_oos(y_trva, yhat_train)
        r2_test = r2_oos(y_te, yhat_test)

        # The histogram-based estimator has no feature_importances_; record
        # zeros so the per-year arrays still stack.
        if hasattr(model_final, "feature_importances_"):
            feat_imp = model_final.feature_importances_
        else:
            feat_imp = np.zeros(len(feature_cols))
        feat_imps.append(feat_imp)

        results.append({
            "year": Y,
            "depth": depth,
            "lr": lr,
            "trees": n_est,
            "val_r2": best_r2,
            "train_r2": r2_train,
            "test_r2": r2_test,
            "y_true": y_te.values,
            "y_pred": yhat_test,
        })

        print(f"[{Y}] depth={depth}, lr={lr}, trees={n_est}, "
              f"ValR²={best_r2:.6f}, TrainR²={r2_train:.6f}, TestR²={r2_test:.6f}")

    if not results:
        # Nothing to pool, save or plot; avoid the KeyError / concatenate
        # failure that an empty result list would otherwise trigger.
        print("⚠️ No test year could be evaluated; check the year range and the input data.")
        return pd.DataFrame(columns=summary_cols + ["y_true", "y_pred"]), np.nan

    df_results = pd.DataFrame(results)
    overall_r2 = r2_oos(
        np.concatenate(df_results["y_true"].values),
        np.concatenate(df_results["y_pred"].values)
    )

    print("\n" + "=" * 60)
    print(f"[GBRT-LowMem] Overall Out-of-Sample R² = {overall_r2:.6f}")
    print("=" * 60)

    df_results.to_parquet(result_path, index=False)
    df_results[summary_cols].to_parquet(param_path, index=False)

    plt.figure(figsize=(10, 5))
    plt.plot(df_results["year"], df_results["test_r2"], marker="o", label="Test R²")
    plt.plot(df_results["year"], df_results["train_r2"], marker="x", alpha=0.6, label="Train R²")
    plt.axhline(overall_r2, color="red", linestyle="--", alpha=0.7, label=f"Overall={overall_r2:.4f}")
    plt.title("GBRT (Low-Memory) — Annual Out-of-Sample R²")
    plt.xlabel("Year")
    plt.ylabel("R²_oos")
    plt.legend()
    plt.grid(alpha=0.5, linestyle="--")
    plt.tight_layout()
    plt.show()

    avg_imp = np.mean(np.stack(feat_imps), axis=0)
    if not np.any(avg_imp):
        # All-zero importances (e.g. the histogram estimator was used): a bar
        # chart of zeros would only show arbitrary feature names.
        print("Feature importances are not available for this estimator; skipping the importance plot.")
    else:
        top_idx = np.argsort(avg_imp)[::-1][:15]

        plt.figure(figsize=(8, 5))
        sns.barplot(
            x=avg_imp[top_idx],
            y=np.array(feature_cols)[top_idx],
            palette="viridis",
            orient="h"
        )
        plt.title("GBRT — Average Feature Importances (Low-Memory Mode)")
        plt.tight_layout()
        plt.show()

    return df_results, overall_r2

Build the main function and run it.