This notebook contains the code to train the random forest model.

In [2]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
import os

# Suppress all warnings for the rest of the session so the training log stays readable.
# NOTE(review): this is a blanket filter — it also hides deprecation notices from the
# libraries imported above; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

First, following the definition in the paper, we construct the out-of-sample R² ($R^2_{oos}$) function to evaluate model performance.

In [3]:
def r2_oos(y_true, y_pred):
    """Out-of-sample R² as defined in Gu, Kelly & Xiu (2020, Eq. 19).

    Computes ``1 - sum((y - y_hat)**2) / sum(y**2)``. The denominator is the
    uncentered sum of squares of the realised values (no demeaning), so the
    implicit benchmark is a forecast of zero.

    Parameters
    ----------
    y_true : array-like
        Realised values (list, ndarray, pandas Series, ...).
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_true``.

    Returns
    -------
    float
        The out-of-sample R², or ``np.nan`` when no pair of values is
        jointly non-NaN or when the sum of squared realised values is not
        positive.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of elements.
    """
    # Convert to plain float64 arrays BEFORE building the mask: pairs are then
    # matched by position (pandas would otherwise align the two inputs on
    # their index labels), and the sums are accumulated in double precision
    # even when the inputs are float32.
    y = np.asarray(y_true, dtype=np.float64).ravel()
    yp = np.asarray(y_pred, dtype=np.float64).ravel()
    if y.shape != yp.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y.size} and {yp.size}"
        )

    # Keep only the observations where both the target and the forecast exist.
    mask = ~(np.isnan(y) | np.isnan(yp))
    y, yp = y[mask], yp[mask]
    if y.size == 0:
        return np.nan

    rss = np.sum((y - yp) ** 2)
    tss = np.sum(y ** 2)
    return 1 - rss / tss if tss > 0 else np.nan

In [None]:
def _build_forest(n_estimators, max_depth, max_features):
    """Create a RandomForestRegressor with the fixed settings shared by the tuning and final fits."""
    return RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_leaf=50,
        bootstrap=True,
        max_samples=0.7,
        n_jobs=-1,
        random_state=42,
    )


def _plot_annual_r2(df_results, overall_r2):
    """Plot the per-year test R² with the pooled overall R² as a horizontal reference line."""
    plt.figure(figsize=(10, 5))
    plt.plot(df_results["year"], df_results["test_r2"], marker="o", label="Test R²")
    plt.axhline(overall_r2, color="red", linestyle="--", alpha=0.7, label=f"Overall={overall_r2:.4f}")
    plt.title("Random Forest — Annual Out-of-Sample R² (Rolling Validation)")
    plt.xlabel("Year")
    plt.ylabel("R²_oos")
    plt.legend()
    plt.grid(alpha=0.5, linestyle="--")
    plt.tight_layout()
    plt.show()


def _plot_top_importances(avg_imp, feature_cols, top_n=15):
    """Draw a horizontal bar chart of the ``top_n`` largest average feature importances."""
    top_idx = np.argsort(avg_imp)[::-1][:top_n]

    plt.figure(figsize=(8, 5))
    sns.barplot(
        x=avg_imp[top_idx],
        y=np.array(feature_cols)[top_idx],
        palette="viridis",
        orient="h"
    )
    plt.title("Random Forest — Average Feature Importances (Impurity-based)")
    plt.tight_layout()
    plt.show()


def random_forest_rolling_validation(
    data_path="data/processed/features.parquet",
    result_path="results/RF_rolling_opt.parquet",
    train_start=1957,
    test_start=1987,
    test_end=2016,
    target="ret_excess_t_plus_1",
    feature_prefixes=("c_", "m_", "sic_"),
    depths=(3, 5, 7),
    n_trees=(100, 300, 500),
    max_features=(3, 5, 13)
):
    """Run a yearly rolling train/validation/test evaluation of a random forest.

    For every test year ``Y`` in ``[test_start, test_end]``:

    1. train on years ``train_start .. Y-13`` and score each hyper-parameter
       combination on the validation years ``Y-12 .. Y-1`` with ``r2_oos``;
    2. refit the best combination on training + validation data;
    3. predict year ``Y`` and record the test R², predictions and feature
       importances.

    Years for which any of the three splits is empty, or for which no
    combination yields a non-NaN validation R², are skipped.

    Parameters
    ----------
    data_path : str
        Parquet file holding a ``month`` column, the target column and the
        feature columns.
    result_path : str
        Parquet file the per-year results are written to. Its parent
        directory is created if it does not exist.
    train_start : int
        First calendar year included in the training window.
    test_start, test_end : int
        Inclusive range of test years.
    target : str
        Name of the target column; rows with a missing target are dropped.
    feature_prefixes : str or iterable of str
        Columns whose name starts with one of these prefixes are used as
        features. Missing feature values are filled with 0.
    depths, n_trees, max_features : iterable
        Grid of ``max_depth``, ``n_estimators`` and ``max_features`` values.

    Returns
    -------
    (pandas.DataFrame, float)
        ``df_results`` with one row per evaluated test year (columns
        ``year``, ``depth``, ``trees``, ``max_features``, ``val_r2``,
        ``test_r2``, ``y_true``, ``y_pred``) and the overall out-of-sample R²
        computed on the pooled test predictions.

    Raises
    ------
    ValueError
        If no column matches ``feature_prefixes`` or if no test year could be
        evaluated.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises, so only create the directory when there is one.
    out_dir = os.path.dirname(result_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.read_parquet(data_path)
    df["month"] = pd.to_datetime(df["month"], errors="coerce")
    df["year"] = df["month"].dt.year

    # str.startswith only accepts a str or a tuple of str; normalise so that
    # lists and a single prefix string work as well.
    if isinstance(feature_prefixes, str):
        prefixes = (feature_prefixes,)
    else:
        prefixes = tuple(feature_prefixes)
    feature_cols = [c for c in df.columns if c.startswith(prefixes)]
    if not feature_cols:
        raise ValueError(f"No feature columns start with any of {prefixes!r}")

    df = df.dropna(subset=[target]).reset_index(drop=True)
    df[feature_cols] = df[feature_cols].fillna(0).astype(np.float32)
    df[target] = df[target].astype(np.float32)

    print(f"✅ Total features used: {len(feature_cols)}")

    # Same iteration order as three nested loops: depth, then trees, then max_features.
    param_grid = [(d, t, m) for d in depths for t in n_trees for m in max_features]

    results, feat_imps = [], []

    for Y in tqdm(range(test_start, test_end + 1), desc="Rolling Years"):
        # Expanding training window, 12-year validation window, one test year.
        tr_mask = (df["year"] >= train_start) & (df["year"] <= Y - 13)
        va_mask = (df["year"] >= Y - 12) & (df["year"] <= Y - 1)
        te_mask = (df["year"] == Y)

        X_tr, y_tr = df.loc[tr_mask, feature_cols], df.loc[tr_mask, target]
        X_va, y_va = df.loc[va_mask, feature_cols], df.loc[va_mask, target]
        X_te, y_te = df.loc[te_mask, feature_cols], df.loc[te_mask, target]

        if len(X_tr) == 0 or len(X_va) == 0 or len(X_te) == 0:
            continue

        # Grid search: keep the combination with the highest validation R².
        best_r2, best_params = -np.inf, None
        for depth, n_tree, mf in param_grid:
            rf = _build_forest(n_tree, depth, mf)
            rf.fit(X_tr, y_tr)
            r2_val = r2_oos(y_va, rf.predict(X_va))
            if not np.isnan(r2_val) and r2_val > best_r2:
                best_r2 = r2_val
                best_params = (depth, n_tree, mf)

        if best_params is None:
            continue

        # Refit the selected configuration on training + validation data.
        depth, n_tree, mf = best_params
        X_trva = pd.concat([X_tr, X_va])
        y_trva = pd.concat([y_tr, y_va])

        final_rf = _build_forest(n_tree, depth, mf)
        final_rf.fit(X_trva, y_trva)

        yhat_test = final_rf.predict(X_te)
        r2_test = r2_oos(y_te, yhat_test)

        feat_imps.append(final_rf.feature_importances_)
        results.append({
            "year": Y,
            "depth": depth,
            "trees": n_tree,
            "max_features": mf,
            "val_r2": best_r2,
            "test_r2": r2_test,
            "y_true": y_te.values,
            "y_pred": yhat_test,
        })

        print(f"[{Y}] depth={depth}, trees={n_tree}, features={mf}, "
              f"ValR²={best_r2:.6f}, TestR²={r2_test:.6f}")

    # Without this guard an empty run fails later with an opaque KeyError on
    # the empty results frame.
    if not results:
        raise ValueError(
            "No test year could be evaluated: every year in "
            f"[{test_start}, {test_end}] had an empty train/validation/test "
            "split or no valid validation R²."
        )

    df_results = pd.DataFrame(results)
    # Pool the predictions of all test years for a single overall R².
    overall_r2 = r2_oos(
        np.concatenate(df_results["y_true"].values),
        np.concatenate(df_results["y_pred"].values)
    )

    print("\n" + "=" * 60)
    print(f"[RandomForest] Overall Out-of-Sample R² (fast) = {overall_r2:.6f}")
    print("=" * 60)

    df_results.to_parquet(result_path, index=False)
    print(f"✅ Saved detailed results to {result_path}")

    _plot_annual_r2(df_results, overall_r2)

    # Average the impurity-based importances of the final model of each year.
    avg_imp = np.mean(np.stack(feat_imps), axis=0)
    _plot_top_importances(avg_imp, feature_cols)

    return df_results, overall_r2


This module implements a **yearly rolling validation framework** for **Random Forest regression**,  
following the empirical asset pricing methodology of **Gu, Kelly, & Xiu (2020)**.  
It evaluates the **out-of-sample R²** of Random Forest models on firm-level and macroeconomic predictors  
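using a rolling training–validation–testing scheme: for each test year Y, the model is trained on data from `train_start` through Y−13, tuned on the validation years Y−12 to Y−1, refit on both, and evaluated on year Y.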

In [None]:
def main():
    """Run the rolling random-forest evaluation with default settings and report the overall R²."""
    _, final_r2 = random_forest_rolling_validation()

    # Completion banner followed by the pooled out-of-sample score.
    print(" Random Forest Rolling Validation Complete!")
    print(f"Final Overall Out-of-Sample R²: {final_r2:.6f}")

# Run the full pipeline only when this code is executed as the entry point,
# not when it is imported from another module.
if __name__ == "__main__":
    main()