Because running the complete annual rolling random forest with hyperparameter tuning takes a long time, this submitted notebook reuses the previously tuned optimal hyperparameters for each year to reproduce the results.

In [13]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor

In [14]:
# Parquet file holding the processed feature panel.
DATA_PATH = "data/processed/features1.parquet"

# Year boundaries: lower bound for training rows, then the first and the last
# (inclusive) year that is scored out of sample.
TRAIN_START = 1957
TEST_START = 1987
TEST_END = 2016

# Name of the column the model predicts.
TARGET = "ret_excess_t_plus_1"

# A column counts as a model feature when its name starts with one of these.
FEATURE_PREFIXES = ("c_", "m_", "sic_")

In [15]:
# Fixed hyperparameters per test year: year -> (max_depth, n_estimators).
# The rolling loop unpacks each tuple in that order.
param_map = {
    1987: (3, 100),
    1988: (3, 500),
    1989: (3, 500),
    1990: (3, 500),
    1991: (3, 500),
    1992: (3, 500),
    1993: (3, 500),
    1994: (3, 300),
    1995: (5, 100),
    1996: (7, 500),
    1997: (7, 500),
    1998: (7, 500),
    1999: (7, 300),
    2000: (3, 500),
    2001: (3, 500),
    2002: (3, 500),
    2003: (3, 500),
    2004: (3, 500),
    2005: (3, 500),
    2006: (3, 300),
    2007: (3, 500),
    2008: (3, 300),
    2009: (3, 500),
    2010: (5, 300),
    2011: (5, 100),
    2012: (3, 500),
    2013: (5, 100),
    2014: (5, 100),
    2015: (7, 100),
    2016: (3, 500),
}

In [16]:
def r2_oos(y_true, y_pred):
    """Out-of-sample R² with an uncentered denominator.

    Computes ``1 - sum((y - y_hat)**2) / sum(y**2)`` over the positions where
    both inputs are non-NaN. The denominator is the raw sum of squared
    realized values (the target is not demeaned).

    Parameters
    ----------
    y_true : array-like
        Realized values (list, ndarray or pandas Series).
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_true``.
        Elements are paired by position; pandas index labels are ignored.

    Returns
    -------
    float
        The R² value, or ``np.nan`` when no valid pair remains or when the
        sum of squared realized values is not positive.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Convert BEFORE building the mask: combining two pandas Series with `&`
    # aligns them on index labels, which yields a mask of the wrong length
    # when the indexes differ. float64 also keeps the sums from being
    # accumulated in float32.
    y = np.asarray(y_true, dtype=np.float64).ravel()
    yp = np.asarray(y_pred, dtype=np.float64).ravel()
    if y.shape != yp.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y.size} and {yp.size}"
        )

    valid = ~(np.isnan(y) | np.isnan(yp))
    y, yp = y[valid], yp[valid]
    if y.size == 0:
        return np.nan

    rss = np.sum((y - yp) ** 2)
    tss = np.sum(y ** 2)
    return 1 - rss / tss if tss > 0 else np.nan

In [17]:
print("Loading dataset ...")
df = pd.read_parquet(DATA_PATH)

# Unparseable dates are coerced to NaT; such rows get a NaN year and therefore
# never match any of the year-based masks used later.
df["month"] = pd.to_datetime(df["month"], errors="coerce")
df["year"] = df["month"].dt.year

# Features are selected purely by column-name prefix.
feature_cols = [c for c in df.columns if c.startswith(FEATURE_PREFIXES)]
if not feature_cols:
    # Fail here rather than after the expensive preprocessing / model fit.
    raise ValueError(
        f"No columns in {DATA_PATH} start with any of {FEATURE_PREFIXES}"
    )

# Rows without a target cannot be used for fitting or scoring.
df = df.dropna(subset=[TARGET]).reset_index(drop=True)
# Missing feature values are replaced by 0; float32 halves the memory footprint.
df[feature_cols] = df[feature_cols].fillna(0).astype(np.float32)
df[TARGET] = df[TARGET].astype(np.float32)

print(f"Total features used: {len(feature_cols)}")
print(f" Data range: {df['year'].min()}–{df['year'].max()} | Total rows: {len(df):,}")

Loading dataset ...
Total features used: 176
 Data range: 1957–2021 | Total rows: 4,320,692


In [None]:
# Per-year records (metrics + raw predictions) and per-year feature importances.
results, feat_imps = [], []

for Y in tqdm(range(TEST_START, TEST_END + 1), desc="🔁 Rolling Years"):
    # Years without a stored hyperparameter pair are skipped.
    if Y not in param_map:
        continue

    depth, n_trees = param_map[Y]

    # Training years run from TRAIN_START to Y-13, the 12 years Y-12..Y-1 form
    # the validation window, and year Y is the test year.
    tr_mask = (df["year"] >= TRAIN_START) & (df["year"] <= Y - 13)
    va_mask = (df["year"] >= Y - 12) & (df["year"] <= Y - 1)
    te_mask = (df["year"] == Y)

    # Hyperparameters are taken from param_map, so training and validation
    # rows are pooled for the fit. Build the combined mask once.
    fit_mask = tr_mask | va_mask
    X_trva = df.loc[fit_mask, feature_cols]
    y_trva = df.loc[fit_mask, TARGET]
    X_te = df.loc[te_mask, feature_cols]
    y_te = df.loc[te_mask, TARGET]

    # Nothing to score for this year.
    if len(X_te) == 0:
        continue

    rf = RandomForestRegressor(
        n_estimators=n_trees,
        max_depth=depth,
        max_features="sqrt",
        min_samples_leaf=50,
        bootstrap=True,
        max_samples=0.7,
        n_jobs=-1,
        random_state=42,  # fixed seed so reruns reproduce the same forest
    )
    rf.fit(X_trva, y_trva)
    yhat = rf.predict(X_te)

    feat_imps.append(rf.feature_importances_)
    r2_test = r2_oos(y_te, yhat)

    # Keep the raw targets/predictions so a pooled R² can be computed later.
    results.append({
        "year": Y,
        "depth": depth,
        "trees": n_trees,
        "r2_test": r2_test,
        "y_true": y_te.values,
        "y_pred": yhat,
    })

    print(f"[{Y}] depth={depth}, trees={n_trees}, TestR²={r2_test:.6f}")

🔁 Rolling Years:   3%|██▏                                                              | 1/30 [01:09<33:28, 69.27s/it]

[1987] depth=3, trees=100, TestR²=0.003819


In [None]:
# Without any yearly record the column lookups below would fail with an
# uninformative KeyError, so stop with an explicit message instead.
if not results:
    raise RuntimeError("No yearly results available; run the rolling loop first.")

df_results = pd.DataFrame(results)

# Pooled R²: concatenate every year's targets and predictions and score them
# together, rather than averaging the yearly R² values.
overall_r2 = r2_oos(
    np.concatenate(df_results["y_true"].values),
    np.concatenate(df_results["y_pred"].values)
)
print("\n" + "=" * 60)
print(f"[RandomForest] Overall Out-of-Sample R² = {overall_r2:.6f}")

print("=" * 60)

In [None]:
# --- Figure 1: yearly out-of-sample R² with the pooled value as reference ---
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_results["year"], df_results["r2_test"], marker="o", label="Annual R²")
ax.axhline(overall_r2, color="red", linestyle="--", alpha=0.7, label=f"Overall={overall_r2:.4f}")
ax.set_title("Random Forest — Annual Out-of-Sample R² (Fixed Parameters)")
ax.set_xlabel("Year")
ax.set_ylabel("R²_oos")
ax.legend()
ax.grid(alpha=0.5, linestyle="--")
fig.tight_layout()
plt.show()

# --- Figure 2: feature importances averaged over the yearly fits ---
avg_imp = np.mean(np.stack(feat_imps), axis=0)
top_idx = np.argsort(avg_imp)[::-1][:15]  # 15 largest, in descending order
top_names = np.array(feature_cols)[top_idx]

fig, ax = plt.subplots(figsize=(8, 5))
# Bars are drawn with matplotlib and coloured from the viridis palette:
# passing `palette` to sns.barplot without `hue` is deprecated in recent
# seaborn releases, and this form behaves the same across versions.
ax.barh(
    top_names,
    avg_imp[top_idx],
    color=sns.color_palette("viridis", n_colors=len(top_idx)),
)
ax.invert_yaxis()  # most important feature at the top
ax.set_title("Random Forest — Average Feature Importances (Fixed Parameters)")
ax.set_xlabel("Mean importance across yearly fits")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()