In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor

DATA_PATH = "features.parquet"
PARAM_PATH = "GBRT_best_params_lowmem.parquet"
RESULT_PATH = "GBRT_refit_from_params.parquet"

TRAIN_START, TEST_START, TEST_END = 1957, 1987, 2016
TARGET = "ret_excess_t_plus_1"
FEATURE_PREFIXES = ("c_", "m_", "sic_")

USE_HIST = True  


def r2_oos(y_true, y_pred):
    mask = (~np.isnan(y_true)) & (~np.isnan(y_pred))
    y, yp = np.asarray(y_true)[mask], np.asarray(y_pred)[mask]
    if len(y) == 0:
        return np.nan
    rss, tss = np.sum((y - yp) ** 2), np.sum(y ** 2)
    return 1 - rss / tss if tss > 0 else np.nan



print(" Loading feature data...")
df = pd.read_parquet(DATA_PATH)
params_df = pd.read_parquet(PARAM_PATH)

df["month"] = pd.to_datetime(df["month"], errors="coerce")
df["year"] = df["month"].dt.year

feature_cols = [c for c in df.columns if c.startswith(FEATURE_PREFIXES)]
df = df.dropna(subset=[TARGET]).reset_index(drop=True)
df[feature_cols] = df[feature_cols].fillna(0).astype(np.float32)
df[TARGET] = df[TARGET].astype(np.float32)

print(f"✅ Total features used: {len(feature_cols)}")
print(f"✅ Loaded best parameter file with {len(params_df)} yearly entries")



results, feat_imps = [], []

for _, row in tqdm(params_df.iterrows(), total=len(params_df), desc="Refitting GBRT by saved params"):
    Y = int(row["year"])
    depth, lr, n_trees = int(row["depth"]), float(row["lr"]), int(row["trees"])

    tr_mask = (df["year"] >= TRAIN_START) & (df["year"] <= Y - 13)
    va_mask = (df["year"] >= Y - 12) & (df["year"] <= Y - 1)
    te_mask = (df["year"] == Y)

    X_trva = df.loc[tr_mask | va_mask, feature_cols]
    y_trva = df.loc[tr_mask | va_mask, TARGET]
    X_te = df.loc[te_mask, feature_cols]
    y_te = df.loc[te_mask, TARGET]

    if len(X_te) == 0 or len(X_trva) == 0:
        continue

    if USE_HIST:
        model = HistGradientBoostingRegressor(
            max_depth=depth,
            learning_rate=lr,
            max_iter=n_trees,
            random_state=42,
        )
    else:
        model = GradientBoostingRegressor(
            n_estimators=n_trees,
            learning_rate=lr,
            max_depth=depth,
            max_features=50,
            subsample=0.8,
            random_state=42,
        )

    model.fit(X_trva.astype(np.float32), y_trva.astype(np.float32))
    yhat_te = model.predict(X_te.astype(np.float32))
    r2_test = r2_oos(y_te, yhat_te)

    feat_imp = getattr(model, "feature_importances_", np.zeros(len(feature_cols)))
    feat_imps.append(feat_imp)

    results.append({
        "year": Y,
        "depth": depth,
        "lr": lr,
        "trees": n_trees,
        "test_r2": r2_test,
        "y_true": y_te.values,
        "y_pred": yhat_te,
    })

    print(f"[{Y}] depth={depth}, lr={lr}, trees={n_trees}, TestR²={r2_test:.6f}")

df_results = pd.DataFrame(results)
overall_r2 = r2_oos(
    np.concatenate(df_results["y_true"].values),
    np.concatenate(df_results["y_pred"].values)
)

os.makedirs(os.path.dirname(RESULT_PATH), exist_ok=True)
df_results.to_parquet(RESULT_PATH, index=False)

print("\n" + "=" * 60)
print(f"[GBRT Refit] Overall Out-of-Sample R² = {overall_r2:.6f}")
print("=" * 60)
print(f"✅ Saved refitted yearly results to {RESULT_PATH}")


plt.figure(figsize=(10, 5))
plt.plot(df_results["year"], df_results["test_r2"], marker="o", label="Test R²")
plt.axhline(overall_r2, color="red", linestyle="--", alpha=0.7, label=f"Overall={overall_r2:.4f}")
plt.title("GBRT — Annual Out-of-Sample R² (Refitted from Best Params)")
plt.xlabel("Year")
plt.ylabel("R²_oos")
plt.legend()
plt.grid(alpha=0.5, linestyle="--")
plt.tight_layout()
plt.show()


avg_imp = np.mean(np.stack(feat_imps), axis=0)
top_idx = np.argsort(avg_imp)[::-1][:15]

plt.figure(figsize=(8, 5))
sns.barplot(
    x=avg_imp[top_idx],
    y=np.array(feature_cols)[top_idx],
    palette="viridis",
    orient="h"
)
plt.title("GBRT — Average Feature Importances (Refitted)")
plt.tight_layout()
plt.show()


 Loading feature data...
✅ Total features used: 176
✅ Loaded best parameter file with 30 yearly entries


Refitting GBRT by saved params:   3%|▎         | 1/30 [01:28<42:44, 88.43s/it]

[1987] depth=2, lr=0.05, trees=100, TestR²=0.034017


Refitting GBRT by saved params:   7%|▋         | 2/30 [02:51<39:45, 85.21s/it]

[1988] depth=2, lr=0.05, trees=100, TestR²=-0.011719


Refitting GBRT by saved params:  10%|█         | 3/30 [03:45<31:59, 71.10s/it]

[1989] depth=2, lr=0.05, trees=100, TestR²=-0.005354


Refitting GBRT by saved params:  13%|█▎        | 4/30 [04:16<23:54, 55.19s/it]

[1990] depth=2, lr=0.05, trees=100, TestR²=-0.008097


Refitting GBRT by saved params:  17%|█▋        | 5/30 [04:46<19:12, 46.09s/it]

[1991] depth=2, lr=0.05, trees=100, TestR²=0.007816


Refitting GBRT by saved params:  20%|██        | 6/30 [05:18<16:28, 41.19s/it]

[1992] depth=2, lr=0.05, trees=100, TestR²=-0.018832


Refitting GBRT by saved params:  23%|██▎       | 7/30 [06:19<18:21, 47.90s/it]

[1993] depth=3, lr=0.05, trees=100, TestR²=-0.011654


Refitting GBRT by saved params:  27%|██▋       | 8/30 [08:02<23:56, 65.28s/it]

[1994] depth=2, lr=0.05, trees=100, TestR²=-0.002911


Refitting GBRT by saved params:  30%|███       | 9/30 [09:01<22:12, 63.45s/it]

[1995] depth=3, lr=0.05, trees=100, TestR²=0.008196


Refitting GBRT by saved params:  33%|███▎      | 10/30 [09:50<19:41, 59.06s/it]

[1996] depth=2, lr=0.05, trees=100, TestR²=-0.008711


Refitting GBRT by saved params:  37%|███▋      | 11/30 [10:44<18:10, 57.39s/it]

[1997] depth=2, lr=0.05, trees=100, TestR²=-0.011700


Refitting GBRT by saved params:  40%|████      | 12/30 [11:33<16:24, 54.70s/it]

[1998] depth=2, lr=0.05, trees=100, TestR²=-0.034964


Refitting GBRT by saved params:  43%|████▎     | 13/30 [13:29<20:47, 73.36s/it]

[1999] depth=5, lr=0.05, trees=100, TestR²=-0.050930


Refitting GBRT by saved params:  47%|████▋     | 14/30 [15:40<24:12, 90.75s/it]

[2000] depth=2, lr=0.05, trees=100, TestR²=0.004533


Refitting GBRT by saved params:  50%|█████     | 15/30 [18:56<30:36, 122.42s/it]

[2001] depth=5, lr=0.05, trees=100, TestR²=-0.248516


Refitting GBRT by saved params:  53%|█████▎    | 16/30 [21:18<29:57, 128.36s/it]

[2002] depth=2, lr=0.05, trees=100, TestR²=-0.048092


Refitting GBRT by saved params:  57%|█████▋    | 17/30 [23:48<29:12, 134.83s/it]

[2003] depth=2, lr=0.05, trees=100, TestR²=-0.001669


Refitting GBRT by saved params:  60%|██████    | 18/30 [26:14<27:37, 138.16s/it]

[2004] depth=2, lr=0.05, trees=100, TestR²=-0.036091


Refitting GBRT by saved params:  63%|██████▎   | 19/30 [28:26<25:02, 136.55s/it]

[2005] depth=3, lr=0.05, trees=100, TestR²=0.005233


Refitting GBRT by saved params:  67%|██████▋   | 20/30 [29:32<19:13, 115.35s/it]

[2006] depth=2, lr=0.05, trees=100, TestR²=0.001597


Refitting GBRT by saved params:  70%|███████   | 21/30 [30:37<15:00, 100.09s/it]

[2007] depth=2, lr=0.05, trees=100, TestR²=-0.033290


Refitting GBRT by saved params:  73%|███████▎  | 22/30 [31:57<12:33, 94.19s/it] 

[2008] depth=2, lr=0.05, trees=100, TestR²=-0.040869


Refitting GBRT by saved params:  77%|███████▋  | 23/30 [33:17<10:29, 89.92s/it]

[2009] depth=2, lr=0.05, trees=100, TestR²=0.007114


Refitting GBRT by saved params:  80%|████████  | 24/30 [34:43<08:51, 88.56s/it]

[2010] depth=2, lr=0.05, trees=100, TestR²=-0.017755


Refitting GBRT by saved params:  83%|████████▎ | 25/30 [36:19<07:34, 90.81s/it]

[2011] depth=3, lr=0.05, trees=100, TestR²=-0.039455


Refitting GBRT by saved params:  87%|████████▋ | 26/30 [39:04<07:32, 113.12s/it]

[2012] depth=2, lr=0.05, trees=100, TestR²=0.004002


Refitting GBRT by saved params:  90%|█████████ | 27/30 [41:58<06:33, 131.30s/it]

[2013] depth=2, lr=0.05, trees=100, TestR²=0.010490


Refitting GBRT by saved params:  93%|█████████▎| 28/30 [44:51<04:47, 143.83s/it]

[2014] depth=2, lr=0.05, trees=100, TestR²=-0.003776


Refitting GBRT by saved params:  97%|█████████▋| 29/30 [46:41<02:13, 133.86s/it]

[2015] depth=2, lr=0.05, trees=100, TestR²=-0.011911


Refitting GBRT by saved params: 100%|██████████| 30/30 [48:12<00:00, 96.40s/it] 

[2016] depth=2, lr=0.05, trees=100, TestR²=0.016481





FileNotFoundError: [WinError 3] 系统找不到指定的路径。: ''