# Elasticnet batch 
什麼時候用它？
當溫度感測點高度共線（很常見）且你希望自動選擇/壓縮特徵、降低過度擬合。ElasticNet 介於 Lasso 與 Ridge 之間，很適合這類多溫度感測器的資料。

做法概觀

讀取多個訓練 CSV 並合併；2) 只用數值欄（去掉目標）做特徵；3) 標準化；4) 用 ElasticNetCV（交叉驗證調 α 與 l1_ratio）；5) 以「每檔案第 101 筆起」計算競賽版 RMSE。

注意：下方評估是「單一全域模型」在每個檔案上第 101 筆之後做預測；不會偷用第 100 筆之後的正解（避免洩漏）。

可再加強的特徵（先把 baseline 跑起來再疊）：

時間衍生：Time 的多項式/分段（因三階段轉速進給），或加「階段 one-hot」。

溫度動態：移動平均/移動差分（例如各溫度 5/15 分鐘均值與斜率）。

相對溫度：各點相對於環境基準（如 PT01 或全體平均）的差。

以上都能用 pd.DataFrame.rolling() / .diff() 做出來，只要避免用到未來資訊即可（使用當下與過去）。

In [15]:
# ==== ElasticNet 版 ====
import numpy as np
import pandas as pd
from pathlib import Path
from glob import glob

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.multioutput import MultiOutputRegressor

# ---- 實際資料夾路徑 (已代入絕對路徑) ----
BASE_DIR = "/Users/benjamin/1132/11325/AI_Race/2025_dataset_0806 3"
TRAIN_DIR = f"{BASE_DIR}/train"          # 43 個訓練檔所在
TEST_DIR  = f"{BASE_DIR}/train"          # 暫時先指向同資料夾做流程驗證；若有正式 test 資料夾再改
OUT_DIR   = "/Users/benjamin/1132/11325/AI_Race/preds_out"  # 輸出預測

TARGETS = ["Disp. X", "Disp. Z"]

# ========== 輔助：檢查資料夾 ==========
def _assert_folder_has_csv(folder, role):
    files = sorted(glob(str(Path(folder) / "*.csv")))
    if len(files) == 0:
        raise ValueError(f"{role} 資料夾 '{folder}' 下找不到任何 CSV，請確認路徑是否正確 (絕對路徑 & 無打錯)。")
    return files

# ========== 載入 ==========
def load_many_csvs(folder):
    files = _assert_folder_has_csv(folder, "訓練")
    dfs = []
    for f in files:
        df = pd.read_csv(f, low_memory=False)
        df["__file__"] = Path(f).name  # 評估與輸出用
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True), files

# ========== 組特徵/目標 ==========
def build_Xy(df):
    # 取出非目標欄且非輔助欄位
    raw_feat_cols = [c for c in df.columns if c not in TARGETS + ["__file__"]]
    # 僅保留可轉成數值的欄位，避免 "could not convert string to float" 類錯誤
    numeric_cols = []
    for c in raw_feat_cols:
        ser_num = pd.to_numeric(df[c], errors="coerce")
        valid_ratio = ser_num.notna().mean()
        if valid_ratio > 0.95:  # 門檻可調
            numeric_cols.append(c)
    if len(numeric_cols) == 0:
        raise ValueError("沒有可用的數值特徵欄位；請檢查資料是否載入正確或調整特徵挑選邏輯。")
    X = df[numeric_cols].to_numpy(dtype=float)
    y = None
    if all(t in df.columns for t in TARGETS):
        y = df[TARGETS].to_numpy(dtype=float)
    return X, y, numeric_cols

# ========== 建模 ==========
def fit_elasticnet(X, y, random_state=42):
    if y is None:
        raise ValueError("訓練資料缺少目標欄位，請確認 TRAIN_DIR 內 CSV 是否包含 Disp. X / Disp. Z。")
    if X.shape[0] == 0:
        raise ValueError("沒有任何訓練樣本：檢查資料夾或特徵過濾條件。")
    en = Pipeline([
        ("scaler", StandardScaler()),
        ("reg", MultiOutputRegressor(ElasticNetCV(
            l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
            alphas=np.logspace(-4, 2, 30),
            max_iter=5000,
            cv=5,
            n_jobs=None,
            random_state=random_state
        )))
    ])
    en.fit(X, y)
    return en

# 競賽 RMSE：對每個檔案，只計第 101 列以後
def competition_rmse(df_all, model):
    rmse_sum = 0.0
    n_rows_eval = 0
    for fname, g in df_all.groupby("__file__"):
        if not all(t in g.columns for t in TARGETS):
            continue  # 測試集沒有正解就略過
        if len(g) <= 100:
            continue
        feat_cols = [c for c in g.columns if c not in TARGETS + ["__file__"]]
        Xg_df = g[feat_cols].apply(pd.to_numeric, errors="coerce")
        Xg_df = Xg_df.dropna(axis=1, how="any")
        Xg = Xg_df.to_numpy(dtype=float)
        yg = g[TARGETS].to_numpy(dtype=float)
        if Xg.shape[0] <= 100:
            continue
        yhat = model.predict(Xg[100:])          # 只評 101+
        diff2 = (yhat - yg[100:]) ** 2
        rmse_sum += diff2.sum()
        n_rows_eval += len(g) - 100
    if n_rows_eval == 0:
        raise ValueError("評估時沒有可用的列 (可能所有檔案都不足 101 列或缺目標)。")
    return np.sqrt(rmse_sum / (2 * n_rows_eval))  # 2 來自 X/Z

# ---- 主流程（訓練 + 評估（在訓練資料上做類測試切法））----
train_df, train_files = load_many_csvs(TRAIN_DIR)
X_train, y_train, feat_cols = build_Xy(train_df)
print(f"Loaded {len(train_files)} train files, shape={train_df.shape}, features_kept={len(feat_cols)}")

en_model = fit_elasticnet(X_train, y_train)

rmse_train_like = competition_rmse(train_df, en_model)
print(f"[ElasticNet] pseudo-eval RMSE on TRAIN (101+): {rmse_train_like:.6f}")

# ---- 產生 13 個提交檔 ----
def write_submission_files(model, test_folder, out_folder):
    Path(out_folder).mkdir(parents=True, exist_ok=True)
    test_files = sorted(glob(str(Path(test_folder) / "*.csv")))
    if len(test_files) == 0:
        raise ValueError(f"測試資料夾 '{test_folder}' 無 CSV。")
    for f in test_files:
        df = pd.read_csv(f, low_memory=False)
        feat_cols = [c for c in df.columns if c not in TARGETS]  # 測試集頭 100 有真值欄位
        Xdf = df[feat_cols].apply(pd.to_numeric, errors="coerce")
        Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
        yhat = model.predict(Xdf.to_numpy(dtype=float))
        out = df.copy()
        if all(t in out.columns for t in TARGETS):
            out.loc[100:, "Disp. X"] = yhat[100:, 0]
            out.loc[100:, "Disp. Z"] = yhat[100:, 1]
        else:
            out["Disp. X"] = df.get("Disp. X", np.nan)
            out["Disp. Z"] = df.get("Disp. Z", np.nan)
            out.loc[100:, "Disp. X"] = yhat[100:, 0]
            out.loc[100:, "Disp. Z"] = yhat[100:, 1]
        out_path = Path(out_folder) / Path(f).name
        out.to_csv(out_path, index=False)
        print("wrote:", out_path)

# (如需輸出，請在下個 debug/submission cell 呼叫 write_submission_files)


Loaded 43 train files, shape=(29795, 28), features_kept=25


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

[ElasticNet] pseudo-eval RMSE on TRAIN (101+): 8.088681


  model = cd_fast.enet_coordinate_descent(


In [14]:
# === Debug / 快速檢視 & 產出提交檔 ===
from pathlib import Path
import pandas as pd

print("TRAIN_DIR:", TRAIN_DIR)
print("TEST_DIR :", TEST_DIR)
print("OUT_DIR  :", OUT_DIR)

# 1) 列出前 5 個訓練檔案名稱
train_files_list = sorted(Path(TRAIN_DIR).glob('*.csv'))
print(f"訓練檔數: {len(train_files_list)} (列出前5)")
for p in train_files_list[:5]:
    print("  -", p.name)

# 2) 讀第一個檔案看欄位與前幾列
if train_files_list:
    sample_df = pd.read_csv(train_files_list[0], nrows=8)
    print("樣本檔欄位:", list(sample_df.columns))
    display(sample_df.head())

# 3) 顯示特徵保留數
print(f"保留特徵數: {len(feat_cols)} → 前10: {feat_cols[:10]}")

# 4) 產出提交檔 (目前 TEST_DIR=TRAIN_DIR 僅驗證流程)
print("開始寫出預測檔 ...")
write_submission_files(en_model, TEST_DIR, OUT_DIR)
print("完成。可到 preds_out 查看。")

# 5) 檢查輸出資料夾內前幾個檔案
out_files = sorted(Path(OUT_DIR).glob('*.csv'))
print(f"輸出檔數: {len(out_files)} (列出前5)")
for p in out_files[:5]:
    print("  ->", p.name)


TRAIN_DIR: /Users/benjamin/1132/11325/AI_Race/2025_dataset_0806 3/train
TEST_DIR : /Users/benjamin/1132/11325/AI_Race/2025_dataset_0806 3/train
OUT_DIR  : /Users/benjamin/1132/11325/AI_Race/preds_out
訓練檔數: 43 (列出前5)
  - _20200615_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-25C).csv
  - _20200616_GV1-1203_1000rpm_XZ-5m-min_5H(wAC-from0-25C).csv
  - _20200617_GV1-1203_1k+2krpm_XZ-5m-min_5H(wAC-from0-25C).csv
  - _20200618_GV1-1203_2k+1krpm_XZ-5m-min_5H(wAC-from0-25C).csv
  - _20200701_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-20C).csv
樣本檔欄位: ['Time', 'PT01', 'PT02', 'PT03', 'PT04', 'PT05', 'PT06', 'PT07', 'PT08', 'PT09', 'PT10', 'PT11', 'PT12', 'PT13', 'TC01', 'TC02', 'TC03', 'TC04', 'TC05', 'TC06', 'TC07', 'TC08', 'Spindle Motor', 'X Motor', 'Z Motor', 'Disp. X', 'Disp. Z']


Unnamed: 0,Time,PT01,PT02,PT03,PT04,PT05,PT06,PT07,PT08,PT09,...,TC04,TC05,TC06,TC07,TC08,Spindle Motor,X Motor,Z Motor,Disp. X,Disp. Z
0,1.017,21.5,21.7,21.8,21.1,21.2,20.9,20.9,21.6,21.1,...,19.1,19.4,19.6,21.6,22.3,19.0,19.0,20.0,0.0,0.0
1,1.185,21.5,21.8,21.8,21.1,21.2,20.9,20.9,21.6,21.2,...,19.4,19.5,19.7,21.7,22.4,19.0,19.0,20.0,0.0,0.086104
2,1.354,21.5,21.8,21.8,21.1,21.2,20.9,20.9,21.6,21.2,...,19.2,19.5,19.8,21.7,22.4,19.0,20.0,20.0,-1.188165,0.086104
3,1.523,21.5,21.7,21.8,21.1,21.2,21.0,20.9,21.7,21.2,...,19.2,19.5,19.7,21.7,22.5,19.0,19.0,20.0,-2.178043,0.171982
4,1.691,21.5,21.8,21.8,21.1,21.2,21.0,20.9,21.7,21.3,...,19.3,19.6,19.8,21.8,22.5,19.0,19.0,20.0,-2.178043,0.171982


保留特徵數: 25 → 前10: ['Time', 'PT01', 'PT02', 'PT03', 'PT04', 'PT05', 'PT06', 'PT07', 'PT08', 'PT09']
開始寫出預測檔 ...
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200615_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-25C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200616_GV1-1203_1000rpm_XZ-5m-min_5H(wAC-from0-25C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200617_GV1-1203_1k+2krpm_XZ-5m-min_5H(wAC-from0-25C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200618_GV1-1203_2k+1krpm_XZ-5m-min_5H(wAC-from0-25C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200701_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200702_GV1-1203_1000rpm_XZ-5m-min_5H(wAC-from0-20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200703_GV1-1203_1k+2krpm_XZ-5m-min_5H(wAC-from0-20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200706_GV1-1203_2k+1krpm_XZ-5m-min_5H(wAC-from0-20C).cs

  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填

wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200813_GV1-1203_2000rpm_XZ-5m-min_6H(wAC-from0-30Cto20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200814_GV1-1203_1000rpm_XZ-5m-min_6H(wAC-from0-30Cto20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200817_GV1-1203_2000rpm_XZ-5m-min_6H(wAC-from0-20Cto30C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200818_GV1-1203_1000rpm_XZ-5m-min_6H(wAC-from0-20Cto30C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200820_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-18C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200821_GV1-1203_1000rpm_XZ-5m-min_5H(wAC-from0-22C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200824_GV1-1203_1000rpm_XZ-5m-min_5H(wAC-from0-15C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200827_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-15C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200828_GV1-1203_2000rpm_XZ-5m-min_5H(

  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填

wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20200929_GV1-1203_2krpm_XZ-5m-min_2-5H+Stop1H+1krpm_XZ-5m-min_2-5H(wAC-from0-20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20201005_GV1-1203_1krpm_XZ-5m-min_2-5H+Stop1H+1krpm_XZ-5m-min_2-5H(wAC-from0-20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20201006_GV1-1203_2krpm_XZ-5m-min_2-5H+Stop1H+2krpm_XZ-5m-min_2-5H(wAC-from0-20C).csv
wrote: /Users/benjamin/1132/11325/AI_Race/preds_out/_20201016_GV1-1203_1000rpm_XZ-5m-min_6H(wAC-from0-30to20to30C).csv
完成。可到 preds_out 查看。
輸出檔數: 43 (列出前5)
  -> _20200615_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-25C).csv
  -> _20200616_GV1-1203_1000rpm_XZ-5m-min_5H(wAC-from0-25C).csv
  -> _20200617_GV1-1203_1k+2krpm_XZ-5m-min_5H(wAC-from0-25C).csv
  -> _20200618_GV1-1203_2k+1krpm_XZ-5m-min_5H(wAC-from0-25C).csv
  -> _20200701_GV1-1203_2000rpm_XZ-5m-min_5H(wAC-from0-20C).csv


  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)
  Xdf = Xdf.fillna(method="ffill").fillna(method="bfill")  # 簡單填補 (可改進)


# Bayesian Linear Regression

什麼時候用它？
當你要「線性關係 + 不確定性」估計，且特徵很多又易共線時，BayesianRidge 透過先驗/後驗得到更穩的權重估計，常比一般最小平方法更抗雜訊。

下方流程與 ElasticNet 幾乎相同，只是把估計器換成 BayesianRidge。

In [None]:
# === bayesian_batch.py ===
import os
import glob
import pandas as pd

from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ======== 路徑設定 ========
TRAIN_DIR = "train_dir"
TEST_DIR = "test_dir"
OUTPUT_DIR = "output_dir_bayes"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ======== 資料欄位設定（依題目 PDF） ========
FEATURES = (
    ["Time"] +
    [f"PT{i:02d}" for i in range(1, 14)] +
    [f"TC{i:02d}" for i in range(1, 9)] +
    ["Spindle Motor", "X Motor", "Z Motor"]
)
TARGET_X = "Disp. X"
TARGET_Z = "Disp. Z"

def load_train_df():
    files = glob.glob(os.path.join(TRAIN_DIR, "*.csv"))
    dfs = []
    for f in files:
        df = pd.read_csv(f)
        dfs.append(df)
    train_df = pd.concat(dfs, ignore_index=True)
    return train_df

def make_bayes_pipeline():
    return Pipeline([
        ("scaler", StandardScaler()),
        ("model", BayesianRidge(
            # alpha_1, alpha_2, lambda_1, lambda_2 可依需求微調先驗；先用預設即可
        ))
    ])

def fit_models(train_df):
    X = train_df[FEATURES]
    yx = train_df[TARGET_X]
    yz = train_df[TARGET_Z]

    pipe_x = make_bayes_pipeline()
    pipe_z = make_bayes_pipeline()

    pipe_x.fit(X, yx)
    pipe_z.fit(X, yz)
    return pipe_x, pipe_z

def predict_and_save(pipe_x, pipe_z):
    test_files = glob.glob(os.path.join(TEST_DIR, "*.csv"))
    for f in test_files:
        df = pd.read_csv(f)
        if len(df) > 100:
            df.loc[100:, TARGET_X] = pipe_x.predict(df[FEATURES].iloc[100:])
            df.loc[100:, TARGET_Z] = pipe_z.predict(df[FEATURES].iloc[100:])
        out = os.path.join(OUTPUT_DIR, os.path.basename(f))
        df.to_csv(out, index=False)

def main():
    train_df = load_train_df()
    pipe_x, pipe_z = fit_models(train_df)

    # 你也可以查看後驗係數均值與標準差（不確定性）
    brx = pipe_x.named_steps["model"]
    brz = pipe_z.named_steps["model"]
    print(f"[Bayesian] X coef shape: {brx.coef_.shape}, alpha_={brx.alpha_:.4f}, lambda_={brx.lambda_:.4f}")
    print(f"[Bayesian] Z coef shape: {brz.coef_.shape}, alpha_={brz.alpha_:.4f}, lambda_={brz.lambda_:.4f}")

    predict_and_save(pipe_x, pipe_z)

if __name__ == "__main__":
    main()


## 為什麼 ElasticNet 訓練這麼快？
線性模型 (ElasticNetCV) 在這個資料集上會很快是正常的：
- 問題是凸優化，使用座標下降 (coordinate descent) 收斂快。
- 特徵欄位數（篩掉非數值後）遠少於深度模型動輒上千萬參數。
- 雖然 ElasticNetCV 做 5-fold * 多個 alpha * 多個 l1_ratio (≈ 5 * 30 * 5 = 750 次擬合 / 目標)，資料量仍不大所以幾秒就結束。
- MultiOutputRegressor 只是對兩個目標各跑一次，仍屬輕量。

如果：
- 你看到幾乎瞬間完成，有可能是資料列數沒想像多（可檢查 train_df.shape）。
- 要花「幾小時」通常出現在：極大量樣本、極多特徵、高維稀疏矩陣、或使用深度學習/大型樹模型。

下面提供：
1. 重新計時訓練耗時。
2. 每個檔案 (101+ 行) 的 RMSE 分解表。
3. 係數 (前後 10 個最大絕對值特徵)。
4. RMSE 分佈簡單統計。
5. 若要更慢/更強：可改用 Gradient Boosting / LightGBM / XGBoost / 神經網路。


In [None]:
# === 模型診斷：耗時 / 每檔 RMSE / 重要特徵 ===
import time, math
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.display import display

# 重新計時訓練 (僅示意，若已在記憶體就快速重跑)
_start = time.time()
X_train2, y_train2, feat_cols2 = X_train, y_train, feat_cols  # 已有資料
model_tmp = fit_elasticnet(X_train2, y_train2)
_train_time = time.time() - _start
print(f"重新訓練耗時: {_train_time:.2f} 秒 (含 CV)")

# 每檔案 RMSE (101+)
rows = []
for fname, g in train_df.groupby("__file__"):
    if not all(t in g.columns for t in TARGETS) or len(g) <= 100:
        continue
    feat_cols_file = [c for c in g.columns if c not in TARGETS + ["__file__"]]
    Xg = g[feat_cols_file].apply(pd.to_numeric, errors="coerce").dropna(axis=1, how="any").to_numpy(dtype=float)
    yg = g[TARGETS].to_numpy(dtype=float)
    if Xg.shape[0] <= 100:
        continue
    yhat = en_model.predict(Xg[100:])
    diff2 = (yhat - yg[100:]) ** 2
    rmse_file = math.sqrt(diff2.sum() / (2 * (len(g) - 100)))
    rows.append({"file": fname, "rmse_101p": rmse_file, "rows_total": len(g)})

rmse_df = pd.DataFrame(rows).sort_values("rmse_101p")
print("每檔案 RMSE (前 10 佳):")
display(rmse_df.head(10))
print("整體描述統計:")
print(rmse_df["rmse_101p"].describe())

# 重要特徵：取兩個子模型的係數 (ElasticNetCV 每目標一組)
reg_list = en_model.named_steps['reg'].estimators_
coef_X = reg_list[0].coef_
coef_Z = reg_list[1].coef_
coef_df = pd.DataFrame({
    'feature': feat_cols,
    'coef_X': coef_X,
    'coef_Z': coef_Z,
    'abs_X': np.abs(coef_X),
    'abs_Z': np.abs(coef_Z),
    'abs_mean': (np.abs(coef_X)+np.abs(coef_Z))/2,
})
# 取前後各 10 個 (按 abs_mean)
important = pd.concat([
    coef_df.sort_values('abs_mean', ascending=False).head(10),
    coef_df.sort_values('abs_mean', ascending=True).head(10)
])
print("特徵重要性 (前 10 大與最小 10):")
display(important)

# 若要視覺化可再加：
# import seaborn as sns, matplotlib.pyplot as plt
# sns.barplot(data=coef_df.sort_values('abs_mean', ascending=False).head(20), x='abs_mean', y='feature')
# plt.show()
