In [None]:
import pandas as pd
import polars as pl
import numpy as np
import json
import gc
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

# === Configuration ===
# Number of chunks the shuffled positive/negative row indices are split into;
# also the number of sampling rounds run by the analysis loop.
N_SPLIT = 10
# Parquet file read with polars. NOTE(review): "FILE_PATH" looks like a
# placeholder -- confirm it is set to the real dataset path before running.
PARQUET_PATH = "FILE_PATH"

def get_feature_importance(model, model_name, X, y):
    """Fit ``model`` on a stratified 80/20 split of ``(X, y)`` and rank its features.

    Parameters
    ----------
    model
        Unfitted estimator matching ``model_name``. It is fitted in place.
    model_name : str
        Selects the fitting recipe: ``"lgb"``, ``"xgb"`` or ``"cat"``.
    X
        Feature table; ``X.columns`` supplies the feature names.
    y
        Labels, also used to stratify the train/validation split.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by ``importance`` in
        descending order.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    supported = ("lgb", "xgb", "cat")
    # Validate before doing any work: an unknown name used to fall through all
    # branches and silently return None after the split had already been made.
    if model_name not in supported:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {supported}"
        )

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    if model_name == "lgb":
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric="binary_logloss",
                  callbacks=[lgb.early_stopping(50, verbose=False)])
        importance = model.feature_importances_
    elif model_name == "xgb":
        # No early stopping is requested in this branch, unlike the other two.
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        importance = model.feature_importances_
    else:  # "cat"
        model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=0)
        importance = model.get_feature_importance()

    # One shared tail instead of three copies of the same DataFrame/sort code.
    return pd.DataFrame({
        "feature": X.columns,
        "importance": importance
    }).sort_values(by="importance", ascending=False)

# Only the label column is needed to build the index chunks, so read just that
# column instead of loading the whole table into memory.
labels = pl.read_parquet(PARQUET_PATH, columns=["飆股"])["飆股"].to_numpy()

# Row positions of positive / negative samples. Positions (not labels of a
# pandas index) are what the sampling loop feeds to ``df.iloc``.
pos_idx_all = np.flatnonzero(labels == 1)
neg_idx_all = np.flatnonzero(labels == 0)

# Seeded generator so the chunk assignment, and therefore every saved
# importance file, is reproducible across kernel restarts.
rng = np.random.default_rng(42)
rng.shuffle(pos_idx_all)
rng.shuffle(neg_idx_all)

# Each round gets one chunk of positives and one chunk of negatives.
pos_chunks = np.array_split(pos_idx_all, N_SPLIT)
neg_chunks = np.array_split(neg_idx_all, N_SPLIT)
del labels
gc.collect()

# === Run the feature-importance analysis one sampled chunk at a time ===
for i in range(N_SPLIT):
    print(f"\n===  第 {i+1} 次抽樣開始 ===")

    # (1) Re-read the parquet file and keep only this round's rows:
    #     the i-th positive chunk followed by the i-th negative chunk.
    idx = np.concatenate((pos_chunks[i], neg_chunks[i]))
    df = pl.read_parquet(PARQUET_PATH).to_pandas()
    fold_frame = df.iloc[idx].reset_index(drop=True)
    del df
    gc.collect()

    # Labels come from "飆股"; features are every other column except "ID",
    # with missing values replaced by 0.
    y = fold_frame["飆股"].to_numpy()
    X = fold_frame.drop(columns=["ID", "飆股"]).fillna(0)
    del fold_frame
    gc.collect()

    # (2) Build one fresh model per library, keyed by the name that
    #     get_feature_importance expects.
    models = {
        "lgb": lgb.LGBMClassifier(
            objective="binary",
            n_estimators=500,
            max_depth=6,
            is_unbalance=True,
            device="cpu",
            n_jobs=-1,
            random_state=42,
            verbose=-1,
        ),
        "xgb": xgb.XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            n_estimators=500,
            max_depth=6,
            # Negative-to-positive ratio of this round's sample.
            scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
            tree_method="hist",
            device="cuda",
            n_jobs=-1,
            random_state=42,
        ),
        "cat": CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="F1",
            iterations=500,
            depth=6,
            learning_rate=0.05,
            auto_class_weights="Balanced",
            task_type="GPU",
            thread_count=-1,
            random_seed=42,
            verbose=0,
        ),
    }

    # (3) Compute all three rankings first, then write them out.
    ranked = {
        tag: get_feature_importance(estimator, tag, X, y)
        for tag, estimator in models.items()
    }

    ranked["lgb"].to_csv(f"importance_lgb_fold{i+1}.csv", index=False)
    ranked["xgb"].to_csv(f"importance_xgb_fold{i+1}.csv", index=False)
    ranked["cat"].to_csv(f"importance_cat_fold{i+1}.csv", index=False)
    print(f" Fold {i+1} 完成，已儲存三個模型的 importance")

    # (4) Release this round's data and models before the next iteration.
    del X, y, ranked, models
    gc.collect()


In [None]:
def aggregate_importance(model_name, topk=50, n_splits=10):
    """Average one model's per-fold feature importances and return the top features.

    Reads ``importance_{model_name}_fold{k}.csv`` for ``k = 1..n_splits`` from
    the current working directory, aligns the folds by feature name, and
    writes the combined table to ``importance_{model_name}_avg.csv``.

    Parameters
    ----------
    model_name : str
        Model tag used in the CSV file names (e.g. ``"lgb"``).
    topk : int, default 50
        Number of top-ranked rows to return.
    n_splits : int, default 10
        Number of per-fold CSV files to read.

    Returns
    -------
    pandas.DataFrame
        The first ``topk`` rows of the combined table, with columns
        ``feature``, ``importance_1`` .. ``importance_{n_splits}`` and
        ``mean_importance``, sorted by ``mean_importance`` descending.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    FileNotFoundError
        If one of the expected per-fold CSV files is missing.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be >= 1, got {n_splits}")

    per_fold = []
    for fold in range(1, n_splits + 1):
        # File name must match what the sampling loop writes ("_fold", not "_part").
        fold_df = pd.read_csv(f"importance_{model_name}_fold{fold}.csv")
        # Index by feature name: every fold file is sorted by its own
        # importance, so row k refers to a different feature in each file and
        # combining by row position would mix features together.
        per_fold.append(
            fold_df.set_index("feature")["importance"].rename(f"importance_{fold}")
        )

    # Column-wise concat aligns on the feature index.
    merged = pd.concat(per_fold, axis=1)
    merged["mean_importance"] = merged.mean(axis=1)
    merged = (
        merged.sort_values(by="mean_importance", ascending=False)
        .rename_axis("feature")
        .reset_index()
    )
    merged.to_csv(f"importance_{model_name}_avg.csv", index=False)
    print(f" Saved averaged importance: importance_{model_name}_avg.csv")
    return merged.head(topk)

# Usage: build the averaged ranking for each of the three models.
top_lgb = aggregate_importance(model_name="lgb")
top_xgb = aggregate_importance(model_name="xgb")
top_cat = aggregate_importance(model_name="cat")
