In [None]:
import polars as pl
import gc

# Load the raw training data.
df_base = pl.read_parquet("FILE_PATH")

# Pull the ID and the target ("飆股") out as standalone Series, then remove
# both columns so that only feature columns stay in the frame.
id_col = df_base.get_column("ID")
y = df_base.get_column("飆股")
df_base = df_base.drop(["ID", "飆股"])

# Load the derived features that were merged in an earlier step.
df_derived = pl.read_parquet("FILE_PATH")

# Stack raw and derived feature columns horizontally into one frame.
df_all = df_base.hstack(df_derived)

# Both inputs are no longer needed: drop the references and run the collector.
del df_base, df_derived
gc.collect()

print(f" 合併完成，共 {df_all.shape[1]} 欄")
print(" 已釋放 df_base 和 df_derived 記憶體")

def downcast_df(df):
    """Return ``df`` with 64-bit numeric columns cast to their 32-bit types.

    Float64 columns are cast to Float32 and Int64 columns to Int32; every
    other column is passed through unchanged. Column order is preserved.
    """
    exprs = []
    for name, dtype in df.schema.items():
        column = pl.col(name)
        if dtype == pl.Float64:
            column = column.cast(pl.Float32)
        elif dtype == pl.Int64:
            column = column.cast(pl.Int32)
        exprs.append(column)
    return df.with_columns(exprs)

# Narrow the 64-bit columns, then replace every null with 0.
df_all = downcast_df(df_all).fill_null(0)
print(" downcast & 補 0 完成")

import lightgbm as lgb
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_auc_score

def auc_eval(y_true, y_pred):
    """Score predictions with ROC AUC and return ``('auc', score, True)``.

    The three-element tuple presumably follows LightGBM's custom eval-metric
    convention (name, value, higher-is-better) — confirm against the caller;
    no call site is visible in this notebook.
    """
    score = roc_auc_score(y_true, y_pred)
    return 'auc', score, True

# Feature selection: fit a LightGBM classifier on all positives plus a random
# sample of negatives, rank every column of df_all by importance, and write
# out the importance table, the drop list, and the reduced training frame.

# Upper bound on the number of negative rows sampled for the ranking model.
NEG_SAMPLE_SIZE = 8000

# Use a NumPy view of the target for index arithmetic and as the label array
# handed to LightGBM (y itself is a polars Series).
y_np = y.to_numpy()
pos_idx = np.flatnonzero(y_np == 1)
neg_idx = np.flatnonzero(y_np == 0)

np.random.seed(42)
# With replace=False, asking for more rows than exist raises ValueError, so
# cap the request at the number of available negatives.
n_neg = min(NEG_SAMPLE_SIZE, len(neg_idx))
neg_sample = np.random.choice(neg_idx, size=n_neg, replace=False)

with open("LGB.json", "r", encoding="utf-8") as f:
    params_lgb = json.load(f)

selected_idx = np.concatenate([pos_idx, neg_sample])
np.random.shuffle(selected_idx)
print("正樣本數：", len(pos_idx))
print("負樣本抽樣數：", len(neg_sample))
print("使用樣本總數：", len(selected_idx))

# Train on *every* column of df_all. `selected_cols` is only produced further
# down from this model's importances, so it cannot be used here: on a fresh
# kernel it does not exist yet, and a stale value would make the importance
# vector shorter than the column list it is paired with below.
feature_cols = df_all.columns
X = df_all[selected_idx].to_pandas().values
y_ = y_np[selected_idx]

model = lgb.LGBMClassifier(**params_lgb)
model.fit(X, y_)
del X, y_
gc.collect()

# Importance table: one row per training column, in training order.
importance = model.feature_importances_
imp_df = pd.DataFrame({"feature": feature_cols, "importance": importance})
imp_df.to_csv("FILE_PATH", index=False)
print(" 已輸出 feature importance")

# Columns with importance <= 1 go to the drop list; the rest are kept.
keep_mask = imp_df["importance"] > 1
low_imp_cols = imp_df.loc[~keep_mask, "feature"].tolist()
pd.DataFrame({"column": low_imp_cols}).to_csv("FILE_PATH", index=False)
print(f" droplist.csv 已儲存，共 {len(low_imp_cols)} 欄")

selected_cols = imp_df.loc[keep_mask, "feature"].tolist()
df_selected = df_all.select(selected_cols)

# Re-attach ID and target, ordered as ID, selected features, target.
df_final = df_selected.with_columns([
    id_col,
    y.rename("飆股")
]).select(["ID"] + df_selected.columns + ["飆股"])

df_final.write_parquet("FILE_PATH", compression="zstd")
print(" 寫入完成")


In [None]:
import polars as pl
import pandas as pd

# 1. Load the saved feature-importance table.
imp_df = pd.read_csv("FILE_PATH")

# Keep the features whose importance exceeds 1 (adjust the cut-off as needed).
selected_cols = imp_df.loc[imp_df["importance"] > 1, "feature"].tolist()
print(f" 保留特徵欄位數：{len(selected_cols)}")

#  2. 定義對齊函數
def align_columns(df: pl.DataFrame, selected_cols: list[str], id_col: str = "ID"):
    """Return ``df`` reduced to ``selected_cols``, in that order.

    Any name in ``selected_cols`` that ``df`` lacks is added as a constant-0
    column. When ``id_col`` is a column of ``df`` it is placed first in the
    result; otherwise the result holds only ``selected_cols``.
    """
    present = set(df.columns)
    # dict.fromkeys de-duplicates while keeping order, so a repeated name is
    # created only once, matching a check-then-add loop over selected_cols.
    absent = [name for name in dict.fromkeys(selected_cols) if name not in present]
    if absent:
        df = df.with_columns([pl.lit(0).alias(name) for name in absent])

    # Fix the output column order.
    if id_col in df.columns:
        ordered = [id_col] + selected_cols
    else:
        ordered = selected_cols
    return df.select(ordered)

# 3. Read the test / private datasets.
df_test = pl.read_parquet("FILE_PATH")
df_private = pl.read_parquet("FILE_PATH")

# 4. Align each dataset to the selected feature list and write it out.
df_test_aligned = align_columns(df_test, selected_cols, id_col="ID")
df_test_aligned.write_parquet("FILE_PATH", compression="zstd")

df_private_aligned = align_columns(df_private, selected_cols, id_col="ID")
df_private_aligned.write_parquet("FILE_PATH", compression="zstd")

print(" test/private 對齊完成並寫出")


In [None]:
# Report the total column count of each dataset (ID / 飆股 included).
# NOTE(review): `df_train` is not assigned in any cell visible here — confirm
# an earlier cell defines it, otherwise a fresh-kernel run stops with NameError.
print(" 欄位總數（含 ID/飆股）")
print(f"train   欄位數：{df_train.width}")
print(f"test     欄位數：{df_test.width}")
print(f"private 欄位數：{df_private.width}")

# Feature-only counts: the same frames with ID / 飆股 excluded.
drop_cols = {"ID", "飆股"}
print("\n僅特徵欄位數（不含 ID/飆股）")
print(f"train   特徵欄位：{len(set(df_train.columns).difference(drop_cols))}")
print(f"test    特徵欄位：{len(set(df_test.columns).difference(drop_cols))}")
print(f"private 特徵欄位：{len(set(df_private.columns).difference(drop_cols))}")