In [30]:
import polars as pl, pandas as pd, numpy as np, json
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
import lightgbm as lgb, xgboost as xgb
from catboost import CatBoostClassifier
from itertools import product
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
import os
import warnings
warnings.filterwarnings("ignore")

# === Global parameter settings ===
n_clusters = 5                       # number of KMeans clusters used for the cluster-offset features
top_k_features = 36                  # leading columns that add_features combines into pairwise products
pca_components = 10                  # number of PCA components appended by add_features
n_selected_features = 400            # features kept by the LightGBM importance-based selection
pseudo_rounds = 10                     # maximum number of pseudo-labeling rounds
max_total_pos = 20000                  # overall cap on the number of pseudo-positives added
max_pos_per_round = 1500              # cap on pseudo-positives added in a single round
total_pos_added = 0                  # running count of pseudo-positives added so far
neg_sample_ratio = 1                 # negatives sampled per pseudo-positive
base_clip = 0.1                      # lower bound of the weight given to wrong pseudo-labels
max_clip = 1                         # upper bound of the weight given to correct pseudo-labels
n_splits_p = 5                       # number of folds of the inner CV run during pseudo-labeling
top_pct = 0.00731838457              # fraction of the private set predicted as positive
fixed_test_topk = 176               # fixed number of public-test rows predicted as positive
top_k = 200                          # number of pre-selected feature names loaded from the JSON list

# === 評估函式定義 ===
def add_features(X, cluster_labels=None, top_k_features=10):
    """Expand a numeric feature frame with derived columns.

    Columns are appended in this order:
      1. row statistics (``row_mean``, ``row_max``, ``row_min``, ``row_std``,
         ``row_skew``, ``row_kurt``) computed over the ORIGINAL columns of ``X``;
      2. pairwise products ``<a>_x_<b>`` of the first ``top_k_features`` columns;
      3. ``pca_1 .. pca_<pca_components>`` from a PCA fitted on the standardised,
         zero-filled frame built so far (``pca_components`` is a module-level setting);
      4. if ``cluster_labels`` is given, ``<col>_diff_cluster_mean`` for every
         column: the value minus the mean of its cluster.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature frame; it is not modified.
    cluster_labels : array-like or None
        One cluster id per row of ``X`` (positional). ``None`` skips step 4.
    top_k_features : int
        Number of leading columns combined into pairwise products.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``X``.
    """
    from scipy.stats import skew, kurtosis

    df = X.copy()

    # Every row statistic is derived from the untouched input columns. Computing
    # them on the growing frame would let row_mean/row_max/... leak into the
    # statistics that follow (row_std, row_skew and row_kurt in particular).
    values = X.to_numpy(dtype=float)
    df["row_mean"] = X.mean(axis=1)
    df["row_max"] = X.max(axis=1)
    df["row_min"] = X.min(axis=1)
    df["row_std"] = X.std(axis=1)
    # Vectorised over axis=1: same numbers as a row-wise apply, without the
    # per-row Python call overhead.
    df["row_skew"] = skew(values, axis=1)
    df["row_kurt"] = kurtosis(values, axis=1)

    # Pairwise products of the leading columns, assembled in one concat so the
    # frame is not fragmented by hundreds of single-column insertions.
    cols = list(df.columns[:top_k_features])
    interactions = {
        f"{left}_x_{right}": df[left].to_numpy() * df[right].to_numpy()
        for pos, left in enumerate(cols)
        for right in cols[pos + 1:]
    }
    if interactions:
        df = pd.concat([df, pd.DataFrame(interactions, index=df.index)], axis=1)

    # PCA on the standardised frame; random_state keeps the randomized SVD
    # solver (which sklearn may pick for large inputs) reproducible.
    scaled_data = StandardScaler().fit_transform(df.fillna(0))
    pca_out = PCA(n_components=pca_components, random_state=42).fit_transform(scaled_data)
    pca_df = pd.DataFrame(
        pca_out[:, :pca_components],
        columns=[f"pca_{i + 1}" for i in range(pca_components)],
        index=df.index,
    )
    df = pd.concat([df, pca_df], axis=1)

    if cluster_labels is not None:
        # Group by the label array directly instead of a temporary "cluster" column.
        cluster_mean = df.groupby(np.asarray(cluster_labels)).transform("mean")
        diff_df = (df - cluster_mean[df.columns]).add_suffix("_diff_cluster_mean")
        df = pd.concat([df, diff_df], axis=1)
    return df
def f1(y_true, y_pred):
    """Custom evaluation metric: F1 of the scores thresholded at 0.5.

    Returns the triple ``('f1', score, True)``.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    score = f1_score(y_true, hard_labels)
    return ('f1', score, True)
def fit_lgbm_with_early_stopping(model, X, y, sample_weight=None, eval_ratio=0.3, random_state=42):
    """Fit a LightGBM estimator with early stopping on an internal hold-out.

    ``eval_ratio`` of the rows (stratified on ``y``) is held out as the
    evaluation set; the rest is used for fitting. When ``sample_weight`` is
    given it is split with the same settings so the weights stay aligned with
    the fitting rows. Training stops after 50 rounds without improvement of
    ``binary_logloss``. Returns the fitted ``model``.
    """
    split_kwargs = dict(test_size=eval_ratio, stratify=y, random_state=random_state)
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, **split_kwargs)

    w_fit = None
    if sample_weight is not None:
        # Same seed + same stratification => same row assignment as above.
        w_fit, _ = train_test_split(np.array(sample_weight), **split_kwargs)

    stop_callbacks = [lgb.early_stopping(50, verbose=False), lgb.log_evaluation(period=0)]
    model.fit(
        X_fit, y_fit,
        sample_weight=w_fit,
        eval_set=[(X_eval, y_eval)],
        eval_metric="binary_logloss",
        callbacks=stop_callbacks,
    )
    return model
def fit_xgb_with_early_stopping(model, X, y, sample_weight=None, eval_ratio=0.3, random_state=42):
    """Fit an XGBoost estimator against an internal stratified hold-out.

    ``eval_ratio`` of the rows is passed as ``eval_set``; the remaining rows
    (and their weights, when ``sample_weight`` is given) are used for fitting.
    No stopping criterion is passed here, so any early stopping presumably
    comes from the estimator's own constructor parameters — verify against the
    parameter file. Returns the fitted ``model``.
    """
    split_kwargs = dict(test_size=eval_ratio, stratify=y, random_state=random_state)
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, **split_kwargs)

    w_fit = None
    if sample_weight is not None:
        # Same seed + same stratification => same row assignment as above.
        w_fit, _ = train_test_split(np.array(sample_weight), **split_kwargs)

    model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    return model
def fit_cat_with_early_stopping(model, X, y, sample_weight=None, eval_ratio=0.3, random_state=42):
    """Fit a CatBoost estimator with early stopping on an internal hold-out.

    ``eval_ratio`` of the rows (stratified on ``y``) becomes the evaluation
    set; fitting uses the rest, with matching weights when ``sample_weight``
    is given. Training stops after 50 rounds without improvement. Returns the
    fitted ``model``.
    """
    split_kwargs = dict(test_size=eval_ratio, stratify=y, random_state=random_state)
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, **split_kwargs)

    w_fit = None
    if sample_weight is not None:
        # Same seed + same stratification => same row assignment as above.
        w_fit, _ = train_test_split(np.array(sample_weight), **split_kwargs)

    model.fit(
        X_fit, y_fit,
        sample_weight=w_fit,
        eval_set=(X_eval, y_eval),
        early_stopping_rounds=50,
        verbose=0,
    )
    return model
def sample_negatives(n, seed=42):
    """Draw up to ``n`` distinct indices from the module-level ``neg_pool_indices``.

    The drawn indices are removed from ``neg_pool_indices`` (the set is mutated
    in place), so successive calls never return the same index twice.

    Parameters
    ----------
    n : int or float
        Requested number of indices; truncated to int and clamped to
        ``[0, len(neg_pool_indices)]``.
    seed : int
        Seed of the random generator used for this draw.

    Returns
    -------
    numpy.ndarray
        Integer array of the selected indices; empty when nothing can be drawn.
    """
    n = max(int(n), 0)
    # Sort first: set iteration order is not something to rely on for reproducibility.
    pool = np.fromiter(sorted(neg_pool_indices), dtype=np.int64, count=len(neg_pool_indices))
    n = min(n, pool.size)
    if n == 0:
        return np.array([], dtype=np.int64)

    rng = np.random.default_rng(seed)
    # Without replacement: each pooled negative is handed out at most once.
    selected_idx = rng.choice(pool, size=n, replace=False)
    neg_pool_indices.difference_update(selected_idx.tolist())
    return selected_idx

In [None]:
# === Load the feature-engineered parquet files ===
df_train_fe = pd.read_parquet("FILE_PATH")
df_test_fe = pd.read_parquet("FILE_PATH")

# === Split identifier / target / feature columns ===
X_train = df_train_fe.drop(columns=["ID", "飆股"])
y_train = df_train_fe["飆股"].to_numpy()
X_test = df_test_fe.drop(columns=["ID", "dataset"])
ids = df_test_fe["ID"].to_numpy()
# to_numpy must be CALLED: comparing the bound method itself with a string
# yields a scalar False, which silently turns both masks below into scalars.
dataset_flag = df_test_fe["dataset"].to_numpy()
is_test = (dataset_flag == "test")
is_private = (dataset_flag == "private")

# === Load the pre-selected feature names and keep only the first top_k ===
with open("FILE_PATH", "r") as f:
    selected_feature_names = json.load(f)
selected_feature_names = selected_feature_names[:top_k]

# === Restrict both frames to the selected features ===
X_train = X_train[selected_feature_names]
X_test = X_test[selected_feature_names]

# === Load the model hyper-parameters ===
with open("CAT_params.json", "r") as f:
    params_cat = json.load(f)
with open("LGB_params.json", "r") as f:
    params_lgb = json.load(f)
with open("XGB_params.json", "r") as f:
    params_xgb = json.load(f)

In [None]:
# === Feature engineering + feature selection ===
# The LightGBM-based selection only runs when the feature-name JSON is missing;
# clustering and feature expansion run in both cases.
# NOTE(review): X_train_raw / X_test_raw are not defined in any cell visible
# here — confirm where they are supposed to come from.
features_cached = os.path.exists("FILE_PATH")
if features_cached:
    print("已偵測到 features，跳過特徵選擇")
    with open("FILE_PATH", "r") as f:
        selected_feature_names = json.load(f)
else:
    print(" 執行特徵工程與 LGB 特徵選擇...")

# Cluster train and test rows together on standardised, zero-filled features.
X_all = pd.concat([X_train_raw, X_test_raw]).fillna(0)
X_scaled = StandardScaler().fit_transform(X_all)
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_scaled)
labels = kmeans.labels_
n_train_rows = len(X_train_raw)

# Feature expansion (interactions + PCA + cluster offsets); the label vector is
# split back into its train and test parts by position.
X_train = add_features(X_train_raw, cluster_labels=labels[:n_train_rows], top_k_features=top_k_features)
X_test = add_features(X_test_raw, cluster_labels=labels[n_train_rows:], top_k_features=top_k_features)

if not features_cached:
    # Rank the expanded features by LightGBM importance and keep the best ones.
    lgb_fs = lgb.LGBMClassifier(**params_lgb)
    lgb_fs.fit(X_train, y_train)
    importances = lgb_fs.feature_importances_
    top_k_idx = np.argsort(importances)[::-1][:n_selected_features]
    selected_feature_names = X_train.columns[top_k_idx].tolist()

    # Persist the selection for later runs.
    with open("selected_features.json", "w") as f:
        json.dump(selected_feature_names, f)

# Keep only the selected features.
X_train = X_train[selected_feature_names]
X_test = X_test[selected_feature_names]

In [None]:
# === Hold out 30% of the training data for the initial validation ===
# NOTE: X_train / y_train are rebound to the 70% part, so re-running this cell
# without re-running the loading cell shrinks the training set again.
X_train_all = X_train.copy()
y_train_all = y_train.copy()
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y_train_all, test_size=0.3, stratify=y_train_all, random_state=42)

# === Per-row sample weights (positives up-weighted by the class ratio) ===
counter = Counter(y_train)
scale_pos_weight = counter[0] / counter[1]
full_sample_weight = np.where(y_train == 1, scale_pos_weight, 1.0)

# === Cross-validated training of the initial models ===
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
auc_list, f1_list = [], []
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    # Fold-specific names: the hold-out X_val / y_val created above must not be
    # overwritten by the fold's validation part.
    X_fold_tr, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_tr, y_fold_val = y_train[train_idx], y_train[val_idx]
    train_weight = full_sample_weight[train_idx]

    # model_lgb / model_xgb / model_cat stay bound after the loop; the
    # pseudo-labeling cell continues from the last fold's estimators.
    model_lgb = fit_lgbm_with_early_stopping(lgb.LGBMClassifier(**params_lgb), X_fold_tr, y_fold_tr, sample_weight=train_weight)
    model_xgb = fit_xgb_with_early_stopping(xgb.XGBClassifier(**params_xgb), X_fold_tr, y_fold_tr, sample_weight=train_weight)
    model_cat = fit_cat_with_early_stopping(CatBoostClassifier(**params_cat), X_fold_tr, y_fold_tr, sample_weight=train_weight)

    probs_lgb = model_lgb.predict_proba(X_fold_val)[:, 1]
    probs_xgb = model_xgb.predict_proba(X_fold_val)[:, 1]
    probs_cat = model_cat.predict_proba(X_fold_val)[:, 1]

    # argsort().argsort() gives ascending ranks: the highest probability gets
    # the highest rank.
    rank_lgb = probs_lgb.argsort().argsort()
    rank_xgb = probs_xgb.argsort().argsort()
    rank_cat = probs_cat.argsort().argsort()
    ensemble_rank = 0.6 * rank_lgb + 0.2 * rank_xgb + 0.2 * rank_cat

    # Label the top `top_pct` fraction of the fold as positive (same setting
    # as the final prediction step instead of a duplicated literal).
    n_top = max(1, int(len(y_fold_val) * top_pct))
    pred_ensemble = np.zeros_like(y_fold_val)
    pred_ensemble[np.argsort(-ensemble_rank)[:n_top]] = 1

    # AUC only depends on the ordering, so the blended rank is scored directly;
    # this also avoids a division by zero in a min-max rescale.
    auc_ens = roc_auc_score(y_fold_val, ensemble_rank)
    f1_ens = f1_score(y_fold_val, pred_ensemble)
    auc_list.append(auc_ens)
    f1_list.append(f1_ens)

    print(f"[Fold {fold+1}] ENSEMBLE | AUC = {auc_ens:.4f} | F1 = {f1_ens:.4f}")

print(f"\n[SK-Fold Validation Result (With Feature Selection)]")
print(f"Average AUC = {np.mean(auc_list):.4f} | Average F1 = {np.mean(f1_list):.4f}")


In [None]:
# === Build the positive / negative sample pools ===
total_pos_added = 0  # reset the budget so re-running this cell starts clean
X_pos_real = X_train[y_train == 1].reset_index(drop=True)
y_pos_real = np.ones(len(X_pos_real))
X_neg_all = X_train[y_train == 0].reset_index(drop=True)
y_neg_all = y_train[y_train == 0].copy()
neg_pool_indices = set(range(len(X_neg_all)))
# Start from 80% of the negatives; sample_negatives removes what it hands out
# from neg_pool_indices.
neg_sample_idx = sample_negatives(int(len(X_neg_all) * 0.8))
X_neg_real = X_neg_all.iloc[neg_sample_idx].reset_index(drop=True)
y_neg_real = y_neg_all[neg_sample_idx]
X_aug = pd.concat([X_pos_real, X_neg_real], ignore_index=True)
y_aug = np.concatenate([y_pos_real, y_neg_real])
sample_weight = np.ones(len(y_aug))
last_f1, no_improve_rounds = 0, 0
# NOTE(review): mask_pool is never updated, so every round scores all of X_train.
mask_pool = np.ones(len(X_train), dtype=bool)
all_error_features, all_is_correct = [], []
error_detector = None
pseudo_pos_counter = defaultdict(int)

# === Pseudo-labeling main loop ===
for pseudo_round in range(pseudo_rounds):
    # (1) Refit the three models on the current augmented set.
    model_lgb = fit_lgbm_with_early_stopping(model_lgb, X_aug, y_aug, sample_weight)
    model_xgb = fit_xgb_with_early_stopping(model_xgb, X_aug, y_aug, sample_weight)
    model_cat = fit_cat_with_early_stopping(model_cat, X_aug, y_aug, sample_weight)

    # Score the candidate pool.
    X_pool = X_train[mask_pool]
    y_pool = y_train[mask_pool]
    pool_indices = np.where(mask_pool)[0]
    n_candidates = len(X_pool)

    probs_lgb = model_lgb.predict_proba(X_pool)[:, 1]
    probs_xgb = model_xgb.predict_proba(X_pool)[:, 1]
    probs_cat = model_cat.predict_proba(X_pool)[:, 1]

    # Rank-based ensemble. argsort().argsort() yields ASCENDING ranks, i.e. the
    # most confident positive has the LARGEST rank.
    rank_lgb = probs_lgb.argsort().argsort()
    rank_xgb = probs_xgb.argsort().argsort()
    rank_cat = probs_cat.argsort().argsort()
    ensemble_rank = 0.6 * rank_lgb + 0.2 * rank_xgb + 0.2 * rank_cat
    ensemble_probs = ensemble_rank  # score used for the top-k selection

    # Agreement condition: all three probabilities within 0.05 of LightGBM's.
    cm = ((np.abs(probs_lgb - probs_xgb) < 0.05) & (np.abs(probs_cat - probs_lgb) < 0.05))

    # (2) Pick the most confident pseudo-positives (top-k + agreement filter).
    # The budget is clamped at 0: a negative slice bound would select almost
    # the whole pool instead of nothing.
    available_this_round = max(0, min(max_pos_per_round, max_total_pos - total_pos_added))
    # Descending order: highest blended rank first. An ascending argsort here
    # would pick the rows the ensemble considers LEAST likely to be positive.
    top_k_indices = np.argsort(-ensemble_probs)[:available_this_round]
    reusable = np.array(
        [pseudo_pos_counter[int(gidx)] < 10 for gidx in pool_indices[top_k_indices]],
        dtype=bool,
    )
    top_k_indices = top_k_indices[reusable]
    pos_idx = top_k_indices[cm[top_k_indices]]

    total_pos_added += len(pos_idx)

    # (3) Assemble this round's pseudo samples. The request is capped by what
    # is left in the negative pool so an exhausted pool cannot raise.
    n_neg_request = min(int(len(pos_idx) * neg_sample_ratio), len(neg_pool_indices))
    if n_neg_request > 0:
        neg_sample_idx = sample_negatives(n_neg_request)
    else:
        neg_sample_idx = np.array([], dtype=int)
    X_new_neg = X_neg_all.iloc[neg_sample_idx].reset_index(drop=True)
    y_new_neg = y_neg_all[neg_sample_idx]

    X_pseudo = pd.concat([X_pool.iloc[pos_idx], X_new_neg], ignore_index=True)
    y_pseudo_raw = np.concatenate([np.ones(len(pos_idx)), np.zeros(len(X_new_neg))])
    true_labels = np.concatenate([y_pool[pos_idx], y_new_neg])

    # Confidence grows with the blended rank (largest rank = most confident).
    pseudo_conf = np.concatenate([
        ensemble_rank[pos_idx] / (np.max(ensemble_rank) + 1e-6),
        np.zeros(len(X_new_neg))  # sampled negatives carry confidence 0
    ])

    # Track how often each pool row has been used as a pseudo-positive.
    for gidx in pool_indices[pos_idx]:
        pseudo_pos_counter[int(gidx)] += 1
    n_neg = len(X_new_neg)
    zero_logits = np.zeros(n_neg)

    # (4) Error detector: learns from model (dis)agreement whether a pseudo
    # label matches the true label.
    feat_lgb = np.concatenate([probs_lgb[pos_idx], zero_logits])
    feat_xgb = np.concatenate([probs_xgb[pos_idx], zero_logits])
    feat_cat = np.concatenate([probs_cat[pos_idx], zero_logits])
    ef = np.column_stack([
        feat_lgb,
        feat_xgb,
        feat_cat,
        pseudo_conf,
        np.abs(feat_lgb - feat_cat),
        np.std([feat_lgb, feat_xgb, feat_cat], axis=0)])
    all_error_features.append(ef)
    # Target is 1 when the pseudo label was CORRECT.
    all_is_correct.append((y_pseudo_raw == true_labels).astype(int))
    if pseudo_round >= 2:
        all_err_X = np.vstack(all_error_features)
        all_err_y = np.concatenate(all_is_correct)
        if len(np.unique(all_err_y)) < 2:
            print(f"[Pseudo Round {pseudo_round}] Error detector 未訓練（只有一類）")
            error_detector = None
        else:
            error_detector = lgb.LGBMClassifier(
                n_estimators=300,
                max_depth=6,
                learning_rate=0.05,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )
            X_tr_ed, X_val_ed, y_tr_ed, y_val_ed = train_test_split(
                all_err_X, all_err_y, test_size=0.2, stratify=all_err_y, random_state=42
            )
            error_detector.fit(
                X_tr_ed, y_tr_ed,
                eval_set=[(X_val_ed, y_val_ed)],
                eval_metric='auc',
                callbacks=[lgb.early_stopping(20), lgb.log_evaluation(period=0)]
            )
            val_acc = np.mean(error_detector.predict(X_val_ed) == y_val_ed)
            print(f"[Pseudo Round {pseudo_round}] Error Detector (LGB) Valid Accuracy = {val_acc:.4f}")
            if val_acc < 0.6:
                print(f"[Pseudo Round {pseudo_round}] Error Detector 表現太差（< 0.6），跳過使用")
                error_detector = None

    # keep_mask is reset every round so the summary below never reports a mask
    # left over from an earlier round.
    keep_mask = None
    if error_detector is not None and len(ef) > 0:
        proba = error_detector.predict_proba(ef)
        # Column 1 is P(pseudo label is correct) because the detector target
        # is `is_correct`.
        if proba.shape[1] == 2:
            p_correct = proba[:, 1]
        else:
            p_correct = np.ones(len(ef))
        # Keep the samples predicted to be correct (dropping them would invert the filter).
        keep_mask = p_correct >= 0.5
        kept = keep_mask.sum()
        if kept < 10:
            print(f"[Pseudo Round {pseudo_round}] Too few pseudo kept ({kept}/{len(keep_mask)}), fallback to original")
            keep_mask[:] = True
        else:
            print(f"[Pseudo Round {pseudo_round}] Kept {kept}/{len(keep_mask)} pseudo samples after filtering")
        X_pseudo = X_pseudo[keep_mask]
        y_pseudo_raw = y_pseudo_raw[keep_mask]
        true_labels = true_labels[keep_mask]
        pseudo_conf = pseudo_conf[keep_mask]

    # (5) Inner CV on the current augmented set, then append the new samples.
    skf_check = StratifiedKFold(n_splits=n_splits_p, shuffle=True, random_state=42)
    cv_probs = np.zeros(len(y_aug))
    for cv_tr, cv_val in skf_check.split(X_aug, y_aug):
        X_cv_tr, X_cv_val = X_aug.iloc[cv_tr], X_aug.iloc[cv_val]
        y_cv_tr = y_aug[cv_tr]
        w_cv_tr = sample_weight[cv_tr]
        cv_lgb = fit_lgbm_with_early_stopping(
            lgb.LGBMClassifier(**model_lgb.get_params()), X_cv_tr, y_cv_tr, sample_weight=w_cv_tr)
        cv_xgb = fit_xgb_with_early_stopping(
            xgb.XGBClassifier(**model_xgb.get_params()), X_cv_tr, y_cv_tr, sample_weight=w_cv_tr)
        cv_cat = fit_cat_with_early_stopping(
            CatBoostClassifier(**model_cat.get_params()), X_cv_tr, y_cv_tr, sample_weight=w_cv_tr)
        val_l = cv_lgb.predict_proba(X_cv_val)[:, 1]
        val_x = cv_xgb.predict_proba(X_cv_val)[:, 1]
        val_c = cv_cat.predict_proba(X_cv_val)[:, 1]

        # Plain rank average of the three models. (A softmax-weighted blend used
        # to be computed here as well, but it was always overwritten by this
        # value, so it has been removed.)
        rank_l = val_l.argsort().argsort()
        rank_x = val_x.argsort().argsort()
        rank_c = val_c.argsort().argsort()
        cv_probs[cv_val] = (rank_l + rank_x + rank_c) / 3

    # Predict as many positives as there are positive labels, then score F1.
    n_top = max(1, int(np.sum(y_aug)))
    threshold = np.sort(cv_probs)[-n_top]
    curr_f1 = f1_score(y_aug, (cv_probs >= threshold).astype(int))

    if len(y_pseudo_raw) > 0:
        n_incorrect = np.sum(y_pseudo_raw != true_labels)
        pseudo_precision = 1 - n_incorrect / len(y_pseudo_raw)
        pseudo_error_rate = n_incorrect / len(y_pseudo_raw)
        clip_min = base_clip + (max_clip - base_clip) * pseudo_error_rate
    else:
        n_incorrect = 0
        pseudo_precision = np.nan
        clip_min = max_clip
    # Correct pseudo labels are weighted by their confidence, wrong ones by clip_min.
    # NOTE(review): sampled negatives are always "correct" and carry confidence 0,
    # so they enter training with weight 0 — confirm this is intended.
    pseudo_weights = [conf if p == t else clip_min for p, t, conf in zip(y_pseudo_raw, true_labels, pseudo_conf)]
    X_aug = pd.concat([X_aug, X_pseudo], ignore_index=True)
    y_aug = np.concatenate([y_aug, true_labels])
    sample_weight = np.concatenate([sample_weight, pseudo_weights])
    print(f"[Pseudo Round {pseudo_round}] F1 = {curr_f1:.4f} / 上一輪 = {last_f1:.4f} | P: {len(pos_idx)}, N: {len(X_new_neg)}, Total: {len(y_aug)}")
    print(f"[Pseudo Round {pseudo_round}] 錯誤樣本: {n_incorrect} / {len(y_pseudo_raw)}（Precision = {pseudo_precision:.4f}）"
          if len(y_pseudo_raw) > 0 else f"[Pseudo Round {pseudo_round}] 沒有加入 pseudo 樣本")
    if keep_mask is not None:
        print(f"[Pseudo Round {pseudo_round}] keep_mask true count: {keep_mask.sum()} / {len(keep_mask)}")
    else:
        print(f"[Pseudo Round {pseudo_round}] 沒有使用 error detector，跳過 keep_mask")

    # (6) Early stop after two consecutive rounds improving F1 by less than 0.005.
    if curr_f1 - last_f1 < 0.005:
        no_improve_rounds += 1
        if no_improve_rounds >= 2:
            break
    else:
        no_improve_rounds = 0
    last_f1 = curr_f1

In [None]:
# === Final K-fold ensemble training and validation (n_splits is set on `skf` below) ===
def _build_meta_features(scores):
    """Turn a (3, n) array of [lgb, xgb, cat] probabilities into an (n, 16) meta-feature matrix.

    Columns: 0-2 raw scores, 3 min, 4 max, 5 mean, 6 std, 7-9 per-model ranks
    scaled by n, 10 mean rank, 11 std rank, 12 mean of the across-model softmax,
    13 lowest-rank agreement flag, 14 rank range, 15 mean absolute rank deviation.
    """
    n_rows = scores.shape[1]
    ranks = np.vstack([row.argsort().argsort() / n_rows for row in scores])
    softmax = np.exp(scores) / np.sum(np.exp(scores), axis=0)
    # Scalar flag, broadcast to every row: 1.0 when all three models put the
    # same sample at the lowest rank.
    # NOTE(review): the column is labelled "top1_agree" downstream but compares
    # argmin positions — confirm the intended definition.
    lowest = [int(r.argmin()) for r in ranks]
    top1_same = float(lowest[0] == lowest[1] and lowest[1] == lowest[2])

    out = np.empty((n_rows, 16))
    out[:, :3] = scores.T
    out[:, 3] = np.min(scores, axis=0)
    out[:, 4] = np.max(scores, axis=0)
    out[:, 5] = np.mean(scores, axis=0)
    out[:, 6] = np.std(scores, axis=0)
    out[:, 7:10] = ranks.T
    out[:, 10] = np.mean(ranks, axis=0)
    out[:, 11] = np.std(ranks, axis=0)
    out[:, 12] = np.mean(softmax, axis=0)
    out[:, 13] = top1_same
    out[:, 14] = np.ptp(ranks, axis=0)
    out[:, 15] = np.abs(ranks - np.mean(ranks, axis=0, keepdims=True)).mean(axis=0)
    return out


print("\n[SK-Fold] 開始對 pseudo-label 擴充後的資料做交叉驗證與 Meta-Features 建構...")
final_auc_list, final_f1_list = [], []
model_lgbs, model_xgbs, model_cats = [], [], []

meta_X = np.zeros((len(X_aug), 16))
meta_test = np.zeros((len(X_test), 16))

# Per-fold test predictions, averaged after the loop.
probs_lgb_test_all = []
probs_xgb_test_all = []
probs_cat_test_all = []

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_aug, y_aug)):
    X_tr, X_val = X_aug.iloc[tr_idx], X_aug.iloc[val_idx]
    y_tr, y_val = y_aug[tr_idx], y_aug[val_idx]
    w_tr = sample_weight[tr_idx]

    model_lgb_f = fit_lgbm_with_early_stopping(lgb.LGBMClassifier(**model_lgb.get_params()), X_tr, y_tr, sample_weight=w_tr)
    model_xgb_f = fit_xgb_with_early_stopping(xgb.XGBClassifier(**model_xgb.get_params()), X_tr, y_tr, sample_weight=w_tr)
    model_cat_f = fit_cat_with_early_stopping(CatBoostClassifier(**model_cat.get_params()), X_tr, y_tr, sample_weight=w_tr)

    # Keep the fold models.
    model_lgbs.append(model_lgb_f)
    model_xgbs.append(model_xgb_f)
    model_cats.append(model_cat_f)

    # Validation predictions.
    val_lgb = model_lgb_f.predict_proba(X_val)[:, 1]
    val_xgb = model_xgb_f.predict_proba(X_val)[:, 1]
    val_cat = model_cat_f.predict_proba(X_val)[:, 1]

    # Test predictions: stored once here so they do not have to be recomputed
    # for the final ensemble.
    test_lgb = model_lgb_f.predict_proba(X_test)[:, 1]
    test_xgb = model_xgb_f.predict_proba(X_test)[:, 1]
    test_cat = model_cat_f.predict_proba(X_test)[:, 1]
    probs_lgb_test_all.append(test_lgb)
    probs_xgb_test_all.append(test_xgb)
    probs_cat_test_all.append(test_cat)

    # === Rank-ensemble evaluation on the fold ===
    rank_lgb = val_lgb.argsort().argsort()
    rank_xgb = val_xgb.argsort().argsort()
    rank_cat = val_cat.argsort().argsort()
    ensemble_rank = 0.6 * rank_lgb + 0.2 * rank_xgb + 0.2 * rank_cat
    val_probs = ensemble_rank
    # fold_auc / fold_f1: avoid rebinding the names `auc` and, in particular,
    # `f1`, which is also the metric helper defined in the first cell.
    fold_auc = roc_auc_score(y_val, (val_probs - val_probs.min()) / (val_probs.max() - val_probs.min()))
    n_top = max(1, int(len(y_val) * top_pct))
    fold_f1 = f1_score(y_val, (val_probs >= np.sort(val_probs)[-n_top]).astype(int))
    final_auc_list.append(fold_auc)
    final_f1_list.append(fold_f1)
    print(f"[Fold {fold+1}] AUC = {fold_auc:.4f} | F1 = {fold_f1:.4f}")

    # === Meta-features: out-of-fold rows for train, fold average for test ===
    s_val = np.vstack([val_lgb, val_xgb, val_cat])
    s_test = np.vstack([test_lgb, test_xgb, test_cat])
    meta_X[val_idx, :] = _build_meta_features(s_val)
    meta_test += _build_meta_features(s_test) / skf.n_splits

print(f"\n [Pseudo-After SK-Fold] 平均 AUC = {np.mean(final_auc_list):.4f} | 平均 F1 = {np.mean(final_f1_list):.4f}")

# === Post-processing: fixed top-k for the test part, top fraction for the private part ===
probs_lgb_test = np.mean(probs_lgb_test_all, axis=0)
probs_xgb_test = np.mean(probs_xgb_test_all, axis=0)
probs_cat_test = np.mean(probs_cat_test_all, axis=0)
rank_lgb = probs_lgb_test.argsort().argsort()
rank_xgb = probs_xgb_test.argsort().argsort()
rank_cat = probs_cat_test.argsort().argsort()
ensemble_rank = 0.6 * rank_lgb + 0.2 * rank_xgb + 0.2 * rank_cat

final_preds_pseudo_ensemble = np.zeros_like(ensemble_rank, dtype=int)

# Public test rows: the fixed_test_topk highest-ranked rows become positive.
test_probs = ensemble_rank[is_test]
top_test_idx = np.argsort(-test_probs)[:fixed_test_topk]
final_preds_pseudo_ensemble[is_test] = 0
final_preds_pseudo_ensemble[np.where(is_test)[0][top_test_idx]] = 1

# Private rows: the top `top_pct` fraction becomes positive.
private_probs = ensemble_rank[is_private]
n_top_private = max(1, int(len(private_probs) * top_pct))
top_private_idx = np.argsort(-private_probs)[:n_top_private]
final_preds_pseudo_ensemble[is_private] = 0
final_preds_pseudo_ensemble[np.where(is_private)[0][top_private_idx]] = 1

pd.DataFrame({"ID": ids, "飆股": final_preds_pseudo_ensemble}).to_csv("FILE_PATH", index=False, encoding="utf-8", lineterminator="\n")
print(" Pseudo-labeling 後立即預測的結果已儲存：1p.csv")

In [None]:
# === Stacking on the 16 meta-features, meta-feature selection, final submission ===
def _best_f1_threshold(y_true, probs, thresholds):
    """Return ``(best_f1, best_threshold)`` over ``thresholds`` for ``probs > t``.

    Starts from ``(0, 0.5)``, so 0.5 is returned when no threshold yields a
    positive F1.
    """
    best_score, best_t = 0, 0.5
    for t in thresholds:
        score = f1_score(y_true, (probs > t).astype(int))
        if score > best_score:
            best_score, best_t = score, t
    return best_score, best_t


# Fresh, unfitted copies of the base learners. early_stopping_rounds is dropped
# because no eval_set is passed to these fits.
params_lgb_s = model_lgb.get_params().copy()
params_lgb_s.pop('early_stopping_rounds', None)
model_lgb_s = lgb.LGBMClassifier(**params_lgb_s)

params_xgb_s = model_xgb.get_params().copy()
params_xgb_s.pop('early_stopping_rounds', None)
model_xgb_s = xgb.XGBClassifier(**params_xgb_s)

# Own name (params_cat_s): rebinding params_cat would overwrite the CatBoost
# parameters loaded from JSON that the initial-validation cell relies on.
params_cat_s = model_cat.get_params().copy()
params_cat_s.pop('early_stopping_rounds', None)
params_cat_s.pop('task_type', None)
model_cat_s = CatBoostClassifier(**params_cat_s, task_type="CPU")

base_estimators = [
    ('lgb', model_lgb_s),
    ('xgb', model_xgb_s),
    ('cat', model_cat_s),
    ('et', ExtraTreesClassifier(n_estimators=300, max_depth=6, random_state=42, n_jobs=-1))]

# Column names of meta_X / meta_test, in column order.
meta_features = [
    "val_lgb", "val_xgb", "val_cat",
    "min_prob", "max_prob", "mean_prob", "std_prob",
    "rank_lgb", "rank_xgb", "rank_cat",
    "mean_rank", "std_rank", "softmax_mean",
    "top1_agree", "rank_ptp", "rank_bias"]

meta_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegressionCV(Cs=10, cv=5, scoring='f1', max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1),
    passthrough=True, n_jobs=-1)

# Fit and predict on plain arrays throughout: fitting on an ndarray and then
# predicting on a named DataFrame gives the estimators inconsistent feature names.
meta_model.fit(meta_X, y_aug)
# NOTE: these are predictions on the rows the model was fitted on, not out-of-fold ones.
oof_avg = meta_model.predict_proba(meta_X)[:, 1]
final_probs = meta_model.predict_proba(meta_test)[:, 1]

thresholds = np.arange(0.0, 0.99, 0.01)
best_f1, best_thresh = _best_f1_threshold(y_aug, oof_avg, thresholds)

auc = roc_auc_score(y_aug, oof_avg)
print(f"AUC = {auc:.4f}")

print(f"F1 Score = {f1_score(y_aug, (oof_avg > best_thresh).astype(int)):.4f} "
      f"Precision = {precision_score(y_aug, (oof_avg > best_thresh).astype(int)):.4f} "
      f"Recall = {recall_score(y_aug, (oof_avg > best_thresh).astype(int)):.4f} "
      f"Threshold = {best_thresh:.4f}")

result = permutation_importance(
    meta_model, meta_X, y_aug,
    scoring='f1', n_repeats=10, random_state=42, n_jobs=-1)

importance_df = pd.DataFrame({
    "Feature": meta_features,
    "Mean_Importance": result.importances_mean,
    "Std_Dev": result.importances_std
}).sort_values(by="Mean_Importance", ascending=False)

print(" Permutation Importance (Top 5):")
print(importance_df.head())

# Keep the meta-features whose permutation importance is positive.
selected_meta_features = importance_df[importance_df["Mean_Importance"] > 0]["Feature"].tolist()
if len(selected_meta_features) == 0:
    print(" 無重要特徵被選中，將 fallback 使用全部 meta features")
    selected_meta_features = meta_features
print(" 選中的 meta features:", selected_meta_features)

meta_X_selected = pd.DataFrame(meta_X, columns=meta_features)[selected_meta_features].to_numpy()
meta_test_selected = pd.DataFrame(meta_test, columns=meta_features)[selected_meta_features].to_numpy()
meta_model_new = LogisticRegressionCV(Cs=10, cv=5, scoring='f1', max_iter=1000,
                                      class_weight='balanced', random_state=42, n_jobs=-1)
meta_model_new.fit(meta_X_selected, y_aug)

oof_selected = meta_model_new.predict_proba(meta_X_selected)[:, 1]
final_probs_retrained = meta_model_new.predict_proba(meta_test_selected)[:, 1]

best_f1, best_thresh = _best_f1_threshold(y_aug, oof_selected, thresholds)

auc = roc_auc_score(y_aug, oof_selected)
print(f"AUC = {auc:.4f}")
print(f" New F1 after feature selection: {best_f1:.4f}")

# Thresholded labels first; rows flagged as test / private are then replaced by
# the fixed top-k / top-fraction rule.
final_preds = (final_probs_retrained > best_thresh).astype(int)
test_probs = final_probs_retrained[is_test]
top_test_idx = np.argsort(-test_probs)[:fixed_test_topk]
final_preds[is_test] = 0
final_preds[np.where(is_test)[0][top_test_idx]] = 1

private_probs = final_probs_retrained[is_private]
n_top_private = max(1, int(len(private_probs) * top_pct))
top_private_idx = np.argsort(-private_probs)[:n_top_private]
final_preds[is_private] = 0
final_preds[np.where(is_private)[0][top_private_idx]] = 1

print(f"選中：test={final_preds[is_test].sum()}，private={final_preds[is_private].sum()}")

# Only ID and the prediction are written.
pd.DataFrame({"ID": ids, "飆股": final_preds}).to_csv("FILE_PATH", index=False, encoding="utf-8", lineterminator="\n")