In [1]:
import polars as pl
import pandas as pd

# 讀取 parquet
train_filled = pl.read_parquet("data/train_with_companyID_features.parquet")

# 讀取top50 features
n_top =160
save_dir = "model_output/selected_features_xgb/one_model/v1_base_features/with_companyID_features/v1_model_no_gpu/"
df = pd.read_csv("model_output/all_features_xgb/v1_base_features/with_companyID_features/v1_model/model_importance/common_features_with_ranks.csv")
top50_features = df["feature"].head(n_top).tolist()
top50_features = [f for f in top50_features if f != "companyID"]

# 你要保留的欄位 (Top50 + target + group id)
cols_to_keep = top50_features + ["selected", "ranker_id"]

# 只保留這些欄位
train_filled = train_filled.select(cols_to_keep)


print(f"✅ DataFrame 現在只有 {len(train_filled.columns)} 欄位: {train_filled.columns}")


✅ DataFrame 現在只有 162 欄位: ['price_from_median_zscore', 'price_percentile', 'total_is_min_transfers', 'legs0_is_min_transfers', 'legs0_num_transfers_rank', 'total_num_transfers_rank', 'price_per_duration_rank', 'legs1_num_transfers_rank', 'price_per_duration', 'legs1_is_min_transfers', 'totalPrice_rank', 'legs0_segments0_flightNumber', 'legs0_arrivalAt_hour', 'price_per_fee_rank', 'price_per_tax', 'price_minus_fee_rank', 'legs0_mean_cabin', 'legs0_departureAt_hour', 'price_per_fee', 'legs0_weighted_mean_cabin', 'free_exchange', 'days_before_departure', 'group_size', 'legs0_segments0_baggageAllowance_quantity', 'legs0_max_duration_cabin', 'leg0_view_diff_mean', 'baggage_total_rank', 'companyID_loo_mean_legs0_departureAt_hour', 'total_num_transfers', 'both_legs_carrier_all_same', 'legs0_segments0_cabinClass', 'all_view_diff_mean', 'log_price', 'pricingInfo_isAccessTP', 'price_minus_fee', 'total_weighted_mean_cabin', 'companyID_loo_mean_taxes', 'has_access_tp', 'total_fees', 'companyID_loo_

In [20]:
# 要轉成 int 的欄位
cols_to_int = [
    "pricingInfo_isAccessTP",
    "legs0_segments0_baggageAllowance_quantity",
    "legs1_segments0_baggageAllowance_quantity",
    "miniRules1_statusInfos",
    "baggage_total",
    "legs0_segments0_seatsAvailable",
    "miniRules1_monetaryAmount",
    "total_fees",
    "price_minus_fee",
    "taxes",
    "totalPrice",
    "legs1_segments0_seatsAvailable"
]

# 先檢查哪些欄位存在
existing_cols = [c for c in cols_to_int if c in train_filled.columns]

print(f"✅ 共找到 {len(existing_cols)} 個存在的欄位要轉 int: {existing_cols}")

# 做轉型
train_filled = train_filled.with_columns([
    pl.col(c).fill_null(0).cast(pl.Int32).alias(c)
    for c in existing_cols
])

✅ 共找到 10 個存在的欄位要轉 int: ['pricingInfo_isAccessTP', 'legs0_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_quantity', 'baggage_total', 'legs0_segments0_seatsAvailable', 'miniRules1_monetaryAmount', 'total_fees', 'price_minus_fee', 'taxes', 'totalPrice']


In [2]:
exclude_cols = [
    'Id', 'ranker_id', 'selected',
    'profileId', 'requestDate',
    'legs0_departureAt', 'legs0_arrivalAt', 'legs1_departureAt', 'legs1_arrivalAt',
    'miniRules0_percentage', 'miniRules1_percentage',  # >90% missing
    'frequentFlyer',  # Already processed
    # Exclude constant columns
    'pricingInfo_passengerCount'
]

feature_cols = [col for col in train_filled.columns if col not in exclude_cols]

print(f"Using {len(feature_cols)} features")

X = train_filled.select(feature_cols)
y = train_filled.select('selected')
groups = train_filled.select('ranker_id')


Using 160 features


In [3]:
import numpy as np
import polars as pl
import xgboost as xgb
import gc

# ===== 你原本的 exclude 與 feature_cols =====
exclude_cols = [
    'Id', 'ranker_id', 'selected',
    'profileId', 'requestDate',
    'legs0_departureAt', 'legs0_arrivalAt', 'legs1_departureAt', 'legs1_arrivalAt',
    'miniRules0_percentage', 'miniRules1_percentage',
    'frequentFlyer',
    'pricingInfo_passengerCount'
]
feature_cols = [c for c in train_filled.columns if c not in exclude_cols]
print(f"Using {len(feature_cols)} features")
import numpy as np
import polars as pl
import xgboost as xgb
import gc

# ===== 你的既有設定 =====
categorical_cols = [
    'legs0_segments0_flightNumber',
    'legs1_segments0_flightNumber',
    'legs0_segments0_cabinClass',
    'legs1_segments0_cabinClass',
    'legs1_segments0_marketingCarrier_code',
    'corporateTariffCode',
    'is_major_carrier',
    'isVip',
    'has_baggage',
    'has_access_tp',
    'free_exchange',
    'free_cancel',
    'companyID',
]
categorical_cols = [c for c in categorical_cols if c in feature_cols]
numeric_cols = [c for c in feature_cols if c not in categorical_cols]

X = train_filled.select(feature_cols)
y = train_filled.select('selected')
groups = train_filled.select('ranker_id')

# ===== split by ranker_id (同你原本流程) =====
unique_rankers = groups.select("ranker_id").unique().to_series().to_list()
np.random.seed(42); np.random.shuffle(unique_rankers)
n_train = int(0.8 * len(unique_rankers))
train_rankers = set(unique_rankers[:n_train])

is_train = groups.select(pl.col("ranker_id").is_in(list(train_rankers)).alias("is_train"))
X_with_mask = X.with_columns(is_train)
y_with_mask = y.with_columns(is_train)
groups_with_mask = groups.with_columns(is_train)

X_train_df = X_with_mask.filter(pl.col("is_train"))
X_val_df   = X_with_mask.filter(~pl.col("is_train"))
y_train_df = y_with_mask.filter(pl.col("is_train"))
y_val_df   = y_with_mask.filter(~pl.col("is_train"))
groups_train_df = groups_with_mask.filter(pl.col("is_train"))
groups_val_df   = groups_with_mask.filter(~pl.col("is_train"))

# ===== 這裡是關鍵修正：用 StringCache + 先轉 Utf8 再轉 Categorical，再取整數碼 =====
with pl.StringCache():
    def encode_cats(df: pl.DataFrame) -> pl.DataFrame:
        if not categorical_cols:
            return df
        cat_exprs = []
        for c in categorical_cols:
            # 1) 先轉成字串 (Utf8)；2) 再轉 Categorical；3) 取實體碼 (u32)；4) 轉成 Int32；5) 空值補 -1
            cat_exprs.append(
                pl.col(c)
                  .cast(pl.Utf8)
                  .cast(pl.Categorical)
                  .to_physical()           # UInt32
                  .cast(pl.Int32)          # Int32 才能有 missing=-1
                  .fill_null(-1)
                  .alias(c)
            )
        return df.with_columns(cat_exprs)

    X_train_df = encode_cats(X_train_df)
    X_val_df   = encode_cats(X_val_df)

# 其他數值欄位轉 float32（省記憶體）
num_exprs = [pl.col(c).cast(pl.Float32).alias(c) for c in numeric_cols]
X_train_df = X_train_df.with_columns(num_exprs)
X_val_df   = X_val_df.with_columns(num_exprs)

# ===== 轉 NumPy =====
X_train_np = X_train_df.drop("is_train").to_numpy()
X_val_np   = X_val_df.drop("is_train").to_numpy()
y_train_np = y_train_df.drop("is_train").to_numpy().astype(np.float32).ravel()
y_val_np   = y_val_df.drop("is_train").to_numpy().astype(np.float32).ravel()

# ===== group sizes =====
group_sizes_train = (
    groups_train_df.drop("is_train")
    .group_by("ranker_id", maintain_order=True)
    .agg(pl.len())['len'].to_numpy()
)
group_sizes_val = (
    groups_val_df.drop("is_train")
    .group_by("ranker_id", maintain_order=True)
    .agg(pl.len())['len'].to_numpy()
)

# ===== 告訴 XGBoost 哪些是類別、哪些是連續 =====
feature_types = ['c' if c in categorical_cols else 'q' for c in feature_cols]

# ===== 建 DMatrix（注意 missing=-1 對應我們上面填的 -1）=====
dtrain = xgb.DMatrix(
    X_train_np,
    label=y_train_np,
    feature_names=feature_cols,
    feature_types=feature_types,
    enable_categorical=True,
    missing=-1
)
dtrain.set_group(group_sizes_train)

dval = xgb.DMatrix(
    X_val_np,
    label=y_val_np,
    feature_names=feature_cols,
    feature_types=feature_types,
    enable_categorical=True,
    missing=-1
)
dval.set_group(group_sizes_val)

del X_train_np, y_train_np, group_sizes_train
gc.collect()


Using 81 features


7

In [3]:
import numpy as np
import polars as pl

# 確認這些物件都是Polars DataFrame
# X, y, groups
# 都是 shape [n_rows, n_cols]

# 先把 ranker_id轉list
unique_rankers = groups.select("ranker_id").unique().to_series().to_list()

# 打亂
np.random.seed(42)
np.random.shuffle(unique_rankers)

# 切8:2
n_train = int(0.8 * len(unique_rankers))
train_rankers = set(unique_rankers[:n_train])
val_rankers = set(unique_rankers[n_train:])

# 用 Polars 過濾 train/val
is_train = groups.select(pl.col("ranker_id").is_in(list(train_rankers)).alias("is_train"))

# 先 concat mask
X_with_mask = X.with_columns(is_train)
y_with_mask = y.with_columns(is_train)
groups_with_mask = groups.with_columns(is_train)

# 分割 DataFrame
X_train_df = X_with_mask.filter(pl.col("is_train"))
X_val_df = X_with_mask.filter(~pl.col("is_train"))
y_train_df = y_with_mask.filter(pl.col("is_train"))
y_val_df = y_with_mask.filter(~pl.col("is_train"))
groups_train_df = groups_with_mask.filter(pl.col("is_train"))
groups_val_df = groups_with_mask.filter(~pl.col("is_train"))

# 再轉 numpy (分批)
X_train_np = X_train_df.drop("is_train").to_numpy()
X_val_np = X_val_df.drop("is_train").to_numpy()
y_train_np = y_train_df.drop("is_train").to_numpy().flatten()
y_val_np = y_val_df.drop("is_train").to_numpy().flatten()
groups_train_np = groups_train_df.drop("is_train").to_numpy().flatten()
groups_val_np = groups_val_df.drop("is_train").to_numpy().flatten()
# 最後計算 group sizes
group_sizes_train = (
    groups_train_df.drop("is_train")
    .group_by("ranker_id", maintain_order=True)
    .agg(pl.len())['len']
    .to_numpy()
)

group_sizes_val = (
    groups_val_df.drop("is_train")
    .group_by("ranker_id", maintain_order=True)
    .agg(pl.len())['len']
    .to_numpy()
)



In [None]:
import gc
del train_filled, X_train_df,X_val_df,X_with_mask,  y_train_df,y_val_df,y_with_mask, groups, groups_train_df, groups_val_df, is_train, groups_with_mask

gc.collect()

In [4]:
import xgboost as xgb
import gc

# DMatrix 建立 (不再用 X.columns)
dtrain = xgb.DMatrix(
    X_train_np,
    label=y_train_np,
    feature_names = feature_cols,
)
dtrain.set_group(group_sizes_train)
del X_train_np, y_train_np, group_sizes_train
gc.collect()
dval = xgb.DMatrix(
    X_val_np,
    label=y_val_np,
    feature_names = feature_cols,
)
dval.set_group(group_sizes_val)
del X_val_np
gc.collect()


7

In [5]:
import xgboost as xgb
import joblib, os, json, numpy as np
from scripts.hitrate import compute_hitrate_at_3
model_dir = os.path.join(save_dir, f"top{n_top}")
os.makedirs(model_dir, exist_ok=True)

params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@3',
    "learning_rate": 0.022641389657079056,
    "max_depth": 14,
    "min_child_weight": 2,
    "subsample": 0.8842234913702768,
    "colsample_bytree": 0.45840689146263086,
    "gamma": 3.3084297630544888,
    "lambda": 6.952586917313028,
    "alpha": 0.6395254133055179,
    'seed': 42,
    'n_jobs': -1,
    # 'device': 'cuda',
    # "tree_method": 'hist',
    # "predictor": 'gpu_predictor',
}
with open(os.path.join(model_dir, "xgb_params.json"), "w") as f:
    json.dump(params, f, indent=2)

# # 權重
# def make_sample_weights(y, pos_weight=10.0):
#     return np.where(y == 1, pos_weight, 1.0)

# w_train = make_sample_weights(y_train_np)
# w_val = make_sample_weights(y_val_np)

# dtrain = xgb.DMatrix(X_train_np, label=y_train_np, weight=w_train)
# dtrain.set_group(group_sizes_train)

# dval = xgb.DMatrix(X_val_np, label=y_val_np, weight=w_val)
# dval.set_group(group_sizes_val)

evals = [(dtrain, "train"), (dval, "val")]

# 訓練 + callback 模擬 early stopping + hitrate
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=20,
)

# 儲存最佳模型（可轉成 SHAP）
model_path = os.path.join(model_dir, "xgb_ranker_model.pkl")
joblib.dump(xgb_model, model_path)
print(f"✅ 已儲存最佳模型：{model_path}")


[0]	train-ndcg@3:0.36168	val-ndcg@3:0.35263
[20]	train-ndcg@3:0.58582	val-ndcg@3:0.46729
[40]	train-ndcg@3:0.64054	val-ndcg@3:0.48642
[60]	train-ndcg@3:0.66892	val-ndcg@3:0.49587
[80]	train-ndcg@3:0.69047	val-ndcg@3:0.50149
[100]	train-ndcg@3:0.70729	val-ndcg@3:0.50782
[120]	train-ndcg@3:0.72333	val-ndcg@3:0.51333
[140]	train-ndcg@3:0.73928	val-ndcg@3:0.51874
[160]	train-ndcg@3:0.75076	val-ndcg@3:0.52257
[180]	train-ndcg@3:0.76285	val-ndcg@3:0.52629
[200]	train-ndcg@3:0.77352	val-ndcg@3:0.52853
[220]	train-ndcg@3:0.78228	val-ndcg@3:0.53174
[240]	train-ndcg@3:0.79240	val-ndcg@3:0.53447
[260]	train-ndcg@3:0.80158	val-ndcg@3:0.53728
[280]	train-ndcg@3:0.81044	val-ndcg@3:0.53925
[300]	train-ndcg@3:0.82006	val-ndcg@3:0.54224
[320]	train-ndcg@3:0.82909	val-ndcg@3:0.54480
[340]	train-ndcg@3:0.83733	val-ndcg@3:0.54749
[360]	train-ndcg@3:0.84369	val-ndcg@3:0.54989
[380]	train-ndcg@3:0.84873	val-ndcg@3:0.55066
[400]	train-ndcg@3:0.85313	val-ndcg@3:0.55217
[420]	train-ndcg@3:0.85839	val-ndcg@3:0.

In [6]:
import polars as pl
from scripts.hitrate import compute_hitrate_at_3

# 預測
val_preds = xgb_model.predict(dval)


# 計算 HitRate
hitrate = compute_hitrate_at_3(groups_val_np, y_val_np, val_preds)
hitrate_records = []
hitrate_records.append({"split_label": "overall", "hitrate": hitrate})


hitrate_df = pl.DataFrame(hitrate_records)
csv_path = os.path.join(model_dir, "hitrate_summary.csv")
hitrate_df.write_csv(csv_path)
print(f"\n✅ 已儲存所有 Hitrate 結果至 {csv_path}")



✅ HitRate@3 (groups size in [10, inf]): 0.6155

✅ 已儲存所有 Hitrate 結果至 model_output/selected_features_xgb/one_model/v1_base_features/with_companyID_features/v1_model_no_gpu/top160\hitrate_summary.csv


In [7]:
import pandas as pd

# 取三種重要性
importance_types = ["weight", "gain", "cover"]
importance_all = {}

for imp_type in importance_types:
    imp_raw = xgb_model.get_score(importance_type=imp_type)
    imp_named = {}
    for k, v in imp_raw.items():
        imp_named[k] = v
    # 排序
    sorted_imp = sorted(imp_named.items(), key=lambda x: x[1], reverse=True)
    importance_all[imp_type] = sorted_imp

# 把三個榜單放成DataFrame方便比對
df_weight = pd.DataFrame(importance_all["weight"], columns=["feature", "weight_rank"])
df_weight["weight_rank_pos"] = df_weight.index

df_gain = pd.DataFrame(importance_all["gain"], columns=["feature", "gain_rank"])
df_gain["gain_rank_pos"] = df_gain.index

df_cover = pd.DataFrame(importance_all["cover"], columns=["feature", "cover_rank"])
df_cover["cover_rank_pos"] = df_cover.index

# 合併
df_merged = (
    df_weight
    .merge(df_gain, on="feature", how="outer")
    .merge(df_cover, on="feature", how="outer")
)

# 把不存在的rank補大數字
df_merged["weight_rank_pos"] = df_merged["weight_rank_pos"].fillna(9999)
df_merged["gain_rank_pos"] = df_merged["gain_rank_pos"].fillna(9999)
df_merged["cover_rank_pos"] = df_merged["cover_rank_pos"].fillna(9999)

# 計算「三個榜單中最早出現的位置」
df_merged["min_rank"] = df_merged[["weight_rank_pos", "gain_rank_pos", "cover_rank_pos"]].min(axis=1)

# 排序
df_merged_sorted = df_merged.sort_values("min_rank")

# 取前50
top50 = df_merged_sorted.head(50)

# 顯示
print(top50[["feature", "weight_rank", "gain_rank", "cover_rank"]])
# 如果想輸出CSV
csv_path = os.path.join(model_dir, "feature_importance.csv")

df_merged_sorted.to_csv(csv_path, index=False)
print(f"✅ 已輸出{csv_path}")


                                         feature  weight_rank    gain_rank  \
154                       total_is_min_transfers        282.0  3452.116699   
141                             price_percentile      34435.0    15.923947   
136                           price_per_duration      31118.0    11.395290   
156                     total_num_transfers_rank        275.0  2354.525635   
61                        legs0_is_min_transfers        265.0  1113.544434   
133                     price_from_median_zscore      29566.0    17.590914   
76                  legs0_segments0_flightNumber      28566.0     8.568955   
67                      legs0_num_transfers_rank        145.0  1093.193604   
142                       pricingInfo_isAccessTP       1522.0    65.995598   
137                      price_per_duration_rank      26055.0     9.279482   
28                                 free_exchange        430.0   271.855621   
52                          legs0_arrivalAt_hour      25607.0   

# Shap 分析

In [None]:
import shap
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# 參數

import xgboost as xgb
model_path = "model_output/selected_features_xgb/one_model/features_v1_with_company_ID/model_par_4/top100/xgb_ranker_model.bin"

# 讀取模型
xgb_model = xgb.Booster(model_file=model_path)
# 隨機抽樣 index（使用 polars 的 row sampling）
sample_idx = np.random.default_rng(42).choice(len(X), size=50000, replace=False)
X_sample_pl = X[sample_idx]


In [None]:
import xgboost as xgb
import joblib
import pandas as pd
import shap
import numpy as np
import os
import matplotlib.pyplot as plt

# # ==== Config ====
# model_bin_path = "model_output/selected_features_xgb/one_model/features_v1_with_company_ID/model_par_4/top100/xgb_ranker_model.bin"
# model_pkl_path = model_bin_path.replace(".bin", ".pkl")
# shap_dir = os.path.dirname(model_pkl_path)

# # ==== Load Booster & Convert to Regressor ====
# booster = xgb.Booster()
# booster.load_model(model_bin_path)

# xgb_reg = xgb.XGBRegressor()
# xgb_reg._Booster = booster
# xgb_reg.n_features_in_ = booster.num_features()

# # 儲存為 .pkl
# joblib.dump(xgb_reg, model_pkl_path)
# print(f"✅ Booster 已儲存為: {model_pkl_path}")

# ==== Load X_sample and Compute SHAP ====
# 假設你已經有 X 可以用來取樣
X_sample = X.sample(n=25000, random_state=42)

explainer = shap.Explainer(xgb_model, X_sample)
shap_values = explainer(X_sample)

# 儲存 SHAP 值
np.save(os.path.join(xgb_model, "shap_values.npy"), shap_values.values)
X_sample.to_parquet(os.path.join(xgb_model, "shap_input.parquet"))

# SHAP summary plot
plt.figure()
shap.summary_plot(shap_values, X_sample)
plt.tight_layout()
# plt.savefig(os.path.join(shap_dir, "shap_summary.png"))
plt.close()

# SHAP bar plot
plt.figure()
shap.summary_plot(shap_values, X_sample, plot_type="bar")
plt.tight_layout()
# plt.savefig(os.path.join(shap_dir, "shap_bar.png"))
plt.close()

# Top 20 features importance to CSV
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
top_features = pd.Series(mean_abs_shap, index=X_sample.columns).sort_values(ascending=False)
top_features[:20].to_csv(os.path.join(shap_dir, "shap_top20.csv"))

print("✅ SHAP 值與圖表已儲存完畢")

In [None]:
import xgboost as xgb
import shap
import numpy as np
import pandas as pd
import joblib
import os
import matplotlib.pyplot as plt

# # === 參數 ===
# bin_model_path = "model_output/selected_features_xgb/one_model/features_v1_with_company_ID/model_par_4/top100/xgb_ranker_model.bin"
# model_pkl_path = "model_output/selected_features_xgb/one_model/features_v1_with_company_ID/model_par_4/top100/xgb_ranker_model.pkl"
# model_dir = os.path.dirname(model_pkl_path)
# os.makedirs(model_dir, exist_ok=True)

# # ✅ 載入 Booster
# booster = xgb.Booster()
# booster.load_model(bin_model_path)

# # ✅ 建立 XGBRegressor wrapper
# xgb_reg = xgb.XGBRegressor()
# xgb_reg._Booster = booster

# # ✅ 手動設定必要屬性
# xgb_reg._features_count = booster.num_features()

# # 假設是二分類（此步驟可能視 SHAP 或 sklearn 需求）
# class DummyLabelEncoder:
#     def transform(self, x): return x
#     def inverse_transform(self, x): return x
# xgb_reg._le = DummyLabelEncoder()

# # ✅ 儲存 .pkl
# joblib.dump(xgb_reg, model_pkl_path)
# print(f"✅ Booster 已轉換並儲存為: {model_pkl_path}")

# ✅ 準備 SHAP 輸入
sample_idx = np.random.default_rng(42).choice(len(X), size=5000, replace=False)
X_sample_pl = X[sample_idx]
X_sample = X_sample_pl.to_pandas()
explainer = shap.Explainer(xgb_model, X_sample)
shap_values = explainer(X_sample)

# ✅ 儲存 SHAP 結果
np.save(os.path.join(model_dir, "shap_values.npy"), shap_values.values)
X_sample.to_parquet(os.path.join(model_dir, "shap_input.parquet"))

# ✅ 前 20 特徵重要性 CSV
importance_df = pd.DataFrame({
    "feature": X_sample.columns,
    "mean_abs_shap": np.abs(shap_values.values).mean(axis=0)
}).sort_values("mean_abs_shap", ascending=False)

importance_df.head(20).to_csv(os.path.join(model_dir, "shap_top20.csv"), index=False)

# ✅ summary plot
shap.summary_plot(shap_values, X_sample, show=False)
plt.savefig(os.path.join(model_dir, "shap_summary.png"), bbox_inches="tight")
plt.close()

# ✅ bar plot
shap.summary_plot(shap_values, X_sample, plot_type="bar", show=False)
plt.savefig(os.path.join(model_dir, "shap_bar.png"), bbox_inches="tight")
plt.close()

print("✅ SHAP 分析與圖形儲存完畢")


In [None]:

# 只轉 sample 的 subset 成 pandas，速度快、記憶體小
X_sample = X_sample_pl.to_pandas()


In [None]:
dX_sample = xgb.DMatrix(X_sample, feature_names=X_sample.columns.tolist())
explainer = shap.TreeExplainer(xgb_model)  # 明確指定 TreeExplainer
shap_vals = explainer.shap_values(dX_sample)


In [None]:
explainer = shap.Explainer(xgb_model, X_sample)

# 一次性處理整個 sample
shap_vals = explainer.shap_values(X_sample)


In [None]:
# 1. SHAP 解釋器與 SHAP 值計算（支援進度條）
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer(X_sample, check_additivity=False)  # 這支援進度條


In [None]:
import os
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 3. 儲存 SHAP 排名至 CSV
shap_importance = np.abs(shap_vals).mean(axis=0)
shap_importance_df = pd.DataFrame({
    "feature": X_sample.columns,
    "mean_abs_shap": shap_importance
}).sort_values("mean_abs_shap", ascending=False)
shap_importance_path = os.path.join("model_output/selected_features_xgb/one_model/features_v1_with_company_ID/model_par_4/top100", "shap_feature_importance.csv")
shap_importance_df.to_csv(shap_importance_path, index=False)
print(f"✅ 已儲存 shap 排名至 {shap_importance_path}")

# 4. summary plot
plt.figure()
shap.summary_plot(shap_vals, X_sample)
# plt.savefig(os.path.join(model_dir, "shap_summary_plot.png"), bbox_inches='tight')
plt.close()

# 5. bar plot
plt.figure()
shap.summary_plot(shap_vals, X_sample, plot_type="bar")
# plt.savefig(os.path.join(model_dir, "shap_bar_plot.png"), bbox_inches='tight')
plt.close()
print("✅ SHAP summary 與 bar 圖已儲存")

# # 6. dependence plots for Top-20 features
# top_features = shap_importance_df["feature"].values[:20]
# for feat in top_features:
#     plt.figure()
#     shap.dependence_plot(feat, shap_vals, X_sample, show=False)
#     # plt.savefig(os.path.join(model_dir, f"shap_dependence_{feat}.png"), bbox_inches='tight')
#     plt.close()
print("✅ 已儲存前 20 個 SHAP dependence plots")

# Prediction

In [8]:
import os
import numpy as np
import polars as pl
import xgboost as xgb
import joblib

# # 參數model_output/selected_features_xgb/one_model/v1_base_features/with_companyID_engineer/v1_model/top120/xgb_params.json
model_path = "model_output/selected_features_xgb/one_model/v1_base_features/with_companyID_features/v1_model_no_gpu/top160/xgb_ranker_model.pkl"
parquet_path = "data/test_with_companyID_features.parquet"

# # 讀取模型
# xgb_model = xgb.Booster(model_file=model_path)
# 使用 joblib 載入 .pkl 模型
xgb_model = joblib.load(model_path)


# 確認 feature_names
model_features = xgb_model.feature_names
if model_features is None:
    raise ValueError("❌ 模型沒有 feature_names，請確認訓練時有指定 feature_names")
print(f"✅ 模型共 {len(model_features)} 個features")

# 讀取 test_filled
df = pl.read_parquet(parquet_path)
print(f"✅ 讀取 test_filled，共 {df.height} rows")

# 檢查缺失
missing_in_data = [f for f in model_features if f not in df.columns]
if missing_in_data:
    raise ValueError(f"❌ 下列特徵在 test_filled 不存在: {missing_in_data}")

# 篩選&排序
df_for_predict = df.select(model_features)
X_np = df_for_predict.to_numpy()

# 預測
dtest = xgb.DMatrix(X_np, feature_names=model_features)
preds = xgb_model.predict(dtest)
print(f"✅ 預測完成，共 {len(preds)} 筆")

# 回存結果
df_result = (
    df
    .with_columns([
        pl.Series("selected", preds)
    ])
)

# 查看前幾筆
print(df_result.head())


✅ 模型共 160 個features
✅ 讀取 test_filled，共 6897776 rows
✅ 預測完成，共 6897776 筆
shape: (5, 291)
┌──────────┬────────┬───────────┬────────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ Id       ┆ bySelf ┆ companyID ┆ nationalit ┆ … ┆ companyID_ ┆ companyID_ ┆ companyID ┆ selected  │
│ ---      ┆ ---    ┆ ---       ┆ y          ┆   ┆ mode_has_t ┆ mode_trans ┆ _total_oc ┆ ---       │
│ i64      ┆ i8     ┆ i64       ┆ ---        ┆   ┆ ransfer    ┆ fer_num    ┆ currences ┆ f32       │
│          ┆        ┆           ┆ i64        ┆   ┆ ---        ┆ ---        ┆ ---       ┆           │
│          ┆        ┆           ┆            ┆   ┆ i64        ┆ i64        ┆ i64       ┆           │
╞══════════╪════════╪═══════════╪════════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 18144679 ┆ 1      ┆ 62840     ┆ 36         ┆ … ┆ 0          ┆ 0          ┆ 23149     ┆ -0.952924 │
│ 18144680 ┆ 1      ┆ 62840     ┆ 36         ┆ … ┆ 0          ┆ 0          ┆ 23149     ┆ -0.124138 │
│ 18

In [9]:
from scripts.group_wise import export_submission_parquets


n_top =160
export_submission_parquets(
    test_filled_with_preds=df_result,   # 你的帶有 selected 分數的 DataFrame
    output_dir=f"model_output/selected_features_xgb/one_model/v1_base_features/with_companyID_features/v1_model_no_gpu/top{n_top}",
    ranked_filename = "rank_submission.parquet",
    raw_filename ="raw_submission.parquet",
)


✅ 已儲存原始 submission: model_output/selected_features_xgb/one_model/v1_base_features/with_companyID_features/v1_model_no_gpu/top160\raw_submission.parquet
shape: (6_897_776, 4)
┌──────────┬─────────────────────────────────┬───────────┬───────────────────┐
│ Id       ┆ ranker_id                       ┆ selected  ┆ __index_level_0__ │
│ ---      ┆ ---                             ┆ ---       ┆ ---               │
│ i64      ┆ str                             ┆ f64       ┆ i64               │
╞══════════╪═════════════════════════════════╪═══════════╪═══════════════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -0.952924 ┆ 18144679          │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -0.124138 ┆ 18144680          │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -2.804019 ┆ 18144681          │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -1.062132 ┆ 18144682          │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -0.317814 ┆ 18144683          │
│ …        ┆ …                      