In [1]:
import polars as pl
import pandas as pd

# 讀取 parquet
train_filled = pl.read_parquet("data/train_filled_v1_with_all.parquet")

# 讀取top50 features
n_top = 100
save_dir = "model_output/selected_features_xgb/one_model/features_v1_with_company_ID/k-fold/"
df = pd.read_csv("model_output/all_features_xgb/v1_add_company_ID/v1_model/model_importance/common_features_with_ranks.csv")
top50_features = df["feature"].head(n_top).tolist()
top50_features = [f for f in top50_features if f != "companyID"]

# 你要保留的欄位 (Top50 + target + group id)
cols_to_keep = top50_features + ["selected", "ranker_id"]

# 只保留這些欄位
train_filled = train_filled.select(cols_to_keep)


print(f"✅ DataFrame 現在只有 {len(train_filled.columns)} 欄位: {train_filled.columns}")


✅ DataFrame 現在只有 102 欄位: ['total_num_transfers_rank', 'legs0_segments0_flightNumber', 'price_per_duration_rank', 'price_percentile', 'price_from_median_zscore', 'price_per_duration', 'legs0_segments0_baggageAllowance_quantity', 'price_minus_fee_rank', 'legs0_arrivalAt_hour', 'pricingInfo_isAccessTP', 'legs0_departureAt_hour', 'both_legs_carrier_all_same', 'total_weighted_mean_cabin', 'price_per_fee_rank', 'totalPrice_rank', 'days_before_departure', 'legs0_segments0_seatsAvailable', 'price_per_fee', 'price_per_tax', 'legs1_main_carrier', 'isVip', 'leg0_view_diff_mean', 'legs1_departureAt_hour', 'baggage_total', 'duration_ratio', 'total_fees', 'all_view_diff_mean', 'legs1_arrivalAt_hour', 'log_price', 'legs1_segments0_flightNumber', 'legs1_weighted_mean_cabin', 'companyID_loo_mean_legs0_departureAt_hour', 'legs0_segments0_cabinClass', 'price_minus_fee', 'legs0_main_carrier', 'companyID_loo_mean_legs0_arrivalAt_hour', 'baggage_total_rank', 'legs0_max_duration_cabin', 'legs0_segments0_key_

In [2]:
# 要轉成 int 的欄位
cols_to_int = [
    "pricingInfo_isAccessTP",
    "legs0_segments0_baggageAllowance_quantity",
    "legs1_segments0_baggageAllowance_quantity",
    "miniRules1_statusInfos",
    "baggage_total",
    "legs0_segments0_seatsAvailable",
    "miniRules1_monetaryAmount",
    "total_fees",
    "price_minus_fee",
    "taxes",
    "totalPrice",
    "legs1_segments0_seatsAvailable"
]

# 先檢查哪些欄位存在
existing_cols = [c for c in cols_to_int if c in train_filled.columns]

print(f"✅ 共找到 {len(existing_cols)} 個存在的欄位要轉 int: {existing_cols}")

# 做轉型
train_filled = train_filled.with_columns([
    pl.col(c).fill_null(0).cast(pl.Int32).alias(c)
    for c in existing_cols
])

✅ 共找到 12 個存在的欄位要轉 int: ['pricingInfo_isAccessTP', 'legs0_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_quantity', 'miniRules1_statusInfos', 'baggage_total', 'legs0_segments0_seatsAvailable', 'miniRules1_monetaryAmount', 'total_fees', 'price_minus_fee', 'taxes', 'totalPrice', 'legs1_segments0_seatsAvailable']


In [2]:
exclude_cols = [
    'Id', 'ranker_id', 'selected',
    'profileId', 'requestDate',
    'legs0_departureAt', 'legs0_arrivalAt', 'legs1_departureAt', 'legs1_arrivalAt',
    'miniRules0_percentage', 'miniRules1_percentage',  # >90% missing
    'frequentFlyer',  # Already processed
    # Exclude constant columns
    'pricingInfo_passengerCount'
]

feature_cols = [col for col in train_filled.columns if col not in exclude_cols]

print(f"Using {len(feature_cols)} features")

X = train_filled.select(feature_cols)
y = train_filled.select('selected')
groups = train_filled.select('ranker_id')

num_positive = len(train_filled["ranker_id"].unique())
num_negative = len(train_filled) - num_positive
import os, json, joblib, gc
del train_filled
gc.collect()


Using 100 features


0

In [3]:
import os, json, joblib, gc
import numpy as np
import polars as pl
import xgboost as xgb
from scripts.hitrate import compute_hitrate_at_3
import polars as pl
from scripts.hitrate import compute_hitrate_at_3
# ===== 假設以下皆為 Polars DataFrame: X, y, groups =====


# 設定 fold 數與儲存目錄
n_folds = 5
model_base_dir = os.path.join(save_dir, f"top{n_top}")
os.makedirs(model_base_dir, exist_ok=True)

# 取得唯一 ranker_id，並打亂順序
unique_rankers = groups.select("ranker_id").unique().to_series().to_list()
np.random.seed(42)
np.random.shuffle(unique_rankers)

# 計算每 fold 的切分大小
fold_size = len(unique_rankers) // n_folds

# 儲存超參數
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@3',
    "learning_rate": 0.022641389657079056,
    "max_depth": 14,
    "min_child_weight": 2,
    "subsample": 0.8842234913702768,
    "colsample_bytree": 0.45840689146263086,
    "gamma": 3.3084297630544888,
    "lambda": 6.952586917313028,
    "alpha": 0.6395254133055179,
    'seed': 42,
    'n_jobs': -1,
}

# 儲存超參數設定一次即可
with open(os.path.join(model_base_dir, "xgb_params.json"), "w") as f:
    json.dump(params, f, indent=2)

# 開始 5-fold CV
for fold in range(4, n_folds):
    print(f"training fold {fold}...")
    val_start = fold * fold_size
    val_end = (fold + 1) * fold_size if fold < n_folds - 1 else len(unique_rankers)

    val_rankers = set(unique_rankers[val_start:val_end])
    train_rankers = set(unique_rankers) - val_rankers

    is_train = groups.select(pl.col("ranker_id").is_in(list(train_rankers)).alias("is_train"))

    # 合併 boolean mask
    X_with_mask = X.with_columns(is_train)
    y_with_mask = y.with_columns(is_train)
    groups_with_mask = groups.with_columns(is_train)

    # 分割
    X_train_df = X_with_mask.filter(pl.col("is_train"))
    X_val_df = X_with_mask.filter(~pl.col("is_train"))
    y_train_df = y_with_mask.filter(pl.col("is_train"))
    y_val_df = y_with_mask.filter(~pl.col("is_train"))
    groups_train_df = groups_with_mask.filter(pl.col("is_train"))
    groups_val_df = groups_with_mask.filter(~pl.col("is_train"))

    # 轉 numpy
    X_train_np = X_train_df.drop("is_train").to_numpy()
    X_val_np = X_val_df.drop("is_train").to_numpy()
    y_train_np = y_train_df.drop("is_train").to_numpy().flatten()
    y_val_np = y_val_df.drop("is_train").to_numpy().flatten()
    groups_train_np = groups_train_df.drop("is_train").to_numpy().flatten()
    groups_val_np = groups_val_df.drop("is_train").to_numpy().flatten()

    # group sizes
    group_sizes_train = (
        groups_train_df.drop("is_train")
        .group_by("ranker_id", maintain_order=True)
        .agg(pl.len())["len"]
        .to_numpy()
    )
    group_sizes_val = (
        groups_val_df.drop("is_train")
        .group_by("ranker_id", maintain_order=True)
        .agg(pl.len())["len"]
        .to_numpy()
    )

    # DMatrix
    dtrain = xgb.DMatrix(X_train_np, label=y_train_np, feature_names=feature_cols)
    dtrain.set_group(group_sizes_train)
    dval = xgb.DMatrix(X_val_np, label=y_val_np, feature_names=feature_cols)
    dval.set_group(group_sizes_val)

    del X_train_np, y_train_np, X_val_np
    gc.collect()

    evals = [(dtrain, "train"), (dval, "val")]

    # 訓練
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=20,
    )

    # 儲存
    fold_dir = os.path.join(model_base_dir, f"fold_{fold}")
    os.makedirs(fold_dir, exist_ok=True)
    joblib.dump(model, os.path.join(fold_dir, "xgb_ranker_model.pkl"))
    print(f"✅ Fold {fold}: 模型已儲存於 {fold_dir}")


    # 預測
    val_preds = model.predict(dval)


    # 計算 HitRate
    hitrate = compute_hitrate_at_3(groups_val_np, y_val_np, val_preds)
    hitrate_records = []
    hitrate_records.append({"split_label": "overall", "hitrate": hitrate})



    hitrate_df = pl.DataFrame(hitrate_records)
    csv_path = os.path.join(fold_dir, "hitrate_summary.csv")
    hitrate_df.write_csv(csv_path)
    print(f"\n✅ 已儲存所有 Hitrate 結果至 {csv_path}")


    # 取三種重要性
    importance_types = ["weight", "gain", "cover"]
    importance_all = {}

    for imp_type in importance_types:
        imp_raw = model.get_score(importance_type=imp_type)
        imp_named = {}
        for k, v in imp_raw.items():
            imp_named[k] = v
        # 排序
        sorted_imp = sorted(imp_named.items(), key=lambda x: x[1], reverse=True)
        importance_all[imp_type] = sorted_imp

    # 把三個榜單放成DataFrame方便比對
    df_weight = pd.DataFrame(importance_all["weight"], columns=["feature", "weight_rank"])
    df_weight["weight_rank_pos"] = df_weight.index

    df_gain = pd.DataFrame(importance_all["gain"], columns=["feature", "gain_rank"])
    df_gain["gain_rank_pos"] = df_gain.index

    df_cover = pd.DataFrame(importance_all["cover"], columns=["feature", "cover_rank"])
    df_cover["cover_rank_pos"] = df_cover.index

    # 合併
    df_merged = (
        df_weight
        .merge(df_gain, on="feature", how="outer")
        .merge(df_cover, on="feature", how="outer")
    )

    # 把不存在的rank補大數字
    df_merged["weight_rank_pos"] = df_merged["weight_rank_pos"].fillna(9999)
    df_merged["gain_rank_pos"] = df_merged["gain_rank_pos"].fillna(9999)
    df_merged["cover_rank_pos"] = df_merged["cover_rank_pos"].fillna(9999)

    # 計算「三個榜單中最早出現的位置」
    df_merged["min_rank"] = df_merged[["weight_rank_pos", "gain_rank_pos", "cover_rank_pos"]].min(axis=1)

    # 排序
    df_merged_sorted = df_merged.sort_values("min_rank")

    # 取前50
    top50 = df_merged_sorted.head(50)

    # 顯示
    print(top50[["feature", "weight_rank", "gain_rank", "cover_rank"]])
    # 如果想輸出CSV
    csv_path = os.path.join(fold_dir, "feature_importance.csv")

    df_merged_sorted.to_csv(csv_path, index=False)
    print(f"✅ 已輸出{csv_path}")

    # 顯式釋放 Booster 內部資源
    model.__del__()

    # 強制清除 Polars DataFrame 參考
    del X_with_mask, y_with_mask, groups_with_mask
    del X_train_df, X_val_df, y_train_df, y_val_df
    del groups_train_df, groups_val_df

    # Numpy 陣列
    del y_val_np
    del groups_train_np, groups_val_np
    del group_sizes_train, group_sizes_val, val_preds

    # DMatrix
    del dtrain, dval

    # importance 資料
    del df_weight, df_gain, df_cover, df_merged, df_merged_sorted, top50
    del importance_all

    # 最後強制回收
    gc.collect()

training fold 4...
[0]	train-ndcg@3:0.37106	val-ndcg@3:0.35724
[20]	train-ndcg@3:0.59592	val-ndcg@3:0.46908
[40]	train-ndcg@3:0.64696	val-ndcg@3:0.48708
[60]	train-ndcg@3:0.68020	val-ndcg@3:0.49868
[80]	train-ndcg@3:0.69810	val-ndcg@3:0.50506
[100]	train-ndcg@3:0.71391	val-ndcg@3:0.51032
[120]	train-ndcg@3:0.72988	val-ndcg@3:0.51483
[140]	train-ndcg@3:0.74412	val-ndcg@3:0.51987
[160]	train-ndcg@3:0.75597	val-ndcg@3:0.52252
[180]	train-ndcg@3:0.76845	val-ndcg@3:0.52647
[200]	train-ndcg@3:0.78074	val-ndcg@3:0.53009
[220]	train-ndcg@3:0.78967	val-ndcg@3:0.53311
[240]	train-ndcg@3:0.79948	val-ndcg@3:0.53566
[260]	train-ndcg@3:0.80883	val-ndcg@3:0.53863
[280]	train-ndcg@3:0.81757	val-ndcg@3:0.54202
[300]	train-ndcg@3:0.82583	val-ndcg@3:0.54438
[320]	train-ndcg@3:0.83368	val-ndcg@3:0.54682
[340]	train-ndcg@3:0.84144	val-ndcg@3:0.54849
[360]	train-ndcg@3:0.84697	val-ndcg@3:0.55056
[380]	train-ndcg@3:0.85385	val-ndcg@3:0.55193
[400]	train-ndcg@3:0.85913	val-ndcg@3:0.55407
[420]	train-ndcg@3:0.

# Prediction

In [4]:
import os
import numpy as np
import polars as pl
import xgboost as xgb
import joblib

# 設定路徑
model_base_dir = "model_output/selected_features_xgb/one_model/features_v1_with_company_ID/k-fold/top100"
parquet_path = "data/test_filled_v1_with_all.parquet"
n_folds = 5

# 載入 test 資料
df = pl.read_parquet(parquet_path)
print(f"✅ 讀取 test_filled，共 {df.height} rows")

# 讀 feature_names from 其中一個模型
model0 = joblib.load(os.path.join(model_base_dir, "fold_0", "xgb_ranker_model.pkl"))
feature_names = model0.feature_names

# 確認欄位存在
missing = [f for f in feature_names if f not in df.columns]
if missing:
    raise ValueError(f"❌ 下列特徵在 test data 不存在: {missing}")

# 取出 features
X_np = df.select(feature_names).to_numpy()

# 製作 DMatrix
dtest = xgb.DMatrix(X_np, feature_names=feature_names)

# 對每個 fold 預測
all_preds = []
for fold in range(n_folds):
    model_path = os.path.join(model_base_dir, f"fold_{fold}", "xgb_ranker_model.pkl")
    model = joblib.load(model_path)
    preds = model.predict(dtest)
    all_preds.append(preds)

# 平均所有預測
final_preds = np.mean(all_preds, axis=0)

# 加入預測結果
df_result = df.with_columns([
    pl.Series("selected", final_preds)
])

print(f"✅ 預測完成：已用{n_folds}個models預測完{len(final_preds)} 筆")


✅ 讀取 test_filled，共 6897776 rows
✅ 預測完成：已用5個models預測完6897776 筆


In [5]:
from scripts.group_wise import export_submission_parquets


n_top =100
export_submission_parquets(
    test_filled_with_preds=df_result,   # 你的帶有 selected 分數的 DataFrame
    output_dir=f"model_output/selected_features_xgb/one_model/features_v1_with_company_ID/k-fold/top{n_top}",
    ranked_filename = "rank_submission.parquet",
    raw_filename ="raw_submission.parquet",
)


✅ 已儲存原始 submission: model_output/selected_features_xgb/one_model/features_v1_with_company_ID/k-fold/top100/raw_submission.parquet
shape: (6_897_776, 4)
┌──────────┬─────────────────────────────────┬───────────┬───────────────────┐
│ Id       ┆ ranker_id                       ┆ selected  ┆ __index_level_0__ │
│ ---      ┆ ---                             ┆ ---       ┆ ---               │
│ i64      ┆ str                             ┆ f64       ┆ i64               │
╞══════════╪═════════════════════════════════╪═══════════╪═══════════════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -0.308762 ┆ 18144679          │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -0.064771 ┆ 18144680          │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -2.517344 ┆ 18144681          │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -0.956076 ┆ 18144682          │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ -0.512457 ┆ 18144683          │
│ …        ┆ …                               ┆ …         ┆