In [4]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import time
import xgboost as xgb
import os 
os.chdir("..")

In [None]:
# Load data
train_filled = pl.read_parquet('data/data_from_json/train/train_merged.parquet')
from scripts.feature_enigeer import clean_fill_and_cast_columns
train_filled = clean_fill_and_cast_columns(train_filled)

✅ 共找到 256 個字串欄位
✅ 共找到 75 個數值欄位
✅ 共找到 16 個布林欄位


In [None]:
# 先數一下目前欄位數
print("before:", len(train_filled.columns))

# 找出欄名包含 segments2 或 segments3 的欄位
drop_cols = [c for c in train_filled.columns
             if ("segments2" in c) or ("segments3" in c)]

# 丟掉
train_filled = train_filled.drop(drop_cols)

print("dropped:", len(drop_cols))
print("after:", len(train_filled.columns))


All ranker_id have a single currency? True
ranker_id with >1 currency: 0


ranker_id,n_currency,currency_codes,single_currency
str,u32,list[i64],bool


In [16]:
# Load data
test_filled = pl.read_parquet('data/test.parquet').drop('__index_level_0__').with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
from scripts.feature_enigeer import clean_fill_and_cast_columns


test_filled = clean_fill_and_cast_columns(test_filled, test= True)

✅ 共找到 57 個字串欄位
✅ 共找到 48 個數值欄位
✅ 共找到 4 個布林欄位
✅ test=True: 已將 10 個duration欄位轉str並填'missing'
✅ 已完成空字串處理、缺失補值、布林轉0/1


# Personal data

In [None]:
  "isGlobal":   "yearOfBirth": integer,       // Birth year
  "nationality"

# Currency change

In [6]:
import polars as pl
import json
from typing import Dict, List, Optional

# 1) 由 train_filled 建立 ranker_id -> to_RUB 的轉換字典
def build_currency_transform_dic(
    train_filled: pl.DataFrame,
    kzt_to_rub: float,
    save_path: Optional[str] = None,
) -> Dict[str, float]:
    """
    假設每個 ranker_id 只有一種貨幣：
      - RUB -> 1.0
      - KZT -> kzt_to_rub（你提供）
    會回傳 {ranker_id: to_RUB_rate}，並可選擇存成 JSON。
    """
    rates = pl.DataFrame(
        {"currencyCode": ["RUB", "KZT"], "to_RUB": [1.0, float(kzt_to_rub)]}
    )

    # 取每個 ranker_id 的單一幣別；若有例外會檢查出來
    cur = (
        train_filled
        .select("ranker_id", "currencyCode")
        .group_by("ranker_id")
        .agg([
            pl.col("currencyCode").n_unique().alias("n_currency"),
            pl.col("currencyCode").first().alias("currencyCode"),
        ])
    )

    # 風險提示：理應全為 1
    bad = cur.filter(pl.col("n_currency") != 1)
    if bad.height > 0:
        raise ValueError("Some ranker_id have multiple currencies:\n" + str(bad))

    # 連接匯率表 → 得到 to_RUB
    trans = cur.join(rates, on="currencyCode", how="left").select("ranker_id", "to_RUB")

    d: Dict[str, float] = dict(zip(trans["ranker_id"].to_list(), trans["to_RUB"].to_list()))

    if save_path:
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(d, f, ensure_ascii=False)
    return d


# 2) 把指定欄位轉成 RUB（新增 *_rub 欄位，或 in_place 直接覆蓋）
def convert_prices_to_rub(
    df: pl.DataFrame,
    currency_transform_dic: Dict[str, float],
    cols_to_convert: List[str],
    *,
    suffix: str = "_rub",
    in_place: bool = False,
) -> pl.DataFrame:
    """
    根據 ranker_id 映射 to_RUB 倍率，把 cols_to_convert 轉成 RUB。
    預設輸出新欄位 <col>+suffix；若 in_place=True 則覆蓋原欄位。
    """
    tf = pl.DataFrame(
        {"ranker_id": list(currency_transform_dic.keys()),
         "to_RUB": list(currency_transform_dic.values())}
    )

    out = df.join(tf, on="ranker_id", how="left")

    new_cols = []
    for c in cols_to_convert:
        if c not in out.columns:
            # 欄位不存在就跳過；你也可以改成 raise
            continue
        expr = (pl.col(c).cast(pl.Float64, strict=False) * pl.col("to_RUB"))
        new_cols.append(expr.alias(c if in_place else f"{c}{suffix}"))

    out = out.with_columns(new_cols).drop("to_RUB")
    return out if not in_place else out  # 已經就地覆蓋


# 3) 範例用法
# 假設你已有 train_filled 和 df（train/test 皆可）
# 設定 KZT→RUB 的倍率（自己決定或外部餵入；這裡示例用 0.20）
kzt_to_rub_rate = 0.20

# 建立/保存轉換字典
currency_dic = build_currency_transform_dic(train_filled, kzt_to_rub=kzt_to_rub_rate, save_path="data/extra_features/train/currency_transform.json")
# currency_dic = build_currency_transform_dic(train_filled, kzt_to_rub=kzt_to_rub_rate)


In [None]:

# 需要統一成 RUB 的欄位
cols = [
    "totalPrice",
    "taxes",
    "miniRules0_monetaryAmount",
    "miniRules1_monetaryAmount",
]

# 轉換：新增 *_rub 欄位（保留原值）
train_filled_rub = convert_prices_to_rub(train_filled, currency_dic, cols, suffix="_rub", in_place=False)

# 若你想直接覆蓋原欄位，把 in_place=True：
# train_filled_rub = convert_prices_to_rub(train_filled, currency_dic, cols, in_place=True)


# price rank

In [None]:
from scripts.feature_enigeer import build_price_features
# 執行價格特徵工程，並存檔
price_features = build_price_features(
    train_filled,
    output_dir="data/extra_features/train/"
)

# 查看結果
print(price_features)


✅ 已完成價格特徵工程
✅ 已儲存 Parquet: data/data_from_json/extra_features/train/1_price_features.parquet
shape: (18_151_423, 8)
┌──────────┬────────────┬──────────┬───────────┬────────────┬────────────┬────────────┬────────────┐
│ Id       ┆ price_per_ ┆ tax_rate ┆ log_price ┆ totalPrice ┆ is_cheapes ┆ price_from ┆ price_perc │
│ ---      ┆ tax        ┆ ---      ┆ ---       ┆ _rank      ┆ t          ┆ _median_zs ┆ entile     │
│ i64      ┆ ---        ┆ f64      ┆ f64       ┆ ---        ┆ ---        ┆ core       ┆ ---        │
│          ┆ f64        ┆          ┆           ┆ u32        ┆ i8         ┆ ---        ┆ f64        │
│          ┆            ┆          ┆           ┆            ┆            ┆ f64        ┆            │
╞══════════╪════════════╪══════════╪═══════════╪════════════╪════════════╪════════════╪════════════╡
│ 9692136  ┆ 51.329476  ┆ 0.019468 ┆ 11.230616 ┆ 150        ┆ 0          ┆ 1.847517   ┆ 0.859242   │
│ 9692142  ┆ 24.825051  ┆ 0.040253 ┆ 10.504218 ┆ 118        ┆ 0          ┆ 0

In [3]:
from scripts.feature_enigeer import build_price_features
# 執行價格特徵工程，並存檔
price_features = build_price_features(
    test_filled,
    output_dir="data/extra_features/test/"
)

# 查看結果
print(price_features)


✅ 已完成價格特徵工程
✅ 已儲存 Parquet: data/extra_features/test/1_price_features.parquet
shape: (6_897_776, 8)
┌──────────┬────────────┬──────────┬───────────┬────────────┬────────────┬────────────┬────────────┐
│ Id       ┆ price_per_ ┆ tax_rate ┆ log_price ┆ totalPrice ┆ is_cheapes ┆ price_from ┆ price_perc │
│ ---      ┆ tax        ┆ ---      ┆ ---       ┆ _rank      ┆ t          ┆ _median_zs ┆ entile     │
│ i64      ┆ ---        ┆ f64      ┆ f64       ┆ ---        ┆ ---        ┆ core       ┆ ---        │
│          ┆ f64        ┆          ┆           ┆ u32        ┆ i8         ┆ ---        ┆ f64        │
│          ┆            ┆          ┆           ┆            ┆            ┆ f64        ┆            │
╞══════════╪════════════╪══════════╪═══════════╪════════════╪════════════╪════════════╪════════════╡
│ 18144679 ┆ 9.634936   ┆ 0.103677 ┆ 9.192075  ┆ 1          ┆ 1          ┆ -0.444113  ┆ 0.003641   │
│ 18144680 ┆ 13.756624  ┆ 0.072616 ┆ 9.548169  ┆ 11         ┆ 0          ┆ -0.224798  ┆ 0.212

# 處理duration

In [3]:
import polars as pl
from scripts.feature_enigeer import build_duration_features
# 所需欄位
# duration_cols = [
#     "legs0_duration",
#     "legs1_duration",
#     "legs0_segments0_duration",
#     "legs0_segments1_duration",
#     "legs0_segments2_duration",
#     "legs0_segments3_duration",
#     "legs1_segments0_duration",
#     "legs1_segments1_duration",
#     "legs1_segments2_duration",
#     "legs1_segments3_duration"
# ]

# needed_cols = ["Id", "ranker_id", "totalPrice"] + duration_cols

# # 篩選
# train_filled = train_filled.select([c for c in needed_cols if c in train_filled.columns])

# 再交給 build_duration_features
duration_features = build_duration_features(
    train_filled,
    output_dir="data/extra_features/train/"
)


✅ 已完成 Duration 特徵工程 (含排名、價格比、fastest 標記)
✅ 已輸出 Parquet: data/extra_features/train/2_duration_features.parquet


In [6]:
import polars as pl
from scripts.feature_enigeer import build_duration_features

# 再交給 build_duration_features
duration_features = build_duration_features(
    test_filled,
    output_dir="data/extra_features/test/"
)


✅ 已完成 Duration 特徵工程 (含排名、價格比、fastest 標記)
✅ 已輸出 Parquet: data/extra_features/test/2_duration_features.parquet


# frequent_flyer_features code

In [14]:
import polars as pl
from scripts.feature_enigeer import build_frequent_flyer_match_features
# 所需欄位
needed_cols = [
        "Id", "ranker_id", "frequentFlyer", "isVip",
        "legs0_duration", "legs1_duration",
        *[f"{s}_{t}" for s in [
            "legs0_segments0", "legs0_segments1", "legs0_segments2", "legs0_segments3",
            "legs1_segments0", "legs1_segments1", "legs1_segments2", "legs1_segments3"
        ] for t in ["marketingCarrier_code", "operatingCarrier_code", "duration"]]
    ]


# 篩選
train_filled = train_filled.select([c for c in needed_cols if c in train_filled.columns])

# 再交給 build_duration_features
ff_features = build_frequent_flyer_match_features(
    train_filled,
    output_dir="data/extra_features/train/"
)


✅ 已儲存 Parquet: data/extra_features/train/3_frequent_flyer_features.parquet
✅ 已完成 frequentFlyer 特徵 + match 特徵 + duration 特徵生成


In [15]:
import polars as pl
from scripts.feature_enigeer import build_frequent_flyer_match_features
# # 所需欄位
needed_cols = [
        "Id", "ranker_id", "frequentFlyer",
        "legs0_duration", "legs1_duration","isVip",
        *[f"{s}_{t}" for s in [
            "legs0_segments0", "legs0_segments1", "legs0_segments2", "legs0_segments3",
            "legs1_segments0", "legs1_segments1", "legs1_segments2", "legs1_segments3"
        ] for t in ["marketingCarrier_code", "operatingCarrier_code", "duration"]]
    ]


# 篩選
test_filled = test_filled.select([c for c in needed_cols if c in test_filled.columns])

# 再交給 build_duration_features
ff_features = build_frequent_flyer_match_features(
    test_filled,
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/3_frequent_flyer_features.parquet
✅ 已完成 frequentFlyer 特徵 + match 特徵 + duration 特徵生成


# Bagges

In [8]:
from scripts.feature_enigeer import build_baggage_fee_features

baggage_fee_df = build_baggage_fee_features(
    df=train_filled.select([
        "Id",
        "totalPrice",
        'ranker_id',
        "legs0_segments0_baggageAllowance_quantity",
        "legs1_segments0_baggageAllowance_quantity",
        "miniRules0_monetaryAmount",
        "miniRules1_monetaryAmount",
        "miniRules0_statusInfos",
        "miniRules1_statusInfos"
    ]),
    output_dir="data/extra_features/train/"
)


✅ 已儲存 Parquet: data/extra_features/train/4_baggage_fee_features.parquet


In [6]:
from scripts.feature_enigeer import build_baggage_fee_features

baggage_fee_df = build_baggage_fee_features(
    df=test_filled.select([
        "Id",
        "totalPrice",
        'ranker_id',
        "legs0_segments0_baggageAllowance_quantity",
        "legs1_segments0_baggageAllowance_quantity",
        "miniRules0_monetaryAmount",
        "miniRules1_monetaryAmount",
        "miniRules0_statusInfos",
        "miniRules1_statusInfos"
    ]),
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/4_baggage_fee_features.parquet


# cabinClass

In [None]:
from scripts.feature_enigeer import build_cabin_features
train_filled = train_filled.select([
        "Id",
        *[f"legs{i}_segments{j}_cabinClass" for i in [0,1] for j in range(4)],
        *[f"legs{i}_segments{j}_duration" for i in [0,1] for j in range(4)]
    ])

cabin_features_df = build_cabin_features(
    df=train_filled,
    output_dir="data/extra_features/train/"
)



A later expression might fail because the output type is not known. Set return_dtype=pl.self_dtype() if the type is unchanged, or set the proper output data type.
  df = df.with_columns([


✅ 已儲存 Parquet: data/extra_features/train/5_cabin_features.parquet


In [19]:
from scripts.feature_enigeer import build_cabin_features


cabin_features_df = build_cabin_features(
    df=test_filled.select([
        "Id",
        *[f"legs{i}_segments{j}_cabinClass" for i in [0,1] for j in range(4)],
        *[f"legs{i}_segments{j}_duration" for i in [0,1] for j in range(4)]
    ]),
    output_dir="data/extra_features/test/"
)



A later expression might fail because the output type is not known. Set return_dtype=pl.self_dtype() if the type is unchanged, or set the proper output data type.
  df = df.with_columns([


✅ 已儲存 Parquet: data/extra_features/test/5_cabin_features.parquet


# add time

In [3]:
from scripts.feature_enigeer import build_time_features

df_time_features = build_time_features(
    df=train_filled.select(["Id", "requestDate", "legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]),
    output_dir="data/extra_features/train/"
)


✅ 已儲存 Parquet: data/extra_features/train/6_time_features.parquet
✅ 所有時間特徵已生成完成


In [20]:
from scripts.feature_enigeer import build_time_features
# test_filled = test_filled.select(["Id", "requestDate", "legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"])

df_time_features = build_time_features(
    df=test_filled.select(["Id", "requestDate", "legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]),
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/6_time_features.parquet
✅ 所有時間特徵已生成完成


# build_corporate_access_route_features

In [4]:
from scripts.feature_enigeer import build_corporate_access_route_features
df_features = build_corporate_access_route_features(
    df=train_filled.select(["Id", "corporateTariffCode", "pricingInfo_isAccessTP", "searchRoute"]),
    output_dir="data/extra_features/train/"
)


✅ 已儲存 Parquet: data/extra_features/train/7_corporate_access_route_features.parquet
✅ 已完成 corporate/access/route 特徵生成


In [9]:
from scripts.feature_enigeer import build_corporate_access_route_features
df_features = build_corporate_access_route_features(
    df=test_filled.select(["Id", "corporateTariffCode", "pricingInfo_isAccessTP", "searchRoute"]),
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/7_corporate_access_route_features.parquet
✅ 已完成 corporate/access/route 特徵生成


# 加上是否直飛跟轉機幾次

In [5]:
from scripts.feature_enigeer import build_transfer_features
transfer_features_df = build_transfer_features(
    df=train_filled.select([
        "Id", "ranker_id", "legs0_duration", "legs1_duration",
        "legs0_segments1_departureFrom_airport_iata",
        "legs0_segments2_departureFrom_airport_iata",
        "legs0_segments3_departureFrom_airport_iata",
        "legs1_segments1_departureFrom_airport_iata",
        "legs1_segments2_departureFrom_airport_iata",
        "legs1_segments3_departureFrom_airport_iata"
    ]),
    output_dir="data/extra_features/train/"
)


✅ 已儲存 Parquet: data/extra_features/train/8_transfer_features.parquet
✅ 已完成轉機特徵生成


In [10]:
from scripts.feature_enigeer import build_transfer_features
transfer_features_df = build_transfer_features(
    df=test_filled.select([
        "Id", "ranker_id", "legs0_duration", "legs1_duration",
        "legs0_segments1_departureFrom_airport_iata",
        "legs0_segments2_departureFrom_airport_iata",
        "legs0_segments3_departureFrom_airport_iata",
        "legs1_segments1_departureFrom_airport_iata",
        "legs1_segments2_departureFrom_airport_iata",
        "legs1_segments3_departureFrom_airport_iata"
    ]),
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/8_transfer_features.parquet
✅ 已完成轉機特徵生成


# Carrier

In [3]:
from scripts.feature_enigeer import build_carrier_consistency_features
required_columns = [
    # 主鍵
    "Id",
    "ranker_id",
    # legs0 轉機判斷
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    # legs1 轉機判斷
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    # legs0 Marketing
    "legs0_segments0_marketingCarrier_code",
    "legs0_segments1_marketingCarrier_code",
    "legs0_segments2_marketingCarrier_code",
    "legs0_segments3_marketingCarrier_code",
    # legs1 Marketing
    "legs1_segments0_marketingCarrier_code",
    "legs1_segments1_marketingCarrier_code",
    "legs1_segments2_marketingCarrier_code",
    "legs1_segments3_marketingCarrier_code"
]

train_filled = train_filled.select(required_columns)
import pickle
with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
    config = pickle.load(f)

carrier_df = build_carrier_consistency_features(
    df=train_filled,
    output_dir="data/extra_features/train/",
    transform_config=config
)


✅ 正在共用carrier encoding處理 ['legs0_main_carrier', 'legs1_main_carrier']
✅ 已儲存 Parquet: data/extra_features/train/9_carrier_consistency_features.parquet
✅ 已完成主Carrier一致性與轉機次數特徵


In [21]:
from scripts.feature_enigeer import build_carrier_consistency_features
required_columns = [
    # 主鍵
    "Id",
    "ranker_id",
    # legs0 轉機判斷
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    # legs1 轉機判斷
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    # legs0 Marketing
    "legs0_segments0_marketingCarrier_code",
    "legs0_segments1_marketingCarrier_code",
    "legs0_segments2_marketingCarrier_code",
    "legs0_segments3_marketingCarrier_code",
    # legs1 Marketing
    "legs1_segments0_marketingCarrier_code",
    "legs1_segments1_marketingCarrier_code",
    "legs1_segments2_marketingCarrier_code",
    "legs1_segments3_marketingCarrier_code"
]

test_filled = test_filled.select(required_columns)
import pickle
with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
    config = pickle.load(f)

carrier_df = build_carrier_consistency_features(
    df=test_filled,
    output_dir="data/extra_features/test/",
    transform_config=config
)


✅ 正在共用carrier encoding處理 ['legs0_main_carrier', 'legs1_main_carrier']
✅ 已儲存 Parquet: data/extra_features/test/9_carrier_consistency_features.parquet
✅ 已完成主Carrier一致性與轉機次數特徵


# encoding category features.

In [6]:
from scripts.feature_enigeer import build_label_encoding_features
# ✅ 先把所有 columns 列出
all_cols = train_filled.columns

# ✅ Collect relevant columns
selected_cols = []
# ✅ Collect relevant columns + Id
selected_cols = ["Id"]  # 一定要先放Id

# Aircraft code
selected_cols += [c for c in all_cols if c.endswith("_aircraft_code")]

# Flight Number
selected_cols += [c for c in all_cols if c.endswith("_flightNumber")]

# Airport / City
selected_cols += [c for c in all_cols if "_arrivalTo_airport_" in c or "_departureFrom_airport_" in c]

# Carrier
selected_cols += [c for c in all_cols if c.endswith("_marketingCarrier_code") or c.endswith("_operatingCarrier_code")]

# searchRoute
selected_cols.append("searchRoute")

# frequentFlyer
selected_cols.append("frequentFlyer")
selected_cols.append("selected")


# 去除重複
selected_cols = list(dict.fromkeys(selected_cols))

print("✅ build_label_encoding_features 需要的欄位：", selected_cols)

train_filled = train_filled.select(selected_cols)
# import pickle
# with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
#     config = pickle.load(f)

# 執行編碼
df_encoded, config = build_label_encoding_features(
    df=train_filled,
    output_dir="data/extra_features/train/",
    # transform_config=config
)


✅ build_label_encoding_features 需要的欄位： ['Id', 'legs0_segments0_aircraft_code', 'legs0_segments1_aircraft_code', 'legs0_segments2_aircraft_code', 'legs0_segments3_aircraft_code', 'legs1_segments0_aircraft_code', 'legs1_segments1_aircraft_code', 'legs1_segments2_aircraft_code', 'legs1_segments3_aircraft_code', 'legs0_segments0_flightNumber', 'legs0_segments1_flightNumber', 'legs0_segments2_flightNumber', 'legs0_segments3_flightNumber', 'legs1_segments0_flightNumber', 'legs1_segments1_flightNumber', 'legs1_segments2_flightNumber', 'legs1_segments3_flightNumber', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments1_arrivalTo_airport_city_iata', 'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata', 'legs0_segments2_arrivalTo_airport_city_iata', 'legs0_segments2_arrivalTo_airport_iata', 'legs0_segments2_departureFrom_airport_iata', 'legs0_segments3_arrivalTo_ai

In [12]:
from scripts.feature_enigeer import build_label_encoding_features
import pickle
# all_cols = train_filled.columns

# # ✅ Collect relevant columns
# selected_cols = []
# # ✅ Collect relevant columns + Id
# selected_cols = ["Id"]  # 一定要先放Id

# # Aircraft code
# selected_cols += [c for c in all_cols if c.endswith("_aircraft_code")]

# # Flight Number
# selected_cols += [c for c in all_cols if c.endswith("_flightNumber")]

# # Airport / City
# selected_cols += [c for c in all_cols if "_arrivalTo_airport_" in c or "_departureFrom_airport_" in c]

# # Carrier
# selected_cols += [c for c in all_cols if c.endswith("_marketingCarrier_code") or c.endswith("_operatingCarrier_code")]

# # searchRoute
# selected_cols.append("searchRoute")

# # frequentFlyer
# selected_cols.append("frequentFlyer")
# selected_cols.append("selected")

# # 去除重複
# selected_cols = list(dict.fromkeys(selected_cols))

# print("✅ build_label_encoding_features 需要的欄位：", selected_cols)

# test_filled = test_filled.select(selected_cols)
with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
    config = pickle.load(f)

# 執行編碼
df_encoded, config = build_label_encoding_features(
    df=test_filled,
    output_dir="data/extra_features/test/",
    transform_config = config
)


# add view time

In [7]:
from scripts.feature_enigeer  import enrich_flight_view_features
# # 所需欄位 (補齊所有必要欄位)
needed_cols = [
    "Id",
    "ranker_id",
    # segment columns for legs0
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments0_arrivalTo_airport_iata",
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments1_arrivalTo_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments2_arrivalTo_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    "legs0_segments3_arrivalTo_airport_iata",
    # segment columns for legs1
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments0_arrivalTo_airport_iata",
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments1_arrivalTo_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments2_arrivalTo_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    "legs1_segments3_arrivalTo_airport_iata",
    "searchRoute"
]



# 篩選
train_filled = train_filled.select([c for c in needed_cols if c in train_filled.columns])

# 呼叫 enrich_flight_view_features
flight_view_features, _ = enrich_flight_view_features(
    train_filled,
    output_dir="data/extra_features/train/",

)

✅ 已儲存flight view特徵: data/extra_features/train/11_flight_view_features.parquet
✅ 已儲存 transform_config: data/extra_features/train/transform_flight_view_key_config.pkl


In [None]:
import polars as pl

def add_segment_searchRoute_company_pct_features(df: pl.DataFrame) -> pl.DataFrame:
    # 強制保證 searchRoute 為字串
    df = df.with_columns([
        pl.col("searchRoute")
            .map_elements(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else (x if x is not None else "missing"))
            .cast(pl.Utf8)
            .fill_null("missing")
            .alias("searchRoute")
    ])

    # 直接用 extract 拆解
    df = df.with_columns([
        pl.col("searchRoute").str.extract(r"^([^/]*)", 1).alias("log0_searchRoute"),
        pl.col("searchRoute").str.extract(r"/(.*)$", 1).alias("log1_searchRoute"),
    ])

    seg_cols = [f"legs0_segments{i}_key" for i in range(4)] + [f"legs1_segments{i}_key" for i in range(4)]

    melt_df = df.melt(
        id_vars=["log0_searchRoute", "log1_searchRoute", "Id", "companyID"],
        value_vars=seg_cols,
        variable_name="seg_type",
        value_name="segment_key"
    )
    melt_df = melt_df.with_columns([
        pl.col("segment_key").fill_null("missing"),
        pl.coalesce([pl.col("log0_searchRoute"), pl.col("log1_searchRoute")]).alias("unified_searchRoute")
    ]).filter(pl.col("segment_key") != "missing")

    stats = melt_df.group_by(["unified_searchRoute", "segment_key"]).agg(pl.count().alias("count"))
    route_total = stats.group_by("unified_searchRoute").agg(pl.sum("count").alias("route_total"))
    stats = stats.join(route_total, on="unified_searchRoute", how="left")
    stats = stats.with_columns([
        (pl.col("count") / pl.col("route_total")).alias("segment_searchRoute_pct")
    ])
    route_pct_mapping = dict(
        zip(
            zip(stats["unified_searchRoute"], stats["segment_key"]),
            stats["segment_searchRoute_pct"]
        )
    )

    # ---- (B) searchRoute + companyID ----
    stats2 = melt_df.group_by(["unified_searchRoute", "companyID", "segment_key"]).agg(pl.count().alias("count"))
    route2_total = stats2.group_by(["unified_searchRoute", "companyID"]).agg(pl.sum("count").alias("route_total"))
    stats2 = stats2.join(route2_total, on=["unified_searchRoute", "companyID"], how="left")
    stats2 = stats2.with_columns([
        (pl.col("count") / pl.col("route_total")).alias("segment_searchRoute_company_pct")
    ])
    route_company_pct_mapping = dict(
        zip(
            zip(stats2["unified_searchRoute"], stats2["companyID"], stats2["segment_key"]),
            stats2["segment_searchRoute_company_pct"]
        )
    )

    for seg_col in seg_cols:
        # searchRoute 分組熱門度
        df = df.with_columns([
            pl.struct([
                pl.coalesce([pl.col("log0_searchRoute"), pl.col("log1_searchRoute")]).alias("unified_searchRoute"),
                pl.col(seg_col).alias("segment_key"),
            ]).map_elements(
                lambda x: route_pct_mapping.get((x["unified_searchRoute"], x["segment_key"]), 0.0),
                return_dtype=pl.Float64
            ).alias(f"{seg_col}_searchRoute_pct")
        ])
        # searchRoute+companyID 分組熱門度
        df = df.with_columns([
            pl.struct([
                pl.coalesce([pl.col("log0_searchRoute"), pl.col("log1_searchRoute")]).alias("unified_searchRoute"),
                pl.col("companyID").fill_null("missing"),
                pl.col(seg_col).alias("segment_key"),
            ]).map_elements(
                lambda x: route_company_pct_mapping.get(
                    (x["unified_searchRoute"], x["companyID"], x["segment_key"]), 0.0),
                return_dtype=pl.Float64
            ).alias(f"{seg_col}_searchRoute_company_pct")
        ])
    return df


In [7]:
df = add_segment_searchRoute_company_pct_features(flight_view_features)
df


A later expression might fail because the output type is not known. Set return_dtype=pl.self_dtype() if the type is unchanged, or set the proper output data type.
  df = df.with_columns([
  melt_df = df.melt(
(Deprecated in version 0.20.5)
  stats = melt_df.group_by(["unified_searchRoute", "segment_key"]).agg(pl.count().alias("count"))
(Deprecated in version 0.20.5)
  stats2 = melt_df.group_by(["unified_searchRoute", "companyID", "segment_key"]).agg(pl.count().alias("count"))


Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,group_size,group_size_log,leg0_view_mean,leg1_view_mean,all_view_mean,leg0_view_diff_mean,leg1_view_diff_mean,all_view_diff_mean,leg0_flight_view_count_rank,leg1_flight_view_count_rank,all_flight_view_count_rank,legs0_segments0_key_view_count_rank,legs0_segments1_key_view_count_rank,legs0_segments2_key_view_count_rank,legs0_segments3_key_view_count_rank,legs1_segments0_key_view_count_rank,legs1_segments1_key_view_count_rank,legs1_segments2_key_view_count_rank,legs1_segments3_key_view_count_rank,log0_searchRoute,log1_searchRoute,legs0_segments0_key_searchRoute_pct,legs0_segments0_key_searchRoute_company_pct,legs0_segments1_key_searchRoute_pct,legs0_segments1_key_searchRoute_company_pct,legs0_segments2_key_searchRoute_pct,legs0_segments2_key_searchRoute_company_pct,legs0_segments3_key_searchRoute_pct,legs0_segments3_key_searchRoute_company_pct,legs1_segments0_key_searchRoute_pct,legs1_segments0_key_searchRoute_company_pct,legs1_segments1_key_searchRoute_pct,legs1_segments1_key_searchRoute_company_pct,legs1_segments2_key_searchRoute_pct,legs1_segments2_key_searchRoute_company_pct,legs1_segments3_key_searchRoute_pct,legs1_segments3_key_searchRoute_company_pct
i64,i8,i64,i64,str,i64,i8,i8,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,f64,str,str,str,f64,f64,f64,str,str,str,str,str,f64,str,str,…,u32,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,1,57323,0,"""S7/SU/UT""",36,0,0,"""2024-06-15T16:20:00""","""2024-06-15T15:40:00""","""02:40:00""","""YK2""","""KJA""","""KJA""",1.0,0.0,1.0,"""TLK""","""02:40:00""","""216""","""KV""","""KV""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,25,3.258097,90.04,23.32,23.08,-71.04,-16.32,-22.08,1,1,1,1,1,1,1,1,1,1,1,"""TLKKJA""","""KJATLK""",0.019,0.0,0.72,0.0,0.72,0.0,0.72,0.0,0.001,0.0,0.72,0.0,0.72,0.0,0.72,0.0
1,1,57323,123,"""S7/SU/UT""",36,1,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,25,3.258097,90.04,23.32,23.08,2.96,0.68,0.92,2,2,2,2,2,1,1,2,2,1,1,"""TLKKJA""","""KJATLK""",0.093,0.0,0.093,0.0,0.72,0.0,0.72,0.0,0.024,0.0,0.024,0.0,0.72,0.0,0.72,0.0
2,1,57323,0,"""S7/SU/UT""",36,0,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,25,3.258097,90.04,23.32,23.08,2.96,0.68,0.92,2,2,2,2,2,1,1,2,2,1,1,"""TLKKJA""","""KJATLK""",0.093,0.0,0.093,0.0,0.72,0.0,0.72,0.0,0.024,0.0,0.024,0.0,0.72,0.0,0.72,0.0
3,1,57323,123,"""S7/SU/UT""",36,1,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,25,3.258097,90.04,23.32,23.08,2.96,0.68,0.92,2,2,2,2,2,1,1,2,2,1,1,"""TLKKJA""","""KJATLK""",0.093,0.0,0.093,0.0,0.72,0.0,0.72,0.0,0.024,0.0,0.024,0.0,0.72,0.0,0.72,0.0
4,1,57323,0,"""S7/SU/UT""",36,0,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,25,3.258097,90.04,23.32,23.08,2.96,0.68,0.92,2,2,2,2,2,1,1,2,2,1,1,"""TLKKJA""","""KJATLK""",0.093,0.0,0.093,0.0,0.72,0.0,0.72,0.0,0.024,0.0,0.024,0.0,0.72,0.0,0.72,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
18146427,1,54154,44,"""missing""",36,1,0,"""2024-11-06T09:45:00""","""2024-11-05T20:50:00""","""10:55:00""","""738""","""SGC""","""SGC""",0.0,0.0,1.0,"""VKO""","""03:20:00""","""247""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""SGC""","""02:30:00""","""111""","""UT""","""UT""",5.0,"""missing""","""missing""",…,590,6.381816,217522.654237,186624.245763,176296.166102,-215980.654237,-186331.245763,-176213.166102,2,1,3,4,2,1,1,2,2,1,1,"""MOWSVX""","""SVXMOW""",0.000347,0.0,0.000347,0.0,0.745628,0.0,0.745628,0.0,0.000066,0.0,0.000064,0.0,0.745628,0.0,0.745628,0.0
18146428,1,54154,44,"""missing""",36,1,0,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""","""17:40:00""","""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""","""02:05:00""","""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""","""01:30:00""","""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,590,6.381816,217522.654237,186624.245763,176296.166102,-215868.654237,-185923.245763,-176064.166102,3,2,6,2,3,1,1,1,3,1,1,"""MOWSVX""","""SVXMOW""",0.000372,0.0,0.000374,0.0,0.745628,0.0,0.745628,0.0,0.000156,0.0,0.000156,0.0,0.745628,0.0,0.745628,0.0
18146429,1,54154,44,"""missing""",36,1,0,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""","""17:40:00""","""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""","""02:05:00""","""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""","""01:30:00""","""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,590,6.381816,217522.654237,186624.245763,176296.166102,-215868.654237,-186331.245763,-176206.166102,3,1,4,2,3,1,1,2,2,1,1,"""MOWSVX""","""SVXMOW""",0.000372,0.0,0.000374,0.0,0.745628,0.0,0.745628,0.0,0.000066,0.0,0.000064,0.0,0.745628,0.0,0.745628,0.0
18146430,1,54154,44,"""missing""",36,1,0,"""2024-11-06T17:10:00""","""2024-11-05T19:30:00""","""19:40:00""","""738""","""KUF""","""KUF""",0.0,0.0,1.0,"""VKO""","""01:45:00""","""357""","""UT""","""UT""",4.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""KUF""","""02:05:00""","""282""","""UT""","""UT""",2.0,"""missing""","""missing""",…,590,6.381816,217522.654237,186624.245763,176296.166102,-217120.654237,-185923.245763,-176229.166102,1,2,2,5,4,1,1,1,3,1,1,"""MOWSVX""","""SVXMOW""",0.00009,0.0,0.000419,0.0,0.745628,0.0,0.745628,0.0,0.000156,0.0,0.000156,0.0,0.745628,0.0,0.745628,0.0


In [9]:
from scripts.feature_enigeer  import enrich_flight_view_features
import pickle
# 所需欄位 (補齊所有必要欄位)
needed_cols = [
    "Id",
    "ranker_id",
    # segment columns for legs0
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments0_arrivalTo_airport_iata",
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments1_arrivalTo_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments2_arrivalTo_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    "legs0_segments3_arrivalTo_airport_iata",
    # segment columns for legs1
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments0_arrivalTo_airport_iata",
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments1_arrivalTo_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments2_arrivalTo_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    "legs1_segments3_arrivalTo_airport_iata",
    "searchRoute"
]



# 篩選
test_filled = test_filled.select([c for c in needed_cols if c in test_filled.columns])

with open("data/extra_features/transform_flight_view_key_config.pkl", "rb") as f:
    config = pickle.load(f)
    
# 呼叫 enrich_flight_view_features
test_filled, _ = enrich_flight_view_features(
    test_filled,
    output_dir="data/extra_features/test/",
    transform_config=config
)

✅ 已儲存flight view特徵: data/extra_features/test/11_flight_view_features.parquet


# Add company ID feature

In [3]:
# build_company_loo_features_loo_rid.py
import polars as pl
import os
import pickle
from typing import Optional

def build_company_loo_resid_features_train(df: pl.DataFrame, output_dir: Optional[str] = None) -> pl.DataFrame:
    company_col = "companyID"
    route_col = "searchRoute"
    comb_col = "companyID_searchRoute"
    ranker_col = "ranker_id"
    target_col = "selected"

    df = df.with_columns([
        pl.col(target_col).cast(pl.Int8),
        pl.col(company_col).cast(pl.Utf8),
        pl.col(route_col).cast(pl.Utf8),
        (pl.col(company_col).cast(pl.Utf8) + "_" + pl.col(route_col).cast(pl.Utf8)).alias(comb_col),
    ])

    for dur_col in ["legs0_duration", "legs1_duration"]:
        if dur_col in df.columns and df[dur_col].dtype == pl.Utf8:
            df = df.with_columns(
                pl.when(pl.col(dur_col).is_in([None, "missing"]))
                .then(0)
                .otherwise(
                    pl.col(dur_col).str.extract(r"^(\d+):", 1).cast(pl.Int64) * 60 +
                    pl.col(dur_col).str.extract(r":(\d+):", 1).cast(pl.Int64)
                )
                .alias(dur_col)
            )

    time_cols = ["legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]
    time_exprs = []
    for col in time_cols:
        cleaned_col = (
            pl.when(pl.col(col).is_in(["missing", None, ""]))
            .then(None)
            .otherwise(pl.col(col))
        )
        dt = cleaned_col.str.to_datetime(strict=False)
        h = dt.dt.hour()
        time_exprs.append(h.fill_null(-1).alias(f"{col}_hour"))
    df = df.with_columns(time_exprs)

    df = df.with_columns([
        pl.col("legs0_segments0_cabinClass").cast(pl.Float32).alias("cabin_class")
        if "legs0_segments0_cabinClass" in df.columns else pl.lit(None).alias("cabin_class")
    ])

    df = df.with_columns([
        pl.sum_horizontal([
            ((pl.col(f"legs0_segments{i}_departureFrom_airport_iata").is_not_null()) &
             (pl.col(f"legs0_segments{i}_departureFrom_airport_iata") != "missing")).cast(pl.Int8)
            for i in range(1, 4)
        ]).alias("legs0_num_transfers"),
        pl.sum_horizontal([
            ((pl.col(f"legs1_segments{i}_departureFrom_airport_iata").is_not_null()) &
             (pl.col(f"legs1_segments{i}_departureFrom_airport_iata") != "missing")).cast(pl.Int8)
            for i in range(1, 4)
        ]).alias("legs1_num_transfers")
    ])

    df = df.with_columns([
        (pl.col("legs0_num_transfers") + pl.col("legs1_num_transfers")).fill_null(0).cast(pl.Int64).alias("total_num_transfers"),
        ((pl.col("legs0_num_transfers") + pl.col("legs1_num_transfers")) > 0).cast(pl.Int8).alias("has_transfer")
    ])

    selected_df = df.filter(pl.col(target_col) == 1)

    agg_cols = [
        "totalPrice", "taxes", "legs0_duration", "legs1_duration",
        "cabin_class", "total_num_transfers"
    ] + [f"{col}_hour" for col in time_cols]

    fallback_expr = lambda col, alias, dtype, default: (
        pl.col(col)
        .value_counts(sort=True)
        .struct.field(col)
        .first()
        .cast(dtype)
        .fill_null(default)
        .alias(alias)
    )

    global_stats = selected_df.select([
        *(pl.mean(c).alias(f"global_mean_{c}") for c in agg_cols),
        pl.len().alias("global_selected_count"),
        fallback_expr("cabin_class", "global_mode_cabin_class", pl.Int32, 0),
        fallback_expr("has_transfer", "global_mode_has_transfer", pl.Int8, 0),
        fallback_expr("total_num_transfers", "global_mode_transfer_num", pl.Int64, 0),
    ]).to_dicts()[0]

    for k, v in global_stats.items():
        if v is None:
            global_stats[k] = 0

    comb_group_stats = (
        selected_df
        .group_by([comb_col])
        .agg([
            *(pl.sum(c).alias(f"sum_{c}") for c in agg_cols),
            pl.len().alias("count"),
            pl.col("cabin_class").value_counts(sort=True).struct.field("cabin_class").first().cast(pl.Int32).alias("mode_cabin_class"),
            pl.col("has_transfer").value_counts(sort=True).struct.field("has_transfer").first().cast(pl.Int8).alias("mode_has_transfer"),
            pl.col("total_num_transfers").value_counts(sort=True).struct.field("total_num_transfers").first().cast(pl.Int64).alias("mode_transfer_num")
        ])
    )

    df = df.join(comb_group_stats, on=comb_col, how="left")

    total_occurrences_df = df.select([company_col, ranker_col]).unique()
    company_ranker_counts = total_occurrences_df.group_by(company_col).agg(pl.count().alias("total_occurrences"))
    df = df.join(company_ranker_counts, on=company_col, how="left")

    loo_exprs = []
    for col in agg_cols:
        fallback = global_stats[f"global_mean_{col}"]
        loo_exprs.append(
            (
                (pl.col(f"sum_{col}") - pl.when(pl.col(target_col) == 1).then(pl.col(col)).otherwise(0)) /
                (pl.col("count") - pl.col(target_col))
            ).fill_nan(fallback).fill_null(fallback).alias(f"{company_col}_loo_mean_{col}")
        )

    loo_exprs.append(
        (pl.col("count") - pl.col(target_col)).fill_null(global_stats["global_selected_count"]).alias(f"{company_col}_loo_selected_count")
    )

    loo_exprs.append(
        pl.col("total_occurrences").fill_null(0).alias(f"{company_col}_total_occurrences")
    )

    for mode_col, dtype in zip([
        "cabin_class", "has_transfer", "transfer_num"
    ], [pl.Int32, pl.Int8, pl.Int64]):
        fallback = global_stats[f"global_mode_{mode_col}"]
        loo_exprs.append(
            pl.col(f"mode_{mode_col}").cast(dtype)
            .fill_null(fallback)
            .alias(f"{company_col}_mode_{mode_col}")
        )

    df = df.with_columns(loo_exprs)
    # ===== 產生 residual：orig - loo_mean =====
    resid_exprs = []
    for col in agg_cols:
        loo_col = f"{company_col}_loo_mean_{col}"
        if loo_col in df.columns:
            resid_exprs.append(
                (pl.col(col) - pl.col(loo_col)).alias(f"{company_col}_resid_{col}")
            )
    df = df.with_columns(resid_exprs)

    # 要保留的非均值先驗（count/mode/occurrences）
    keep_side_cols = [
        f"{company_col}_loo_selected_count",
        f"{company_col}_total_occurrences",
        f"{company_col}_mode_cabin_class",
        f"{company_col}_mode_has_transfer",
        f"{company_col}_mode_transfer_num",
    ]

    # 丟掉：原始數值欄 + 所有 loo_mean_* 欄位（只留 residual 與 side cols）
    drop_cols = [c for c in agg_cols if c in df.columns]
    drop_cols += [f"{company_col}_loo_mean_{c}" for c in agg_cols if f"{company_col}_loo_mean_{c}" in df.columns]

    df = df.drop(drop_cols)

    # 最後輸出：Id + residuals + side cols
    resid_cols = [f"{company_col}_resid_{c}" for c in agg_cols if f"{company_col}_resid_{c}" in df.columns]
    output_cols = ["Id"] + resid_cols + [c for c in keep_side_cols if c in df.columns]
    df = df.select(output_cols)

    # 儲存
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        df_path = os.path.join(output_dir, "15_companyID_searchRoute_features_resid.parquet")
        df.write_parquet(df_path)
        print(f"✅ 已儲存 features: {df_path}")

    # loo_feat_cols = [e.meta.output_name() for e in loo_exprs]

    # anchor = df.filter(pl.col(target_col) == 1).select([ranker_col] + loo_feat_cols).unique(subset=[ranker_col])

    # df = df.drop(loo_feat_cols)
    # df = df.join(anchor, on=ranker_col, how="left")
    # df = df.select(["Id"] + loo_feat_cols)

    # if output_dir:
    #     os.makedirs(output_dir, exist_ok=True)
    #     df_path = os.path.join(output_dir, "12_companyID_searchRoute_features.parquet")
    #     df.write_parquet(df_path)
    #     print(f"✅ 已儲存 features: {df_path}")
    return df


In [14]:
import os
import pickle
from typing import Optional, Dict,Tuple
import polars as pl
from tqdm import tqdm
def searchRoute_encoding(
    df: pl.DataFrame,
    transform_config: Dict,
    id_col: str = "Id",
) -> pl.DataFrame:
    if id_col not in df.columns:
        raise ValueError(f"'{id_col}' 不存在於df.columns，無法作為主鍵")
    if transform_config is None:
        raise ValueError("transform_config 為必填，請提供已訓練的編碼映射字典")

    label_encoders = transform_config["label_encoders"]
    df_result = df.select([id_col, "searchRoute"])

    df_result = df_result.with_columns([
        pl.col("searchRoute").cast(pl.Utf8)
    ])

    enc = label_encoders["searchRoute"]
    mapping_df = pl.DataFrame({
            "value": enc["values"],
            "rank_id": enc["codes"]
    })
    df_col = (
        df.select([id_col, "searchRoute"])
        .with_columns(pl.col("searchRoute").cast(pl.Utf8))
        .join(mapping_df.rename({"value": "searchRoute"}), on="searchRoute", how="left")
        .with_columns(
            pl.col("rank_id").fill_null(-1).cast(pl.Int32).alias("searchRoute")
        )
        .drop("rank_id")
    )
    df_result = df_result.drop("searchRoute")
    df_result = df_result.join(df_col, on=id_col, how="left")

    print("✅ Label Encoding 完成 (僅 searchRoute)")
    return df_result


In [9]:
def build_company_loo_features(
    df: pl.DataFrame,
    comb_col_min_count = 20,
    searchRouter_encoded_path = "data/extra_features/transform_config_rank.pkl",
    neighbors_dict: Optional[str] = None,
    unseen_data_neighbors_companyID_dict: Optional[str] = None,
    output_dir: Optional[str] = None,
    transform_dict: Optional[Dict] = None
) -> Tuple[pl.DataFrame, Optional[Dict]]:
    target_col = "selected"
    company_col = "companyID"
    route_col = "searchRoute"
    comb_col = "companyID_searchRoute"
    ranker_col = "ranker_id"

    # === 讀取 searchRoute encoder 並編碼 ===
    with open(searchRouter_encoded_path, "rb") as f:
        config = pickle.load(f)
    encoded_searchRoute = searchRoute_encoding(df, config)
    df = df.drop(route_col)
    df = df.join(encoded_searchRoute, on="Id", how="left")

    # === 建 comb key ===
    df = df.with_columns([
        (pl.col(company_col).cast(pl.Utf8) + "::" + pl.col(route_col).cast(pl.Utf8)).alias(comb_col)
    ])

    # === 時長字串轉數值（若需要） ===
    for dur_col in ["legs0_duration", "legs1_duration"]:
        if dur_col in df.columns and df[dur_col].dtype == pl.Utf8:
            df = df.with_columns(
                pl.when(pl.col(dur_col).is_in([None, "missing"]))
                .then(0)
                .otherwise(
                    pl.col(dur_col).str.extract(r"^(\d+):", 1).cast(pl.Int64) * 60 +
                    pl.col(dur_col).str.extract(r":(\d+):", 1).cast(pl.Int64)
                )
                .alias(dur_col)
            )

    # === 時間特徵（小時） ===
    time_cols = ["legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]
    time_exprs = []
    for col in time_cols:
        cleaned_col = (
            pl.when(pl.col(col).is_in(["missing", None, ""])).then(None).otherwise(pl.col(col))
        )
        dt = cleaned_col.str.to_datetime(strict=False)
        h = dt.dt.hour()
        time_exprs.append(h.fill_null(-1).alias(f"{col}_hour"))
    df = df.with_columns(time_exprs)

    # === 其他原始特徵 ===
    if "legs0_segments0_cabinClass" in df.columns:
        df = df.with_columns(pl.col("legs0_segments0_cabinClass").cast(pl.Float32).alias("cabin_class"))
    else:
        df = df.with_columns(pl.lit(None, dtype=pl.Float32).alias("cabin_class"))

    df = df.with_columns([
        pl.sum_horizontal([
            ((pl.col(f"legs0_segments{i}_departureFrom_airport_iata").is_not_null()) &
             (pl.col(f"legs0_segments{i}_departureFrom_airport_iata") != "missing")).cast(pl.Int8)
            for i in range(1, 4)
        ]).alias("legs0_num_transfers"),
        pl.sum_horizontal([
            ((pl.col(f"legs1_segments{i}_departureFrom_airport_iata").is_not_null()) &
             (pl.col(f"legs1_segments{i}_departureFrom_airport_iata") != "missing")).cast(pl.Int8)
            for i in range(1, 4)
        ]).alias("legs1_num_transfers")
    ])
    df = df.with_columns([
        (pl.col("legs0_num_transfers") + pl.col("legs1_num_transfers")).fill_null(0).cast(pl.Int64).alias("total_num_transfers"),
        ((pl.col("legs0_num_transfers") + pl.col("legs1_num_transfers")) > 0).cast(pl.Int8).alias("has_transfer")
    ])

    # === 需要殘差化的原始欄位 ===
    agg_cols = [
        "totalPrice", "taxes", "legs0_duration", "legs1_duration",
        "cabin_class", "total_num_transfers"
    ] + [f"{c}_hour" for c in time_cols]

    # ============ 訓練：建立 LOO 統計（含鄰居 fallback） ============
    if transform_dict is None:
        # company_route 聚合（selected==1）
        all_stats = (
            df.filter(pl.col(target_col) == 1)
            .group_by(comb_col)
            .agg([
                *(pl.mean(c).alias(f"{c}_mean") for c in agg_cols),
                pl.count().alias("selected_count")
            ])
        )

        # 低樣本的 group → 鄰居回退
        low_count_keys = (
            all_stats
            .filter(pl.col("selected_count") < comb_col_min_count)
            .select(comb_col)
            .to_series()
            .to_list()
        )

        fallback_rows = []
        if neighbors_dict:
            for key in tqdm(low_count_keys, desc="Fallback 運算中"):
                neighbors = neighbors_dict.get(key, [])
                neighbor_rows = df.filter(
                    (pl.col(comb_col).is_in(neighbors)) & (pl.col(target_col) == 1)
                )
                if neighbor_rows.height == 0:
                    continue
                mean_row = neighbor_rows.select([
                    *(pl.mean(col).alias(f"{col}_mean") for col in agg_cols),
                    pl.count().alias("selected_count")
                ])
                mean_row = mean_row.with_columns([pl.lit(key).alias(comb_col)])
                cols = [comb_col] + [f"{col}_mean" for col in agg_cols] + ["selected_count"]
                mean_row = mean_row.select(cols)
                fallback_rows.append(mean_row)

        if fallback_rows:
            fallback_df = pl.concat(fallback_rows, how="vertical")
            all_stats = pl.concat([
                all_stats.filter(~pl.col(comb_col).is_in(low_count_keys)),
                fallback_df
            ], how="vertical")

        # mode tables（selected==1）
        def mode_table(gb_col, col, alias, dtype):
            return (
                df.filter(pl.col(target_col) == 1)
                .group_by(gb_col)
                .agg([
                    pl.col(col).value_counts(sort=True).struct.field(col).first().cast(dtype).alias(alias)
                ])
            )
        cabin_mode       = mode_table(comb_col, "cabin_class",       "mode_cabin_class",       pl.Int32)
        transfer_mode    = mode_table(comb_col, "has_transfer",      "mode_has_transfer",      pl.Int8)
        transfer_num_mode= mode_table(comb_col, "total_num_transfers","mode_transfer_num",      pl.Int64)

        # company 出現次數（以 ranker 粒度去重）
        company_occurrence_df = (
            df.select([company_col, ranker_col]).unique()
              .group_by(company_col).agg(pl.count().alias("total_occurrences"))
        )
        df = df.join(company_occurrence_df, on=company_col, how="left").with_columns(
            pl.col("total_occurrences").alias("companyID_total_occurrences")
        )

        # === 連回均值先驗 + mode ===
        df = df.join(all_stats, on=comb_col, how="left")
        df = df.join(cabin_mode, on=comb_col, how="left")
        df = df.join(transfer_mode, on=comb_col, how="left")
        df = df.join(transfer_num_mode, on=comb_col, how="left")

        # 將均值欄位統一加前綴（companyID_loo_mean_*）
        df = df.with_columns([
            pl.col(f"{c}_mean").alias(f"{company_col}_loo_mean_{c}") for c in agg_cols if f"{c}_mean" in df.columns
        ] + [
            pl.col("selected_count").alias(f"{company_col}_loo_selected_count")
        ])

        # 將 mode 欄位改名加前綴（作為 side cols）
        df = df.rename({
            "mode_cabin_class":       f"{company_col}_mode_cabin_class",
            "mode_has_transfer":      f"{company_col}_mode_has_transfer",
            "mode_transfer_num":      f"{company_col}_mode_transfer_num",
        })

        # === 產生 residual：orig - loo_mean ===
        resid_exprs = []
        for col in agg_cols:
            loo_col = f"{company_col}_loo_mean_{col}"
            if (col in df.columns) and (loo_col in df.columns):
                resid_exprs.append((pl.col(col) - pl.col(loo_col)).alias(f"{company_col}_resid_{col}"))
        if resid_exprs:
            df = df.with_columns(resid_exprs)

        # === 只保留 residual 與 side cols ===
        # 1) side cols
        keep_side_cols = [
            f"{company_col}_loo_selected_count",
            "companyID_total_occurrences",
            f"{company_col}_mode_cabin_class",
            f"{company_col}_mode_has_transfer",
            f"{company_col}_mode_transfer_num",
        ]
        # 2) residual cols
        resid_cols = [f"{company_col}_resid_{c}" for c in agg_cols if f"{company_col}_resid_{c}" in df.columns]

        # 丟掉原始與均值欄
        drop_cols = [c for c in agg_cols if c in df.columns] \
                  + [f"{company_col}_loo_mean_{c}" for c in agg_cols if f"{company_col}_loo_mean_{c}" in df.columns]
        df = df.drop([c for c in drop_cols if c in df.columns])

        # 最終輸出
        output_cols = ["Id"] + resid_cols + [c for c in keep_side_cols if c in df.columns]
        df = df.select(output_cols)

        # 存 transform_dict
        transform_dict = {
            "all_stats": all_stats.to_dict(as_series=False),
            "cabin_mode": cabin_mode.to_dict(as_series=False),
            "transfer_mode": transfer_mode.to_dict(as_series=False),
            "transfer_num_mode": transfer_num_mode.to_dict(as_series=False),
            "company_occurrences": company_occurrence_df.to_dict(as_series=False),
            "agg_cols": agg_cols  # 供推論側一致性
        }

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            df_path = os.path.join(output_dir, "13_companyID_searchRoute_resid_features.parquet")
            df.write_parquet(df_path)
            print(f"✅ 已儲存 residual features: {df_path}")
            config_path = os.path.join(output_dir, "transform_dict_companyID_searchRoute.pkl")
            with open(config_path, "wb") as f:
                pickle.dump(transform_dict, f)
            print(f"✅ 已儲存 transform_dict: {config_path}")

        return df, transform_dict

    # ============ 推論：載入 transform_dict 並殘差化 ============
    else:
        all_stats = pl.DataFrame(transform_dict["all_stats"])
        cabin_mode = pl.DataFrame(transform_dict["cabin_mode"])
        transfer_mode = pl.DataFrame(transform_dict["transfer_mode"])
        transfer_num_mode = pl.DataFrame(transform_dict["transfer_num_mode"])
        company_occurrences = pl.DataFrame(transform_dict["company_occurrences"])
        agg_cols = transform_dict.get("agg_cols", [])  # 與訓練側一致

        # unseen company/company-route 的回退（沿用你原本的處理）
        if unseen_data_neighbors_companyID_dict:
            all_stats_df = pl.DataFrame(transform_dict["all_stats"])
            unseen_company_ids = set(unseen_data_neighbors_companyID_dict)

            fallback_rows = []
            fallback_occurrence_rows = []

            for company_id in tqdm(unseen_company_ids, desc="處理 unseen companyID"):
                related_routes = (
                    df.filter(pl.col(company_col).cast(pl.Utf8) == str(company_id))
                      .select(route_col).unique().to_series().to_list()
                )
                neighbors = unseen_data_neighbors_companyID_dict[company_id]

                neighbor_stats = all_stats_df.filter(
                    pl.col(comb_col).str.split("::").list.get(0).cast(pl.Utf8).is_in([str(n) for n in neighbors])
                )
                if neighbor_stats.height == 0:
                    continue

                mean_row = neighbor_stats.select([
                    *(pl.mean(f"{c}_mean").alias(f"{c}_mean") for c in agg_cols),
                    pl.mean("selected_count").alias("selected_count")
                ])

                for route in related_routes:
                    fallback_key = f"{company_id}::{route}"
                    fallback_row = mean_row.with_columns([pl.lit(fallback_key).alias(comb_col)])
                    fallback_row = fallback_row.select([comb_col] + [c for c in fallback_row.columns if c != comb_col])
                    fallback_rows.append(fallback_row)

                neighbor_occ = company_occurrences.filter(
                    pl.col(company_col).cast(pl.Utf8).is_in([str(n) for n in neighbors])
                )
                if neighbor_occ.height > 0:
                    mean_occ = neighbor_occ.select(
                        pl.mean("total_occurrences").alias("total_occurrences")
                    ).with_columns([pl.lit(str(company_id)).alias("companyID")]).select(["companyID", "total_occurrences"])
                    expected_schema = company_occurrences.schema
                    mean_occ = mean_occ.cast(expected_schema)
                    fallback_occurrence_rows.append(mean_occ)

            if fallback_occurrence_rows:
                fallback_occurrence_df = pl.concat(fallback_occurrence_rows, how="vertical")
                company_occurrences = pl.concat([company_occurrences, fallback_occurrence_df], how="vertical")

            if fallback_rows:
                fallback_df = pl.concat(fallback_rows, how="vertical")
                fallback_df = fallback_df.select(all_stats_df.columns).cast(all_stats_df.schema)
                all_stats = pl.concat([all_stats_df, fallback_df], how="vertical")
            else:
                all_stats = all_stats_df

        # 回連 occurrences / 均值 / mode
        df = df.join(company_occurrences, on=company_col, how="left").with_columns(
            pl.col("total_occurrences").alias("companyID_total_occurrences")
        )
        df = df.join(all_stats, on=comb_col, how="left")
        df = df.join(cabin_mode, on=comb_col, how="left")
        df = df.join(transfer_mode, on=comb_col, how="left")
        df = df.join(transfer_num_mode, on=comb_col, how="left")

        # 均值欄位加前綴；selected_count 也加前綴
        df = df.with_columns([
            pl.col(f"{c}_mean").alias(f"{company_col}_loo_mean_{c}") for c in agg_cols if f"{c}_mean" in df.columns
        ] + [
            pl.col("selected_count").alias(f"{company_col}_loo_selected_count")
        ])

        # mode 欄位改名加前綴
        df = df.rename({
            "mode_cabin_class":       f"{company_col}_mode_cabin_class",
            "mode_has_transfer":      f"{company_col}_mode_has_transfer",
            "mode_transfer_num":      f"{company_col}_mode_transfer_num",
        })

        # 產生 residual
        resid_exprs = []
        for col in agg_cols:
            loo_col = f"{company_col}_loo_mean_{col}"
            if (col in df.columns) and (loo_col in df.columns):
                resid_exprs.append((pl.col(col) - pl.col(loo_col)).alias(f"{company_col}_resid_{col}"))
        if resid_exprs:
            df = df.with_columns(resid_exprs)

        # 只保留 residual + side cols
        keep_side_cols = [
            f"{company_col}_loo_selected_count",
            "companyID_total_occurrences",
            f"{company_col}_mode_cabin_class",
            f"{company_col}_mode_has_transfer",
            f"{company_col}_mode_transfer_num",
        ]
        resid_cols = [f"{company_col}_resid_{c}" for c in agg_cols if f"{company_col}_resid_{c}" in df.columns]

        drop_cols = [c for c in agg_cols if c in df.columns] \
                  + [f"{company_col}_loo_mean_{c}" for c in agg_cols if f"{company_col}_loo_mean_{c}" in df.columns]
        df = df.drop([c for c in drop_cols if c in df.columns])

        output_cols = ["Id"] + resid_cols + [c for c in keep_side_cols if c in df.columns]
        df = df.select(output_cols)

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            df_path = os.path.join(output_dir, "13_companyID_searchRoute_resid_features.parquet")
            df.write_parquet(df_path)
            print(f"✅ 已儲存 residual features: {df_path}")

        return df, transform_dict


In [32]:
def _add_resid_ranks(
    df: pl.DataFrame,
    resid_cols: List[str],
    ranker_col: str,
    *,
    method: str = "dense",
    nulls_last: bool = True,
) -> pl.DataFrame:
    """
    以 select() 單獨計算每個 residual 的 group rank，避免 with_columns + window 的限制。
    回傳的 df 會多出 rc + f"{rc}_rank" 欄位（rank 為 Int32，數值小→名次小）。
    """
    if not resid_cols:
        return df

    # 只取計算排名需要的欄位
    base = df.select(["Id", ranker_col] + resid_cols)

    # 為每個 resid 建 rank 表達式
    rank_exprs = []
    for rc in resid_cols:
        x = pl.col(rc)
        if nulls_last:
            # 組內最大值 + 1，將 null 推到最後；若整組皆 null，max 仍為 null → 跟著是 null（合理）
            grp_max = x.max().over(ranker_col).fill_null(float("inf"))
            x = x.fill_null(grp_max + 1.0)

        rank_exprs.append(
            x.rank(method=method, descending=False)
             .over(ranker_col)
             .cast(pl.Int32)
             .alias(f"{rc}_rank")
        )

    ranks = base.with_columns(rank_exprs).select(["Id"] + [f"{rc}_rank" for rc in resid_cols])

    # 把 rank 結果 join 回原 df
    df = df.join(ranks, on="Id", how="left")
    return df


In [33]:
from typing import Optional, Dict, Tuple, List
import polars as pl
import pickle
import os
from tqdm import tqdm

def build_company_loo_features(
    df: pl.DataFrame,
    comb_col_min_count: int = 20,
    searchRouter_encoded_path: str = "data/extra_features/transform_config_rank.pkl",
    neighbors_dict: Optional[Dict[str, List[str]]] = None,
    unseen_data_neighbors_companyID_dict: Optional[Dict[str, List[str]]] = None,
    output_dir: Optional[str] = None,
    transform_dict: Optional[Dict] = None,
    *,
    add_rank: bool = True,
    rank_method: str = "dense",     # "dense" | "ordinal" | "average" | "min" | "max"
    rank_nulls_last: bool = True
) -> Tuple[pl.DataFrame, Optional[Dict]]:
    """
    產生 companyID_searchRoute 的 LOO 統計，殘差特徵（orig - loo_mean），
    並在 residuals 基礎上依 ranker_id 做群組排名，數值越小 rank 越靠前（=1）。
    """
    target_col = "selected"
    company_col = "companyID"
    route_col = "searchRoute"
    comb_col = "companyID_searchRoute"
    ranker_col = "ranker_id"

    # === 讀取 searchRoute encoder 並編碼 ===
    with open(searchRouter_encoded_path, "rb") as f:
        config = pickle.load(f)
    encoded_searchRoute = searchRoute_encoding(df, config)
    df = df.drop(route_col)
    df = df.join(encoded_searchRoute, on="Id", how="left")

    # === 建 comb key ===
    df = df.with_columns([
        (pl.col(company_col).cast(pl.Utf8) + "::" + pl.col(route_col).cast(pl.Utf8)).alias(comb_col)
    ])

    # === 時長字串轉數值（若需要） ===
    for dur_col in ["legs0_duration", "legs1_duration"]:
        if dur_col in df.columns and df[dur_col].dtype == pl.Utf8:
            df = df.with_columns(
                pl.when(pl.col(dur_col).is_in([None, "missing"]))
                .then(0)
                .otherwise(
                    pl.col(dur_col).str.extract(r"^(\d+):", 1).cast(pl.Int64) * 60 +
                    pl.col(dur_col).str.extract(r":(\d+):", 1).cast(pl.Int64)
                )
                .alias(dur_col)
            )

    # === 時間特徵（小時） ===
    time_cols = ["legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]
    time_exprs = []
    for col in time_cols:
        cleaned_col = (
            pl.when(pl.col(col).is_in(["missing", None, ""])).then(None).otherwise(pl.col(col))
        )
        dt = cleaned_col.str.to_datetime(strict=False)
        h = dt.dt.hour()
        time_exprs.append(h.fill_null(-1).alias(f"{col}_hour"))
    df = df.with_columns(time_exprs)

    # === 其他原始特徵 ===
    if "legs0_segments0_cabinClass" in df.columns:
        df = df.with_columns(pl.col("legs0_segments0_cabinClass").cast(pl.Float32).alias("cabin_class"))
    else:
        df = df.with_columns(pl.lit(None, dtype=pl.Float32).alias("cabin_class"))

    df = df.with_columns([
        pl.sum_horizontal([
            ((pl.col(f"legs0_segments{i}_departureFrom_airport_iata").is_not_null()) &
             (pl.col(f"legs0_segments{i}_departureFrom_airport_iata") != "missing")).cast(pl.Int8)
            for i in range(1, 4)
        ]).alias("legs0_num_transfers"),
        pl.sum_horizontal([
            ((pl.col(f"legs1_segments{i}_departureFrom_airport_iata").is_not_null()) &
             (pl.col(f"legs1_segments{i}_departureFrom_airport_iata") != "missing")).cast(pl.Int8)
            for i in range(1, 4)
        ]).alias("legs1_num_transfers")
    ])
    df = df.with_columns([
        (pl.col("legs0_num_transfers") + pl.col("legs1_num_transfers")).fill_null(0).cast(pl.Int64).alias("total_num_transfers"),
        ((pl.col("legs0_num_transfers") + pl.col("legs1_num_transfers")) > 0).cast(pl.Int8).alias("has_transfer")
    ])

    # === 需要殘差化的原始欄位 ===
    agg_cols = [
        "totalPrice", "taxes", "legs0_duration", "legs1_duration",
        "cabin_class", "total_num_transfers"
    ] + [f"{c}_hour" for c in time_cols]

    # ============ 訓練：建立 LOO 統計（含鄰居 fallback） ============
    if transform_dict is None:
        # company_route 聚合（selected==1）
        all_stats = (
            df.filter(pl.col(target_col) == 1)
            .group_by(comb_col)
            .agg([
                *(pl.mean(c).alias(f"{c}_mean") for c in agg_cols),
                pl.count().alias("selected_count")
            ])
        )

        # 低樣本的 group → 鄰居回退
        low_count_keys = (
            all_stats
            .filter(pl.col("selected_count") < comb_col_min_count)
            .select(comb_col)
            .to_series()
            .to_list()
        )

        fallback_rows = []
        if neighbors_dict:
            for key in tqdm(low_count_keys, desc="Fallback 運算中"):
                neighbors = neighbors_dict.get(key, [])
                neighbor_rows = df.filter(
                    (pl.col(comb_col).is_in(neighbors)) & (pl.col(target_col) == 1)
                )
                if neighbor_rows.height == 0:
                    continue
                mean_row = neighbor_rows.select([
                    *(pl.mean(col).alias(f"{col}_mean") for col in agg_cols),
                    pl.count().alias("selected_count")
                ])
                mean_row = mean_row.with_columns([pl.lit(key).alias(comb_col)])
                cols = [comb_col] + [f"{col}_mean" for col in agg_cols] + ["selected_count"]
                mean_row = mean_row.select(cols)
                fallback_rows.append(mean_row)

        if fallback_rows:
            fallback_df = pl.concat(fallback_rows, how="vertical")
            all_stats = pl.concat([
                all_stats.filter(~pl.col(comb_col).is_in(low_count_keys)),
                fallback_df
            ], how="vertical")

        # mode tables（selected==1）
        def mode_table(gb_col, col, alias, dtype):
            return (
                df.filter(pl.col(target_col) == 1)
                .group_by(gb_col)
                .agg([
                    pl.col(col).value_counts(sort=True).struct.field(col).first().cast(dtype).alias(alias)
                ])
            )
        cabin_mode        = mode_table(comb_col, "cabin_class",        "mode_cabin_class",        pl.Int32)
        transfer_mode     = mode_table(comb_col, "has_transfer",       "mode_has_transfer",       pl.Int8)
        transfer_num_mode = mode_table(comb_col, "total_num_transfers","mode_transfer_num",       pl.Int64)

        # company 出現次數（以 ranker 粒度去重）
        company_occurrence_df = (
            df.select([company_col, ranker_col]).unique()
              .group_by(company_col).agg(pl.count().alias("total_occurrences"))
        )
        df = df.join(company_occurrence_df, on=company_col, how="left").with_columns(
            pl.col("total_occurrences").alias("companyID_total_occurrences")
        )

        # === 連回均值先驗 + mode ===
        df = df.join(all_stats, on=comb_col, how="left")
        df = df.join(cabin_mode, on=comb_col, how="left")
        df = df.join(transfer_mode, on=comb_col, how="left")
        df = df.join(transfer_num_mode, on=comb_col, how="left")

        # 將均值欄位統一加前綴（companyID_loo_mean_*）
        df = df.with_columns([
            pl.col(f"{c}_mean").alias(f"{company_col}_loo_mean_{c}") for c in agg_cols if f"{c}_mean" in df.columns
        ] + [
            pl.col("selected_count").alias(f"{company_col}_loo_selected_count")
        ])

        # 將 mode 欄位改名加前綴（作為 side cols）
        df = df.rename({
            "mode_cabin_class":       f"{company_col}_mode_cabin_class",
            "mode_has_transfer":      f"{company_col}_mode_has_transfer",
            "mode_transfer_num":      f"{company_col}_mode_transfer_num",
        })

        # === 產生 residual：orig - loo_mean ===
        resid_exprs = []
        for col in agg_cols:
            loo_col = f"{company_col}_loo_mean_{col}"
            if (col in df.columns) and (loo_col in df.columns):
                resid_exprs.append((pl.col(col) - pl.col(loo_col)).alias(f"{company_col}_resid_{col}"))
        if resid_exprs:
            df = df.with_columns(resid_exprs)
        # === 產生 residual ranks（依 ranker_id） ===
        resid_cols = [f"{company_col}_resid_{c}" for c in agg_cols if f"{company_col}_resid_{c}" in df.columns]
        rank_exprs = []
        if add_rank and resid_cols:
            for rc in resid_cols:
                base = pl.col(rc)
                if rank_nulls_last:
                    group_max = base.max().over(ranker_col).fill_null(float("inf"))
                    base = base.fill_null(group_max + 1.0)

                rank_exprs.append(
                    base.rank(method=rank_method, descending=False)
                        .over(ranker_col)
                        .cast(pl.Int32)
                        .alias(f"{rc}_rank")
                )
        df = df.with_columns(rank_exprs)


        # === 只保留 residual 與 side cols（含 ranks） ===
        keep_side_cols = [
            # f"{company_col}_loo_selected_count",
            "companyID_total_occurrences",
            # f"{company_col}_mode_cabin_class",
            # f"{company_col}_mode_has_transfer",
            # f"{company_col}_mode_transfer_num",
        ]
        rank_cols = [f"{rc}_rank" for rc in resid_cols if f"{rc}_rank" in df.columns]

        # 丟掉原始與均值欄
        drop_cols = [c for c in agg_cols if c in df.columns] \
                  + [f"{company_col}_loo_mean_{c}" for c in agg_cols if f"{company_col}_loo_mean_{c}" in df.columns]
        df = df.drop([c for c in drop_cols if c in df.columns])

        # 最終輸出
        output_cols = ["Id"] + resid_cols + rank_cols + [c for c in keep_side_cols if c in df.columns]
        df = df.select(output_cols)

        # 存 transform_dict
        transform_dict = {
            "all_stats": all_stats.to_dict(as_series=False),
            "cabin_mode": cabin_mode.to_dict(as_series=False),
            "transfer_mode": transfer_mode.to_dict(as_series=False),
            "transfer_num_mode": transfer_num_mode.to_dict(as_series=False),
            "company_occurrences": company_occurrence_df.to_dict(as_series=False),
            "agg_cols": agg_cols
        }

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            df_path = os.path.join(output_dir, "13_companyID_searchRoute_resid_features.parquet")
            df.write_parquet(df_path)
            print(f"✅ 已儲存 residual features: {df_path}")
            config_path = os.path.join(output_dir, "transform_dict_companyID_searchRoute.pkl")
            with open(config_path, "wb") as f:
                pickle.dump(transform_dict, f)
            print(f"✅ 已儲存 transform_dict: {config_path}")

        return df, transform_dict

    # ============ 推論：載入 transform_dict 並殘差化 ============
    else:
        all_stats = pl.DataFrame(transform_dict["all_stats"])
        cabin_mode = pl.DataFrame(transform_dict["cabin_mode"])
        transfer_mode = pl.DataFrame(transform_dict["transfer_mode"])
        transfer_num_mode = pl.DataFrame(transform_dict["transfer_num_mode"])
        company_occurrences = pl.DataFrame(transform_dict["company_occurrences"])
        agg_cols = transform_dict.get("agg_cols", [])

        # unseen company/company-route 的回退（沿用你原本的處理）
        if unseen_data_neighbors_companyID_dict:
            all_stats_df = pl.DataFrame(transform_dict["all_stats"])
            unseen_company_ids = set(unseen_data_neighbors_companyID_dict)
            fallback_rows = []
            fallback_occurrence_rows = []

            for company_id in tqdm(unseen_company_ids, desc="處理 unseen companyID"):
                related_routes = (
                    df.filter(pl.col(company_col).cast(pl.Utf8) == str(company_id))
                      .select(route_col).unique().to_series().to_list()
                )
                neighbors = unseen_data_neighbors_companyID_dict[company_id]

                neighbor_stats = all_stats_df.filter(
                    pl.col(comb_col).str.split("::").list.get(0).cast(pl.Utf8).is_in([str(n) for n in neighbors])
                )
                if neighbor_stats.height == 0:
                    continue

                mean_row = neighbor_stats.select([
                    *(pl.mean(f"{c}_mean").alias(f"{c}_mean") for c in agg_cols),
                    pl.mean("selected_count").alias("selected_count")
                ])

                for route in related_routes:
                    fallback_key = f"{company_id}::{route}"
                    fallback_row = mean_row.with_columns([pl.lit(fallback_key).alias(comb_col)])
                    fallback_row = fallback_row.select([comb_col] + [c for c in fallback_row.columns if c != comb_col])
                    fallback_rows.append(fallback_row)

                neighbor_occ = company_occurrences.filter(
                    pl.col(company_col).cast(pl.Utf8).is_in([str(n) for n in neighbors])
                )
                if neighbor_occ.height > 0:
                    mean_occ = neighbor_occ.select(
                        pl.mean("total_occurrences").alias("total_occurrences")
                    ).with_columns([pl.lit(str(company_id)).alias("companyID")]).select(["companyID", "total_occurrences"])
                    expected_schema = company_occurrences.schema
                    mean_occ = mean_occ.cast(expected_schema)
                    fallback_occurrence_rows.append(mean_occ)

            if fallback_occurrence_rows:
                fallback_occurrence_df = pl.concat(fallback_occurrence_rows, how="vertical")
                company_occurrences = pl.concat([company_occurrences, fallback_occurrence_df], how="vertical")

            if fallback_rows:
                fallback_df = pl.concat(fallback_rows, how="vertical")
                fallback_df = fallback_df.select(all_stats_df.columns).cast(all_stats_df.schema)
                all_stats = pl.concat([all_stats_df, fallback_df], how="vertical")
            else:
                all_stats = all_stats_df

        # 回連 occurrences / 均值 / mode
        df = df.join(company_occurrences, on=company_col, how="left").with_columns(
            pl.col("total_occurrences").alias("companyID_total_occurrences")
        )
        df = df.join(all_stats, on=comb_col, how="left")
        df = df.join(cabin_mode, on=comb_col, how="left")
        df = df.join(transfer_mode, on=comb_col, how="left")
        df = df.join(transfer_num_mode, on=comb_col, how="left")

        # 均值欄位加前綴；selected_count 也加前綴
        df = df.with_columns([
            pl.col(f"{c}_mean").alias(f"{company_col}_loo_mean_{c}") for c in agg_cols if f"{c}_mean" in df.columns
        ] + [
            pl.col("selected_count").alias(f"{company_col}_loo_selected_count")
        ])

        # mode 欄位改名加前綴
        df = df.rename({
            "mode_cabin_class":       f"{company_col}_mode_cabin_class",
            "mode_has_transfer":      f"{company_col}_mode_has_transfer",
            "mode_transfer_num":      f"{company_col}_mode_transfer_num",
        })

        # 產生 residual
        resid_exprs = []
        for col in agg_cols:
            loo_col = f"{company_col}_loo_mean_{col}"
            if (col in df.columns) and (loo_col in df.columns):
                resid_exprs.append((pl.col(col) - pl.col(loo_col)).alias(f"{company_col}_resid_{col}"))
        if resid_exprs:
            df = df.with_columns(resid_exprs)

        # === 產生 residual ranks（依 ranker_id） ===
        resid_cols = [f"{company_col}_resid_{c}" for c in agg_cols if f"{company_col}_resid_{c}" in df.columns]
        rank_exprs = []
        if add_rank and resid_cols:
            for rc in resid_cols:
                base = pl.col(rc)
                if rank_nulls_last:
                    group_max = base.max().over(ranker_col).fill_null(float("inf"))
                    base = base.fill_null(group_max + 1.0)

                rank_exprs.append(
                    base.rank(method=rank_method, descending=False)
                        .over(ranker_col)
                        .cast(pl.Int32)
                        .alias(f"{rc}_rank")
                )
        df = df.with_columns(rank_exprs)



        # 只保留 residual + side cols（含 ranks）
        keep_side_cols = [
            # f"{company_col}_loo_selected_count",
            "companyID_total_occurrences",
            # f"{company_col}_mode_cabin_class",
            # f"{company_col}_mode_has_transfer",
            # f"{company_col}_mode_transfer_num",
        ]
        rank_cols = [f"{rc}_rank" for rc in resid_cols if f"{rc}_rank" in df.columns]

        drop_cols = [c for c in agg_cols if c in df.columns] \
                  + [f"{company_col}_loo_mean_{c}" for c in agg_cols if f"{company_col}_loo_mean_{c}" in df.columns]
        df = df.drop([c for c in drop_cols if c in df.columns])

        output_cols = ["Id"] + resid_cols + rank_cols + [c for c in keep_side_cols if c in df.columns]
        df = df.select(output_cols)

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            df_path = os.path.join(output_dir, "13_companyID_searchRoute_resid_features.parquet")
            df.write_parquet(df_path)
            print(f"✅ 已儲存 residual features: {df_path}")

        return df, transform_dict


In [110]:
transform_dict

{'all_stats': {'companyID_searchRoute': ['54163::2912',
   '42620::3399',
   '53268::5363',
   '63394::2945',
   '56912::3301',
   '61295::3130',
   '56901::175',
   '62212::4974',
   '60628::3778',
   '63381::2400',
   '53407::1384',
   '53317::1782',
   '54579::2320',
   '59974::2319',
   '42702::3000',
   '42620::3315',
   '25667::1518',
   '54163::2960',
   '63393::2709',
   '52573::2038',
   '54579::2464',
   '58666::4845',
   '36948::2418',
   '60628::108',
   '63370::677',
   '60482::1756',
   '63418::3137',
   '36948::2400',
   '57496::5365',
   '42188::2709',
   '24728::2709',
   '60482::2957',
   '38150::2996',
   '54218::84',
   '53317::1895',
   '57323::4866',
   '36948::3051',
   '26833::3105',
   '57323::3834',
   '40253::2472',
   '43113::2809',
   '39054::3611',
   '57323::5109',
   '42702::4597',
   '53365::3558',
   '59766::2802',
   '24728::2921',
   '57323::3827',
   '44518::2400',
   '57323::5428',
   '40253::4876',
   '60187::2400',
   '36948::84',
   '57323::4566

In [22]:
# from scripts.feature_enigeer  import build_company_loo_features_loo_train
import pickle

cols_to_keep = [
    "Id",
    "companyID",
    "ranker_id",
    "selected",
    "totalPrice",
    "taxes",
    "legs0_duration",
    "legs1_duration",
    "legs0_departureAt",
    "legs0_arrivalAt",
    "legs1_departureAt",
    "legs1_arrivalAt",
    "legs0_segments0_cabinClass",
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    "legs0_segments0_arrivalTo_airport_iata",
    "legs1_segments0_arrivalTo_airport_iata",
    "searchRoute"
]

train_filled = train_filled.select(cols_to_keep)
# with open("data/extra_features/try/transform_dict_companyID_searchRoute.pkl", "rb") as f:
#     config = pickle.load(f)
    
# train_new = build_company_loo_resid_features_train(train_filled,
#                                                  output_dir= 'data/extra_features/train')
#                                                     #    transform_dict=config)


In [29]:
import json

with open("data/nearest_neigbor/companyID_route_neighbors.json", "r") as f:
    neighbors_dict = json.load(f)

train_new, config = build_company_loo_features(train_filled,
                                               comb_col_min_count=20,
                                               neighbors_dict = neighbors_dict,
                                               output_dir= 'data/extra_features/train')
                                                    #    transform_dict=config)


✅ Label Encoding 完成 (僅 searchRoute)


(Deprecated in version 0.20.5)
  pl.count().alias("selected_count")
(Deprecated in version 0.20.5)
  pl.count().alias("selected_count")
Fallback 運算中: 100%|██████████| 24879/24879 [26:03<00:00, 15.91it/s]    
(Deprecated in version 0.20.5)
  .group_by(company_col).agg(pl.count().alias("total_occurrences"))


InvalidOperationError: window expression not allowed in aggregation

In [None]:
train_new

Id,companyID_resid_totalPrice,companyID_resid_taxes,companyID_resid_legs0_duration,companyID_resid_legs1_duration,companyID_resid_cabin_class,companyID_resid_total_num_transfers,companyID_resid_legs0_departureAt_hour,companyID_resid_legs0_arrivalAt_hour,companyID_resid_legs1_departureAt_hour,companyID_resid_legs1_arrivalAt_hour,companyID_loo_selected_count,companyID_total_occurrences,companyID_mode_cabin_class,companyID_mode_has_transfer,companyID_mode_transfer_num
i64,f64,f64,f64,f64,f32,f64,f64,f64,f64,f64,u32,u32,i32,i8,i64
0,-14888.166667,-1024.966667,-47.333333,-121.0,0.0,-0.3,4.3,5.1,-3.1,-1.966667,30,6113,1,0,0
1,19352.833333,845.033333,237.666667,229.0,0.0,1.7,-1.7,3.1,9.9,-7.966667,30,6113,1,0,0
2,21922.833333,845.033333,237.666667,229.0,0.0,1.7,-1.7,3.1,9.9,-7.966667,30,6113,1,0,0
3,50107.833333,845.033333,237.666667,229.0,0.0,1.7,-1.7,3.1,9.9,-7.966667,30,6113,1,0,0
4,54297.833333,845.033333,237.666667,229.0,0.0,1.7,-1.7,3.1,9.9,-7.966667,30,6113,1,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
18146427,14280.0,4180.0,506.875,726.25,0.0,2.0,8.3125,-7.25,0.5,-11.1875,16,622,1,0,0
18146428,11210.0,4180.0,911.875,646.25,0.0,2.0,-11.6875,3.75,2.5,-11.1875,16,622,1,0,0
18146429,8010.0,4080.0,911.875,726.25,0.0,2.0,-11.6875,3.75,0.5,-11.1875,16,622,1,0,0
18146430,8910.0,4180.0,1031.875,646.25,0.0,2.0,7.3125,0.75,2.5,-11.1875,16,622,1,0,0


In [34]:
# from scripts.feature_enigeer  import build_company_loo_features
import pickle

cols_to_keep = [
    "Id",
    "companyID",
    "ranker_id",
    "selected",
    "totalPrice",
    "taxes",
    "legs0_duration",
    "legs1_duration",
    "legs0_departureAt",
    "legs0_arrivalAt",
    "legs1_departureAt",
    "legs1_arrivalAt",
    "legs0_segments0_cabinClass",
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    "legs0_segments0_arrivalTo_airport_iata",
    "legs1_segments0_arrivalTo_airport_iata",
    "searchRoute"
]
# with open("data/extra_features/train/transform_dict_companyID_searchRoute.pkl", "rb") as f:
#     config = pickle.load(f)
with open("data/nearest_neigbor/unseen_neighbors_companyID.json", "r") as f:
    unseen_neighbors_companyID_dict = json.load(f)
test_filled = test_filled.select(cols_to_keep)
test_new, transform_dict = build_company_loo_features(test_filled,
                                                       comb_col_min_count=20,
                                                       output_dir= "data/extra_features/test",
                                                       unseen_data_neighbors_companyID_dict=unseen_neighbors_companyID_dict,
                                                       transform_dict=config)


✅ Label Encoding 完成 (僅 searchRoute)


處理 unseen companyID: 100%|██████████| 5273/5273 [01:21<00:00, 64.59it/s]


InvalidOperationError: window expression not allowed in aggregation

In [None]:
test_new

Id,companyID,ranker_id,selected,totalPrice,taxes,legs0_duration,legs1_duration,legs0_departureAt,legs0_arrivalAt,legs1_departureAt,legs1_arrivalAt,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments1_departureFrom_airport_iata,legs0_segments2_departureFrom_airport_iata,legs0_segments3_departureFrom_airport_iata,legs1_segments0_departureFrom_airport_iata,legs1_segments1_departureFrom_airport_iata,legs1_segments2_departureFrom_airport_iata,legs1_segments3_departureFrom_airport_iata,legs0_segments0_arrivalTo_airport_iata,legs1_segments0_arrivalTo_airport_iata,searchRoute
i64,i64,str,i64,f64,f64,str,str,str,str,str,str,f64,str,str,str,null,str,str,str,null,str,str,str
18144679,62840,"""c9373e5f772e43d593dd6ad2fa90f6…",0,9818.0,1018.0,"""02:30:00""","""02:45:00""","""2024-12-19T06:50:00""","""2024-12-19T11:20:00""","""2024-12-21T21:10:00""","""2024-12-21T21:55:00""",1.0,"""SVO""","""missing""","""missing""",,"""SVX""","""missing""","""missing""",,"""SVX""","""SVO""","""MOWSVX/SVXMOW"""
18144680,62840,"""c9373e5f772e43d593dd6ad2fa90f6…",0,14018.0,1018.0,"""02:30:00""","""02:45:00""","""2024-12-19T06:50:00""","""2024-12-19T11:20:00""","""2024-12-21T21:10:00""","""2024-12-21T21:55:00""",1.0,"""SVO""","""missing""","""missing""",,"""SVX""","""missing""","""missing""",,"""SVX""","""SVO""","""MOWSVX/SVXMOW"""
18144681,62840,"""c9373e5f772e43d593dd6ad2fa90f6…",0,22418.0,1018.0,"""02:30:00""","""02:45:00""","""2024-12-19T06:50:00""","""2024-12-19T11:20:00""","""2024-12-21T21:10:00""","""2024-12-21T21:55:00""",1.0,"""SVO""","""missing""","""missing""",,"""SVX""","""missing""","""missing""",,"""SVX""","""SVO""","""MOWSVX/SVXMOW"""
18144682,62840,"""c9373e5f772e43d593dd6ad2fa90f6…",0,12974.0,3284.0,"""02:20:00""","""02:35:00""","""2024-12-19T08:25:00""","""2024-12-19T12:45:00""","""2024-12-21T12:00:00""","""2024-12-21T12:35:00""",1.0,"""DME""","""missing""","""missing""",,"""SVX""","""missing""","""missing""",,"""SVX""","""DME""","""MOWSVX/SVXMOW"""
18144683,62840,"""c9373e5f772e43d593dd6ad2fa90f6…",0,16974.0,3284.0,"""02:20:00""","""02:35:00""","""2024-12-19T08:25:00""","""2024-12-19T12:45:00""","""2024-12-21T12:00:00""","""2024-12-21T12:35:00""",1.0,"""DME""","""missing""","""missing""",,"""SVX""","""missing""","""missing""",,"""SVX""","""DME""","""MOWSVX/SVXMOW"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
25043143,57320,"""c5622e0de0594bde95a4dd8c1fcff7…",0,16486.0,566.0,"""02:45:00""","""missing""","""2025-01-08T09:05:00""","""2025-01-08T12:50:00""","""missing""","""missing""",1.0,"""SVO""","""missing""","""missing""",,"""missing""","""missing""","""missing""",,"""ASF""","""missing""","""MOWASF"""
25043144,57320,"""c5622e0de0594bde95a4dd8c1fcff7…",0,11701.0,566.0,"""02:45:00""","""missing""","""2025-01-08T21:25:00""","""2025-01-09T01:10:00""","""missing""","""missing""",1.0,"""SVO""","""missing""","""missing""",,"""missing""","""missing""","""missing""",,"""ASF""","""missing""","""MOWASF"""
25043145,57320,"""c5622e0de0594bde95a4dd8c1fcff7…",0,16486.0,566.0,"""02:45:00""","""missing""","""2025-01-08T21:25:00""","""2025-01-09T01:10:00""","""missing""","""missing""",1.0,"""SVO""","""missing""","""missing""",,"""missing""","""missing""","""missing""",,"""ASF""","""missing""","""MOWASF"""
25043146,57320,"""c5622e0de0594bde95a4dd8c1fcff7…",0,14431.0,566.0,"""02:40:00""","""missing""","""2025-01-08T15:10:00""","""2025-01-08T18:50:00""","""missing""","""missing""",1.0,"""SVO""","""missing""","""missing""",,"""missing""","""missing""","""missing""",,"""ASF""","""missing""","""MOWASF"""


In [18]:
import json

with open("data/nearest_neigbor/unseen_neighbors_companyID.json", "r") as f:
    unseen_neighbors_companyID_dict = json.load(f)
with open("data/min_20/try/extra_features/transform_dict_companyID_searchRoute.pkl", "rb") as f:
    config = pickle.load(f)
test_new, transform_dict = build_company_loo_features(test_filled,
                                                       output_dir= "data/min_20/try/extra_features/test",
                                                       unseen_data_neighbors_companyID_dict=unseen_neighbors_companyID_dict,
                                                       transform_dict=config)

✅ Label Encoding 完成 (僅 searchRoute)


處理 unseen companyID: 100%|██████████| 41/41 [00:01<00:00, 28.74it/s]


✅ 已儲存 features: data/min_20/try/extra_features/test/12_companyID_searchRoute_features.parquet


In [11]:
import polars as pl

# 將 dict 轉為 polars DataFrame
company_occurrences_df = pl.DataFrame(config["company_occurrences"])

# 正確查詢方式（使用整數）
target_company_id = 63461
result = company_occurrences_df.filter(pl.col("companyID") == target_company_id)

print(result)


shape: (1, 2)
┌───────────┬───────────────────┐
│ companyID ┆ total_occurrences │
│ ---       ┆ ---               │
│ i64       ┆ i64               │
╞═══════════╪═══════════════════╡
│ 63461     ┆ 27                │
└───────────┴───────────────────┘


In [21]:
test_new
test_new.filter(test_new["Id"]== '18969384')

Id,companyID_loo_mean_totalPrice,companyID_loo_mean_taxes,companyID_loo_mean_legs0_duration,companyID_loo_mean_legs1_duration,companyID_loo_mean_cabin_class,companyID_loo_mean_total_num_transfers,companyID_loo_mean_legs0_departureAt_hour,companyID_loo_mean_legs0_arrivalAt_hour,companyID_loo_mean_legs1_departureAt_hour,companyID_loo_mean_legs1_arrivalAt_hour,companyID_loo_selected_count,companyID_total_occurrences
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
18969384,28823.698957,1315.63328,271.352621,225.265098,1.059456,0.669333,11.444543,12.806654,11.483799,8.786299,49,10


In [20]:
test_filled.filter(test_filled["companyID"]== '63196')

Id,companyID,ranker_id,selected,totalPrice,taxes,legs0_duration,legs1_duration,legs0_departureAt,legs0_arrivalAt,legs1_departureAt,legs1_arrivalAt,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments1_departureFrom_airport_iata,legs0_segments2_departureFrom_airport_iata,legs0_segments3_departureFrom_airport_iata,legs1_segments0_departureFrom_airport_iata,legs1_segments1_departureFrom_airport_iata,legs1_segments2_departureFrom_airport_iata,legs1_segments3_departureFrom_airport_iata,legs0_segments0_arrivalTo_airport_iata,legs1_segments0_arrivalTo_airport_iata,searchRoute
i64,i64,str,i64,f64,f64,str,str,str,str,str,str,f64,str,str,str,null,str,str,str,null,str,str,str
18969384,63196,"""eb358549171641e48eaf46f03fa305…",0,24914.0,1554.0,"""12:45:00""","""04:10:00""","""2024-11-14T12:25:00""","""2024-11-14T21:10:00""","""2024-11-16T23:05:00""","""2024-11-17T07:15:00""",1.0,"""TOF""","""OVB""","""missing""",,"""DME""","""missing""","""missing""",,"""OVB""","""TOF""","""TOFMOW/MOWTOF"""
18969385,63196,"""eb358549171641e48eaf46f03fa305…",0,33854.0,1554.0,"""12:45:00""","""04:10:00""","""2024-11-14T12:25:00""","""2024-11-14T21:10:00""","""2024-11-16T23:05:00""","""2024-11-17T07:15:00""",1.0,"""TOF""","""OVB""","""missing""",,"""DME""","""missing""","""missing""",,"""OVB""","""TOF""","""TOFMOW/MOWTOF"""
18969386,63196,"""eb358549171641e48eaf46f03fa305…",0,59174.0,1554.0,"""12:45:00""","""04:10:00""","""2024-11-14T12:25:00""","""2024-11-14T21:10:00""","""2024-11-16T23:05:00""","""2024-11-17T07:15:00""",1.0,"""TOF""","""OVB""","""missing""",,"""DME""","""missing""","""missing""",,"""OVB""","""TOF""","""TOFMOW/MOWTOF"""
18969387,63196,"""eb358549171641e48eaf46f03fa305…",0,38849.0,714.0,"""04:35:00""","""04:20:00""","""2024-11-14T09:00:00""","""2024-11-14T09:35:00""","""2024-11-16T09:20:00""","""2024-11-16T17:40:00""",1.0,"""TOF""","""missing""","""missing""",,"""SVO""","""missing""","""missing""",,"""SVO""","""TOF""","""TOFMOW/MOWTOF"""
18969388,63196,"""eb358549171641e48eaf46f03fa305…",0,48049.0,714.0,"""04:35:00""","""04:20:00""","""2024-11-14T09:00:00""","""2024-11-14T09:35:00""","""2024-11-16T09:20:00""","""2024-11-16T17:40:00""",1.0,"""TOF""","""missing""","""missing""",,"""SVO""","""missing""","""missing""",,"""SVO""","""TOF""","""TOFMOW/MOWTOF"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
18969414,63196,"""eb358549171641e48eaf46f03fa305…",0,32489.0,974.0,"""16:50:00""","""23:30:00""","""2024-11-14T06:35:00""","""2024-11-14T19:25:00""","""2024-11-16T02:25:00""","""2024-11-17T05:55:00""",1.0,"""TOF""","""LED""","""missing""",,"""SVO""","""SVX""","""missing""",,"""LED""","""SVX""","""TOFMOW/MOWTOF"""
18969415,63196,"""eb358549171641e48eaf46f03fa305…",0,32283.0,2530.0,"""08:35:00""","""2.04:05:00""","""2024-11-14T15:00:00""","""2024-11-14T19:35:00""","""2024-11-16T08:10:00""","""2024-11-18T16:15:00""",1.0,"""TOF""","""SGC""","""missing""",,"""ZIA""","""KUF""","""missing""",,"""SGC""","""KUF""","""TOFMOW/MOWTOF"""
18969416,63196,"""eb358549171641e48eaf46f03fa305…",0,30025.0,2202.0,"""09:45:00""","""2.04:05:00""","""2024-11-14T06:50:00""","""2024-11-14T12:35:00""","""2024-11-16T08:10:00""","""2024-11-18T16:15:00""",1.0,"""TOF""","""SVX""","""missing""",,"""ZIA""","""KUF""","""missing""",,"""SVX""","""KUF""","""TOFMOW/MOWTOF"""
18969417,63196,"""eb358549171641e48eaf46f03fa305…",0,30610.0,897.0,"""12:25:00""","""2.04:05:00""","""2024-11-14T06:35:00""","""2024-11-14T15:00:00""","""2024-11-16T08:10:00""","""2024-11-18T16:15:00""",1.0,"""TOF""","""LED""","""missing""",,"""ZIA""","""KUF""","""missing""",,"""LED""","""KUF""","""TOFMOW/MOWTOF"""


In [None]:
import json

with open("data/nearest_neigbor/unseen_neighbors_companyID.json", "r") as f:
    unseen_neighbors_companyID_dict = json.load(f)
unseen_neighbors_companyID_dict


{'63196': [59457, 61200, 63461],
 '59487': [58395,
  61822,
  61607,
  59277,
  47350,
  62441,
  43019,
  63231,
  47405,
  57311,
  26387],
 '59818': [54575, 57922, 61177],
 '61751': [61874, 60834, 54273],
 '62098': [45683, 62134, 51331],
 '59070': [54257, 54169, 54294],
 '63243': [61836, 62452, 61724, 62105, 54402],
 '62020': [58365, 60545],
 '62871': [58137, 60771, 53598],
 '61818': [28235, 40069, 59090, 47405, 47350],
 '61654': [63420],
 '61874': [61751, 60834, 54273],
 '16880': [62052, 63273],
 '61822': [63231, 51287],
 '62368': [60241, 58356, 62024, 24217],
 '59090': [61818, 28235, 28222],
 '62459': [62279, 62409, 58059],
 '59277': [26387],
 '62463': [60657, 51446, 28348, 45774, 27153],
 '60219': [57260, 25691, 57734],
 '62794': [57869, 60136],
 '60288': [62340, 54439, 63362, 59847],
 '43019': [47350, 47405, 40069, 28235],
 '62029': [62273],
 '62110': [60555, 27094],
 '61579': [60410, 63332],
 '60838': [62382,
  61173,
  61129,
  61090,
  61214,
  61169,
  61147,
  58261,
  5466

# A make companyID

In [5]:
from scripts.feature_enigeer import make_companyID_into_features

_, transform_dict = make_companyID_into_features(
    df=train_filled,
    output_dir="data/extra_features/train/",
)

✅ 已儲存 transform_dict: data/extra_features/train/12_companyID_into_features.parquet
✅ 已儲存 transform_dict: data/extra_features/train/transform_dict_companyID.pkl


In [16]:
from scripts.feature_enigeer import make_companyID_into_features

with open("data/extra_features/transform_dict_companyID.pkl", "rb") as f:
    config = pickle.load(f)
    
# 呼叫 enrich_flight_view_features
test_filled, _ = make_companyID_into_features(
    test_filled,
    output_dir="data/extra_features/test/",
    transform_dict=config
)

✅ 已儲存 transform_dict: data/extra_features/test/12_companyID_into_features.parquet


# Add cluster features

In [None]:
from scripts.feature_enigeer import build_cluster_transform_dict
transform_dict = build_cluster_transform_dict(
    transform_path="data/extra_features/transform_dict_companyID.pkl",
    output_path="data/extra_features/transform_dict_cluster.pkl",
    k=3
)

In [None]:
from scripts.feature_enigeer import add_cluster_features_and_save


needed_cols = ["Id","companyID"]
# 篩選
train_filled = train_filled.select([c for c in needed_cols if c in train_filled.columns])


df_with_cluster = add_cluster_features_and_save(
    df=train_filled,
    transform_dict_path="data/extra_features/transform_dict_cluster.pkl",
    output_dir="data/extra_features/train"
)


In [3]:
from scripts.feature_enigeer import add_cluster_features_and_save


needed_cols = ["Id","companyID"]
# 篩選
test_filled = test_filled.select([c for c in needed_cols if c in test_filled.columns])


df_with_cluster = add_cluster_features_and_save(
    df=test_filled,
    transform_dict_path="data/extra_features/transform_dict_cluster.pkl",
    output_dir="data/extra_features/test"
)


✅ 已儲存 cluster features: data/extra_features/test/13_cluster_features.parquet
shape: (5, 18)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ Id       ┆ companyID ┆ cluster_l ┆ totalPric ┆ … ┆ mode_cabi ┆ mode_has_ ┆ mode_tran ┆ total_occ │
│ ---      ┆ ---       ┆ abel      ┆ e_mean_me ┆   ┆ n_class_m ┆ transfer_ ┆ sfer_num_ ┆ urrences_ │
│ i64      ┆ i64       ┆ ---       ┆ an        ┆   ┆ ean       ┆ mean      ┆ mean      ┆ mean      │
│          ┆           ┆ i64       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆           ┆           ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 18144679 ┆ 62840     ┆ 0         ┆ 31749.003 ┆ … ┆ 1.036313  ┆ 0.0       ┆ 0.0       ┆ 25450.321 │
│          ┆           ┆           ┆ 054       ┆   ┆           ┆           ┆           ┆ 229       │

# remove time

In [5]:
from scripts.feature_enigeer import merge_original_with_extra_features, clean_fill_and_cast_columns,drop_constant_numeric_columns, replace_group_minmax_for_rankish_features

merged_df = merge_original_with_extra_features(
    base_parquet_path="data/train.parquet",
    extra_features_dir="data/extra_features/train"
)


# 如果要儲存
merged_df  = clean_fill_and_cast_columns(merged_df)
# merged_df = drop_constant_numeric_columns(merged_df,0.99)
# 直接覆寫 merged_df 中所有「欄名含 rank」的特徵為 group-wise Min–Max 值
# merged_df = replace_group_minmax_for_rankish_features(
#     merged_df, group_col="ranker_id", id_col="Id", constant_fill=0.5
# )
merged_df.write_parquet("data/train_with_companyID_features.parquet")

✅ 讀取原始資料: data/train.parquet
✅ 共找到 12 個 Parquet 要合併
🔹 合併第 1/12 個: data/extra_features/train\1_price_features.parquet
🔹 合併第 2/12 個: data/extra_features/train\2_duration_features.parquet
⚠️ 10 個特徵將被新檔案覆蓋: ['legs0_segments0_duration', 'legs0_duration', 'legs1_segments0_duration', 'legs0_segments3_duration', 'legs1_segments3_duration', 'legs1_segments2_duration', 'legs1_segments1_duration', 'legs0_segments1_duration', 'legs1_duration', 'legs0_segments2_duration']
🔹 合併第 3/12 個: data/extra_features/train\3_frequent_flyer_features.parquet
⚠️ 41 個特徵將被新檔案覆蓋: ['legs0_segments0_operatingCarrier_code', 'legs0_duration_rank', 'total_duration_rank', 'legs1_segments0_marketingCarrier_code', 'isVip', 'legs0_segments0_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code', 'total_duration', 'legs0_segments0_duration', 'legs1_segments3_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'legs0_duration', 'legs1_segments0_duration', 'legs1_segments2_duration_rank', 'legs0_segments3_

In [4]:
from scripts.feature_enigeer import merge_original_with_extra_features,clean_fill_and_cast_columns,replace_group_minmax_for_rankish_features
merged_df = merge_original_with_extra_features(
    base_parquet_path="data/test.parquet",
    extra_features_dir="data/extra_features/test")

# 如果要儲存
merged_df  = clean_fill_and_cast_columns(merged_df)
# merged_df = merged_df.drop(['bySelf', 'legs0_segments2_baggageAllowance_quantity', 'legs0_segments2_baggageAllowance_weightMeasurementType', 'legs0_segments3_baggageAllowance_quantity', 'legs0_segments3_baggageAllowance_weightMeasurementType', 'legs0_segments3_seatsAvailable', 'legs1_segments2_baggageAllowance_quantity', 'legs1_segments2_baggageAllowance_weightMeasurementType', 'legs1_segments2_seatsAvailable', 'legs1_segments3_baggageAllowance_quantity', 'legs1_segments3_baggageAllowance_weightMeasurementType', 'legs1_segments3_seatsAvailable', 'miniRules0_percentage', 'miniRules1_percentage', 'pricingInfo_passengerCount', 'selected', 'legs0_segments2_marketingCarrier_code_in_ff', 'legs0_segments2_operatingCarrier_code_in_ff', 'legs0_segments3_marketingCarrier_code_in_ff', 'legs0_segments3_operatingCarrier_code_in_ff', 'legs1_segments2_marketingCarrier_code_in_ff', 'legs1_segments2_operatingCarrier_code_in_ff', 'legs1_segments3_marketingCarrier_code_in_ff', 'legs1_segments3_operatingCarrier_code_in_ff', 'legs0_segments3_duration_rank', 'legs1_segments2_duration_rank', 'legs1_segments3_duration_rank', 'legs0_segments3_cabinClass', 'legs1_segments2_cabinClass', 'legs1_segments3_cabinClass', 'legs0_segments3_duration', 'legs1_segments2_duration', 'legs1_segments3_duration', 'has_corporate_tariff', 'legs0_segments3_arrivalTo_airport_city_iata', 'legs0_segments3_arrivalTo_airport_iata', 'legs0_segments3_departureFrom_airport_iata', 'legs1_segments2_arrivalTo_airport_city_iata', 'legs1_segments2_arrivalTo_airport_iata', 'legs1_segments2_departureFrom_airport_iata', 'legs1_segments3_arrivalTo_airport_city_iata', 'legs1_segments3_arrivalTo_airport_iata', 'legs1_segments3_departureFrom_airport_iata', 'legs0_segments3_marketingCarrier_code', 'legs0_segments3_operatingCarrier_code', 'legs1_segments2_marketingCarrier_code', 'legs1_segments2_operatingCarrier_code', 'legs1_segments3_marketingCarrier_code', 'legs1_segments3_operatingCarrier_code', 'legs0_segments3_aircraft_code', 'legs1_segments2_aircraft_code', 'legs1_segments3_aircraft_code', 'legs0_segments3_flightNumber', 'legs1_segments2_flightNumber', 'legs1_segments3_flightNumber'])
# 直接覆寫 merged_df 中所有「欄名含 rank」的特徵為 group-wise Min–Max 值
# merged_df = replace_group_minmax_for_rankish_features(
#     merged_df, group_col="ranker_id", id_col="Id", constant_fill=0.5
# )

merged_df.write_parquet("data/test_with_companyID_features.parquet")

✅ 讀取原始資料: data/test.parquet
✅ 共找到 12 個 Parquet 要合併
🔹 合併第 1/12 個: data/extra_features/test\1_price_features.parquet
🔹 合併第 2/12 個: data/extra_features/test\2_duration_features.parquet
⚠️ 10 個特徵將被新檔案覆蓋: ['legs0_segments0_duration', 'legs0_duration', 'legs1_segments0_duration', 'legs0_segments3_duration', 'legs1_segments3_duration', 'legs1_segments2_duration', 'legs1_segments1_duration', 'legs0_segments1_duration', 'legs1_duration', 'legs0_segments2_duration']
🔹 合併第 3/12 個: data/extra_features/test\3_frequent_flyer_features.parquet
⚠️ 41 個特徵將被新檔案覆蓋: ['legs0_segments0_operatingCarrier_code', 'legs0_duration_rank', 'total_duration_rank', 'legs1_segments0_marketingCarrier_code', 'isVip', 'legs0_segments0_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code', 'total_duration', 'legs0_segments0_duration', 'legs1_segments3_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'legs0_duration', 'legs1_segments0_duration', 'legs1_segments2_duration_rank', 'legs0_segments3_mark

In [3]:
# 找字串欄
import polars as pl
str_cols = [c for c in merged_df.columns if merged_df[c].dtype in (pl.Utf8, pl.String)]
# 找數值欄
numeric_cols = [c for c in merged_df.columns if merged_df[c].dtype.is_numeric()]
# 找布林欄
bool_cols = [c for c in merged_df.columns if merged_df[c].dtype == pl.Boolean]

print(f"✅ 共找到 {len(str_cols)} 個字串欄位")
print(f"✅ 共找到 {len(numeric_cols)} 個數值欄位")
print(f"✅ 共找到 {len(bool_cols)} 個布林欄位")
known_cols = set(str_cols + numeric_cols + bool_cols)
other_cols = [c for c in merged_df.columns if c not in known_cols]

print(f"🔍 尚未分類的欄位共有 {len(other_cols)} 個：")
print(other_cols)



✅ 共找到 6 個字串欄位
✅ 共找到 279 個數值欄位
✅ 共找到 0 個布林欄位
🔍 尚未分類的欄位共有 1 個：
['requestDate']


In [3]:
import polars as pl

# 讀 parquet
test = pl.read_parquet('data/Try only companyID/test_filled.parquet')
train = pl.read_parquet('data/Try only companyID/train_filled.parquet')

# 比較欄位
train_cols = set(train.columns)
test_cols = set(test.columns)

print("✅ Train columns:", len(train_cols))
print("✅ Test columns:", len(test_cols))

print("\n🎯 Train 中有但 Test 沒有的欄位:")
print(sorted(train_cols - test_cols))

print("\n🎯 Test 中有但 Train 沒有的欄位:")
print(sorted(test_cols - train_cols))

if train_cols == test_cols:
    print("\n✅ Train 和 Test 欄位完全一致！")

# 比較 companyID
train_company_ids = set(train["companyID"].unique().to_list())
test_company_ids = set(test["companyID"].unique().to_list())

only_in_test = sorted(test_company_ids - train_company_ids)

print("\n🎯 Test 中有但 Train 沒有的 companyID:")
print(only_in_test)

print(f"\n✅ Test unique companyID: {len(test_company_ids)}")
print(f"✅ Train unique companyID: {len(train_company_ids)}")
print(f"✅ Test 但不在 Train 的 companyID 數量: {len(only_in_test)}")


✅ Train columns: 288
✅ Test columns: 290

🎯 Train 中有但 Test 沒有的欄位:
['selected']

🎯 Test 中有但 Train 沒有的欄位:
['companyID_mode_cabin_class', 'companyID_mode_has_transfer', 'companyID_mode_transfer_num']

🎯 Test 中有但 Train 沒有的 companyID:
[16880, 37962, 43019, 45683, 46743, 52514, 54908, 57310, 59070, 59090, 59277, 59487, 59818, 60219, 60288, 60602, 60838, 61579, 61607, 61627, 61641, 61654, 61709, 61724, 61751, 61810, 61818, 61822, 61832, 61874, 62020, 62029, 62098, 62110, 62368, 62459, 62463, 62794, 62871, 63196, 63243]

✅ Test unique companyID: 495
✅ Train unique companyID: 641
✅ Test 但不在 Train 的 companyID 數量: 41
