In [1]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import time
import xgboost as xgb

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [2]:
# Load data
train_filled = pl.read_parquet('data/train.parquet').drop('__index_level_0__')
from scripts.feature_enigeer import clean_fill_and_cast_columns
train_filled = clean_fill_and_cast_columns(train_filled)

✅ 共找到 73 個字串欄位
✅ 共找到 48 個數值欄位
✅ 共找到 4 個布林欄位
✅ 已完成空字串處理、缺失補值、布林轉0/1


In [2]:
# Load data
test_filled = pl.read_parquet('data/test.parquet').drop('__index_level_0__').with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
from scripts.feature_enigeer import clean_fill_and_cast_columns


test_filled = clean_fill_and_cast_columns(test_filled, test= True)

✅ 共找到 57 個字串欄位
✅ 共找到 48 個數值欄位
✅ 共找到 4 個布林欄位
✅ test=True: 已將 10 個duration欄位轉str並填'missing'
✅ 已完成空字串處理、缺失補值、布林轉0/1


# add view time

In [5]:
needed_cols

['legs0_segments0_marketingCarrier_code',
 'legs1_segments1_arrivalTo_airport_iata',
 'legs0_segments2_arrivalTo_airport_iata',
 'legs1_segments0_marketingCarrier_code',
 'legs1_segments2_duration',
 'legs1_segments0_operatingCarrier_code',
 'legs0_duration',
 'legs0_segments1_arrivalTo_airport_iata',
 'legs1_duration',
 'Id',
 'legs0_segments3_duration',
 'legs1_segments2_arrivalTo_airport_iata',
 'legs0_segments0_arrivalTo_airport_iata',
 'legs0_segments0_operatingCarrier_code',
 'legs0_segments0_departureFrom_airport_iata',
 'frequentFlyer',
 'legs0_segments3_departureFrom_airport_iata',
 'legs1_segments1_marketingCarrier_code',
 'legs0_segments3_operatingCarrier_code',
 'legs1_segments0_duration',
 'legs0_segments2_duration',
 'legs1_segments3_duration',
 'legs1_segments1_operatingCarrier_code',
 'legs0_segments3_arrivalTo_airport_iata',
 'legs1_segments3_departureFrom_airport_iata',
 'legs0_segments2_marketingCarrier_code',
 'legs1_segments0_arrivalTo_airport_iata',
 'legs1_segmen

In [3]:

import os
import polars as pl
from typing import Optional
import json
def enrich_flight_view_features(
    df: pl.DataFrame,
    output_dir: Optional[str] = None,
    output_filename: str = "11_flight_view_features.parquet",
    transform_config: Optional[dict] = None
) -> tuple[pl.DataFrame, dict]:
    def make_leg_segment_keys(leg_prefix):
        keys = []
        for i in range(4):
            key_name = f"{leg_prefix}_segments{i}_key"
            dep = pl.col(f"{leg_prefix}_segments{i}_departureFrom_airport_iata").fill_null("missing")
            arr = pl.col(f"{leg_prefix}_segments{i}_arrivalTo_airport_iata").fill_null("missing")
            keys.append((dep + "-" + arr).alias(key_name))
        return keys

    df = df.with_columns(make_leg_segment_keys("legs0") + make_leg_segment_keys("legs1"))

    all_segments = [f"legs0_segments{i}_key" for i in range(4)] + [f"legs1_segments{i}_key" for i in range(4)]

    if transform_config is None:
        # 訓練模式:建立segment counts
        segment_counts = (
            df.melt(id_vars=[], value_vars=all_segments)
            .filter(pl.col("value") != "missing-missing")
            .group_by("value")
            .agg(pl.count().alias("segment_view_count"))
        )
        segment_counts_dict = segment_counts.to_dict(as_series=False)
    else:
        # 預測模式:使用已有segment_counts
        segment_counts = pl.DataFrame(transform_config["segment_counts"])

    for seg_col in all_segments:
        df = df.join(
            segment_counts,
            left_on=seg_col,
            right_on="value",
            how="left"
        ).with_columns(
            pl.col("segment_view_count").fill_null(0).alias(f"{seg_col}_view_count")
        ).drop("segment_view_count")

    def make_leg_full_key(leg_prefix):
        seg_keys = [f"{leg_prefix}_segments{i}_key" for i in range(4)]
        return pl.concat_str([pl.col(k) for k in seg_keys], separator="|").alias(f"{leg_prefix}_key")

    df = df.with_columns([
        make_leg_full_key("legs0"),
        make_leg_full_key("legs1"),
        (
            pl.concat_str([
                pl.concat_str([pl.col(f"legs0_segments{i}_key") for i in range(4)], separator="|"),
                pl.lit("||"),
                pl.concat_str([pl.col(f"legs1_segments{i}_key") for i in range(4)], separator="|"),
            ], separator="")
        ).alias("all_key")
    ])

    if transform_config is None:
        leg0_counts = df.group_by("legs0_key").agg(pl.count().alias("leg0_flight_view_count"))
        leg1_counts = df.group_by("legs1_key").agg(pl.count().alias("leg1_flight_view_count"))
        all_counts = df.group_by("all_key").agg(pl.count().alias("all_flight_view_count"))
        leg0_counts_dict = leg0_counts.to_dict(as_series=False)
        leg1_counts_dict = leg1_counts.to_dict(as_series=False)
        all_counts_dict = all_counts.to_dict(as_series=False)
    else:
        leg0_counts = pl.DataFrame(transform_config["leg0_counts"])
        leg1_counts = pl.DataFrame(transform_config["leg1_counts"])
        all_counts = pl.DataFrame(transform_config["all_counts"])

    df = df.join(leg0_counts, on="legs0_key", how="left")
    df = df.join(leg1_counts, on="legs1_key", how="left")
    df = df.join(all_counts, on="all_key", how="left")

    ranker_stats = df.group_by("ranker_id").agg([
        pl.max("leg0_flight_view_count").alias("leg0_view_max"),
        pl.max("leg1_flight_view_count").alias("leg1_view_max"),
        pl.max("all_flight_view_count").alias("all_view_max"),
    ])

    df = df.join(ranker_stats, on="ranker_id", how="left")

    df = df.with_columns([
        (pl.col("leg0_flight_view_count") / (pl.col("leg0_view_max") + 1e-5)).alias("leg0_view_norm"),
        (pl.col("leg1_flight_view_count") / (pl.col("leg1_view_max") + 1e-5)).alias("leg1_view_norm"),
        (pl.col("all_flight_view_count") / (pl.col("all_view_max") + 1e-5)).alias("all_view_norm"),
    ])

    ranker_stats_mean = df.group_by("ranker_id").agg([
        pl.mean("leg0_flight_view_count").alias("leg0_view_mean"),
        pl.mean("leg1_flight_view_count").alias("leg1_view_mean"),
        pl.mean("all_flight_view_count").alias("all_view_mean"),
    ])

    df = df.join(ranker_stats_mean, on="ranker_id", how="left")

    df = df.with_columns([
        (pl.col("leg0_flight_view_count") - pl.col("leg0_view_mean")).alias("leg0_view_diff_mean"),
        (pl.col("leg1_flight_view_count") - pl.col("leg1_view_mean")).alias("leg1_view_diff_mean"),
        (pl.col("all_flight_view_count") - pl.col("all_view_mean")).alias("all_view_diff_mean"),
    ])

    rank_features = [
        "leg0_flight_view_count",
        "leg1_flight_view_count",
        "all_flight_view_count",
    ] + [f"legs0_segments{i}_key_view_count" for i in range(4)] + [f"legs1_segments{i}_key_view_count" for i in range(4)]

    rank_exprs = []
    for col in rank_features:
        rank_exprs.append(
            pl.col(col).rank(method="dense").over("ranker_id").alias(f"{col}_rank")
        )

    df = df.with_columns(rank_exprs)



    output_config = None
    if transform_config is None:
        output_config = {
            "segment_counts": segment_counts_dict,
            "leg0_counts": leg0_counts_dict,
            "leg1_counts": leg1_counts_dict,
            "all_counts": all_counts_dict
        }
    columns_to_drop = [
        "leg0_view_max", "leg1_view_max", "all_view_max",
        "leg0_view_mean", "leg1_view_mean", "all_view_mean",
        "legs0_key", "legs1_key", "all_key"
    ] + [f"legs0_segments{i}_key" for i in range(4)] + [f"legs1_segments{i}_key" for i in range(4)]

    df = df.drop(columns_to_drop)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        df.write_parquet(os.path.join(output_dir, output_filename))
        if transform_config is None and output_config is not None:
            # 將output_config儲存成pickle
            config_path = os.path.join(output_dir, "transform_flight_view_key_config.pkl")
            with open(config_path, "wb") as f:
                pickle.dump(output_config, f)
                
    return df, output_config

In [7]:
import polars as pl
# 所需欄位 (補齊所有必要欄位)
needed_cols = [
    "Id",
    "ranker_id",
    # segment columns for legs0
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments0_arrivalTo_airport_iata",
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments1_arrivalTo_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments2_arrivalTo_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    "legs0_segments3_arrivalTo_airport_iata",
    # segment columns for legs1
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments0_arrivalTo_airport_iata",
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments1_arrivalTo_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments2_arrivalTo_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    "legs1_segments3_arrivalTo_airport_iata"
]



# 篩選
train_filled = train_filled.select([c for c in needed_cols if c in train_filled.columns])

# 呼叫 enrich_flight_view_features
train_filled, _ = enrich_flight_view_features(
    train_filled,
    output_dir="data/extra_features/train/",

)

NameError: name 'enrich_flight_view_features' is not defined

In [6]:
import polars as pl
# 所需欄位 (補齊所有必要欄位)
# 所需欄位 (補齊必要欄位，並去除重複)
needed_cols = [
    "Id",
    "ranker_id",
    # segment columns for legs0
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments0_arrivalTo_airport_iata",
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments1_arrivalTo_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments2_arrivalTo_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    "legs0_segments3_arrivalTo_airport_iata",
    # segment columns for legs1
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments0_arrivalTo_airport_iata",
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments1_arrivalTo_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments2_arrivalTo_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    "legs1_segments3_arrivalTo_airport_iata"
]



# 篩選
train_filled.select([c for c in needed_cols if c in train_filled.columns])


Id,ranker_id,legs0_segments0_departureFrom_airport_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments1_departureFrom_airport_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments2_departureFrom_airport_iata,legs0_segments2_arrivalTo_airport_iata,legs0_segments3_departureFrom_airport_iata,legs0_segments3_arrivalTo_airport_iata,legs1_segments0_departureFrom_airport_iata,legs1_segments0_arrivalTo_airport_iata,legs1_segments1_departureFrom_airport_iata,legs1_segments1_arrivalTo_airport_iata,legs1_segments2_departureFrom_airport_iata,legs1_segments2_arrivalTo_airport_iata,legs1_segments3_departureFrom_airport_iata,legs1_segments3_arrivalTo_airport_iata
i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
0,"""98ce0dabf6964640b63079fbafd42c…","""TLK""","""KJA""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""KJA""","""TLK""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing"""
1,"""98ce0dabf6964640b63079fbafd42c…","""TLK""","""OVB""","""OVB""","""KJA""","""missing""","""missing""","""missing""","""missing""","""KJA""","""OVB""","""OVB""","""TLK""","""missing""","""missing""","""missing""","""missing"""
2,"""98ce0dabf6964640b63079fbafd42c…","""TLK""","""OVB""","""OVB""","""KJA""","""missing""","""missing""","""missing""","""missing""","""KJA""","""OVB""","""OVB""","""TLK""","""missing""","""missing""","""missing""","""missing"""
3,"""98ce0dabf6964640b63079fbafd42c…","""TLK""","""OVB""","""OVB""","""KJA""","""missing""","""missing""","""missing""","""missing""","""KJA""","""OVB""","""OVB""","""TLK""","""missing""","""missing""","""missing""","""missing"""
4,"""98ce0dabf6964640b63079fbafd42c…","""TLK""","""OVB""","""OVB""","""KJA""","""missing""","""missing""","""missing""","""missing""","""KJA""","""OVB""","""OVB""","""TLK""","""missing""","""missing""","""missing""","""missing"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
18146427,"""88f8c53a28bf4f438941fd67338009…","""VKO""","""SGC""","""SGC""","""SVX""","""missing""","""missing""","""missing""","""missing""","""SVX""","""UFA""","""UFA""","""VKO""","""missing""","""missing""","""missing""","""missing"""
18146428,"""88f8c53a28bf4f438941fd67338009…","""VKO""","""UFA""","""UFA""","""SVX""","""missing""","""missing""","""missing""","""missing""","""SVX""","""SGC""","""SGC""","""VKO""","""missing""","""missing""","""missing""","""missing"""
18146429,"""88f8c53a28bf4f438941fd67338009…","""VKO""","""UFA""","""UFA""","""SVX""","""missing""","""missing""","""missing""","""missing""","""SVX""","""UFA""","""UFA""","""VKO""","""missing""","""missing""","""missing""","""missing"""
18146430,"""88f8c53a28bf4f438941fd67338009…","""VKO""","""KUF""","""KUF""","""SVX""","""missing""","""missing""","""missing""","""missing""","""SVX""","""SGC""","""SGC""","""VKO""","""missing""","""missing""","""missing""","""missing"""


In [4]:
train_filled

Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,legs1_segments2_baggageAllowance_quantity,legs1_segments2_baggageAllowance_weightMeasurementType,legs1_segments2_cabinClass,legs1_segments2_departureFrom_airport_iata,legs1_segments2_duration,legs1_segments2_flightNumber,legs1_segments2_marketingCarrier_code,legs1_segments2_operatingCarrier_code,legs1_segments2_seatsAvailable,legs1_segments3_aircraft_code,legs1_segments3_arrivalTo_airport_city_iata,legs1_segments3_arrivalTo_airport_iata,legs1_segments3_baggageAllowance_quantity,legs1_segments3_baggageAllowance_weightMeasurementType,legs1_segments3_cabinClass,legs1_segments3_departureFrom_airport_iata,legs1_segments3_duration,legs1_segments3_flightNumber,legs1_segments3_marketingCarrier_code,legs1_segments3_operatingCarrier_code,legs1_segments3_seatsAvailable,miniRules0_monetaryAmount,miniRules0_percentage,miniRules0_statusInfos,miniRules1_monetaryAmount,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_isAccessTP,pricingInfo_passengerCount,profileId,ranker_id,requestDate,searchRoute,sex,taxes,totalPrice,selected
i64,i8,i64,i64,str,i64,i8,i8,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,f64,str,str,str,f64,f64,f64,str,str,str,str,str,f64,str,str,…,f64,f64,f64,str,str,str,str,str,f64,str,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,datetime[ns],str,i8,f64,f64,i64
0,1,57323,0,"""S7/SU/UT""",36,0,0,"""2024-06-15T16:20:00""","""2024-06-15T15:40:00""","""02:40:00""","""YK2""","""KJA""","""KJA""",1.0,0.0,1.0,"""TLK""","""02:40:00""","""216""","""KV""","""KV""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",1,370.0,16884.0,1
1,1,57323,123,"""S7/SU/UT""",36,1,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,2300.0,0.0,1.0,3500.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",1,2240.0,51125.0,0
2,1,57323,0,"""S7/SU/UT""",36,0,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,2300.0,0.0,1.0,3500.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",1,2240.0,53695.0,0
3,1,57323,123,"""S7/SU/UT""",36,1,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",1,2240.0,81880.0,0
4,1,57323,0,"""S7/SU/UT""",36,0,0,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""07:25:00""","""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""","""02:50:00""","""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""","""01:20:00""","""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",1,2240.0,86070.0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
18146427,1,54154,44,"""missing""",36,1,0,"""2024-11-06T09:45:00""","""2024-11-05T20:50:00""","""10:55:00""","""738""","""SGC""","""SGC""",0.0,0.0,1.0,"""VKO""","""03:20:00""","""247""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""SGC""","""02:30:00""","""111""","""UT""","""UT""",5.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",1,5560.0,30730.0,0
18146428,1,54154,44,"""missing""",36,1,0,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""","""17:40:00""","""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""","""02:05:00""","""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""","""01:30:00""","""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",1,5560.0,27660.0,0
18146429,1,54154,44,"""missing""",36,1,0,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""","""17:40:00""","""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""","""02:05:00""","""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""","""01:30:00""","""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",1,5460.0,24460.0,0
18146430,1,54154,44,"""missing""",36,1,0,"""2024-11-06T17:10:00""","""2024-11-05T19:30:00""","""19:40:00""","""738""","""KUF""","""KUF""",0.0,0.0,1.0,"""VKO""","""01:45:00""","""357""","""UT""","""UT""",4.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""KUF""","""02:05:00""","""282""","""UT""","""UT""",2.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""","""missing""","""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",1,5560.0,25360.0,0


In [7]:
import polars as pl

# 動態建立每個 segment key（包含0~3）
def make_leg_segment_keys(leg_prefix):
    keys = []
    for i in range(4):
        key_name = f"{leg_prefix}_segments{i}_key"
        dep = pl.col(f"{leg_prefix}_segments{i}_departureFrom_airport_iata").fill_null("missing")
        arr = pl.col(f"{leg_prefix}_segments{i}_arrivalTo_airport_iata").fill_null("missing")
        keys.append((dep + "-" + arr).alias(key_name))
    return keys

train_filled = train_filled.with_columns(
    make_leg_segment_keys("legs0") + make_leg_segment_keys("legs1")
)

all_segments = []
for i in range(4):
    all_segments.append(f"legs0_segments{i}_key")
    all_segments.append(f"legs1_segments{i}_key")

segment_counts = (
    train_filled.melt(id_vars=[], value_vars=all_segments)
    .filter(pl.col("value") != "missing-missing")
    .group_by("value")
    .agg(pl.count().alias("segment_view_count"))
)

for seg_col in all_segments:
    train_filled = train_filled.join(
        segment_counts,
        left_on=seg_col,
        right_on="value",
        how="left"
    ).with_columns(
        pl.col("segment_view_count").fill_null(0).alias(f"{seg_col}_view_count")
    ).drop("segment_view_count")


def make_leg_full_key(leg_prefix):
    seg_keys = [f"{leg_prefix}_segments{i}_key" for i in range(4)]
    return pl.concat_str([pl.col(k) for k in seg_keys], separator="|").alias(f"{leg_prefix}_key")

train_filled = train_filled.with_columns([
    make_leg_full_key("legs0"),
    make_leg_full_key("legs1")
])

train_filled = train_filled.with_columns([
    (pl.col("legs0_key") + "||" + pl.col("legs1_key")).alias("all_key")
])

leg0_counts = train_filled.group_by("legs0_key").agg(pl.count().alias("leg0_flight_view_count"))
leg1_counts = train_filled.group_by("legs1_key").agg(pl.count().alias("leg1_flight_view_count"))
all_counts = train_filled.group_by("all_key").agg(pl.count().alias("all_flight_view_count"))

train_filled = train_filled.join(leg0_counts, on="legs0_key", how="left")
train_filled = train_filled.join(leg1_counts, on="legs1_key", how="left")
train_filled = train_filled.join(all_counts, on="all_key", how="left")

# 加上 group by ranker_id normalized features
ranker_stats = train_filled.group_by("ranker_id").agg([
    pl.max("leg0_flight_view_count").alias("leg0_view_max"),
    pl.max("leg1_flight_view_count").alias("leg1_view_max"),
    pl.max("all_flight_view_count").alias("all_view_max"),
])

train_filled = train_filled.join(ranker_stats, on="ranker_id", how="left")

train_filled = train_filled.with_columns([
    (pl.col("leg0_flight_view_count") / (pl.col("leg0_view_max") + 1e-5)).alias("leg0_view_norm"),
    (pl.col("leg1_flight_view_count") / (pl.col("leg1_view_max") + 1e-5)).alias("leg1_view_norm"),
    (pl.col("all_flight_view_count") / (pl.col("all_view_max") + 1e-5)).alias("all_view_norm"),
    
])

# 新增 group by ranker_id 的 mean 統計
ranker_stats_mean = train_filled.group_by("ranker_id").agg([
    pl.mean("leg0_flight_view_count").alias("leg0_view_mean"),
    pl.mean("leg1_flight_view_count").alias("leg1_view_mean"),
    pl.mean("all_flight_view_count").alias("all_view_mean"),
])

train_filled = train_filled.join(ranker_stats_mean, on="ranker_id", how="left")

# 加上 normalized 百分比以及與 mean 差距特徵
train_filled = train_filled.with_columns([
    # 與 mean 差距
    (pl.col("leg0_flight_view_count") - pl.col("leg0_view_mean")).alias("leg0_view_diff_mean"),
    (pl.col("leg1_flight_view_count") - pl.col("leg1_view_mean")).alias("leg1_view_diff_mean"),
    (pl.col("all_flight_view_count") - pl.col("all_view_mean")).alias("all_view_diff_mean"),
])

# 最後把多餘中間欄位 drop 掉
columns_to_drop = [
    "leg0_view_max", "leg1_view_max", "all_view_max",
    "leg0_view_mean", "leg1_view_mean", "all_view_mean",
    "legs0_key", "legs1_key", "all_key"
] + [f"legs0_segments{i}_key" for i in range(4)] + [f"legs1_segments{i}_key" for i in range(4)]


train_filled = train_filled.drop(columns_to_drop)


# 找出你要改成rank的特徵欄位
rank_features = [
    "leg0_flight_view_count",
    "leg1_flight_view_count",
    "all_flight_view_count",
] + [f"legs0_segments{i}_key_view_count" for i in range(4)] + [f"legs1_segments{i}_key_view_count" for i in range(4)]

# 對每個 ranker_id group 計算 rank (數字越小 rank越接近1)
rank_exprs = []
for col in rank_features:
    rank_exprs.append(
        pl.col(col).rank(method="dense").over("ranker_id").alias(f"{col}_rank")
    )

train_filled = train_filled.with_columns(rank_exprs)

  train_filled.melt(id_vars=[], value_vars=all_segments)
(Deprecated in version 0.20.5)
  .agg(pl.count().alias("segment_view_count"))
(Deprecated in version 0.20.5)
  leg0_counts = train_filled.group_by("legs0_key").agg(pl.count().alias("leg0_flight_view_count"))
(Deprecated in version 0.20.5)
  leg1_counts = train_filled.group_by("legs1_key").agg(pl.count().alias("leg1_flight_view_count"))
(Deprecated in version 0.20.5)
  all_counts = train_filled.group_by("all_key").agg(pl.count().alias("all_flight_view_count"))


In [4]:
for col in rank_features:
    rank_exprs.append(
        pl.col(col).rank(method="dense").over("ranker_id").alias(f"{col}_rank")
    )

train_filled = train_filled.with_columns(rank_exprs)

In [None]:
import polars as pl

def enrich_flight_view_features(df: pl.DataFrame) -> pl.DataFrame:
    def make_leg_segment_keys(leg_prefix):
        keys = []
        for i in range(4):
            key_name = f"{leg_prefix}_segments{i}_key"
            dep = pl.col(f"{leg_prefix}_segments{i}_departureFrom_airport_iata").fill_null("missing")
            arr = pl.col(f"{leg_prefix}_segments{i}_arrivalTo_airport_iata").fill_null("missing")
            keys.append((dep + "-" + arr).alias(key_name))
        return keys

    df = df.with_columns(make_leg_segment_keys("legs0") + make_leg_segment_keys("legs1"))

    all_segments = [f"legs0_segments{i}_key" for i in range(4)] + [f"legs1_segments{i}_key" for i in range(4)]

    segment_counts = (
        df.melt(id_vars=[], value_vars=all_segments)
        .filter(pl.col("value") != "missing-missing")
        .group_by("value")
        .agg(pl.count().alias("segment_view_count"))
    )

    for seg_col in all_segments:
        df = df.join(
            segment_counts,
            left_on=seg_col,
            right_on="value",
            how="left"
        ).with_columns(
            pl.col("segment_view_count").fill_null(0).alias(f"{seg_col}_view_count")
        ).drop("segment_view_count")

    def make_leg_full_key(leg_prefix):
        seg_keys = [f"{leg_prefix}_segments{i}_key" for i in range(4)]
        return pl.concat_str([pl.col(k) for k in seg_keys], separator="|").alias(f"{leg_prefix}_key")

    df = df.with_columns([
        make_leg_full_key("legs0"),
        make_leg_full_key("legs1"),
        (pl.col("legs0_key") + "||" + pl.col("legs1_key")).alias("all_key")
    ])

    leg0_counts = df.group_by("legs0_key").agg(pl.count().alias("leg0_flight_view_count"))
    leg1_counts = df.group_by("legs1_key").agg(pl.count().alias("leg1_flight_view_count"))
    all_counts = df.group_by("all_key").agg(pl.count().alias("all_flight_view_count"))

    df = df.join(leg0_counts, on="legs0_key", how="left")
    df = df.join(leg1_counts, on="legs1_key", how="left")
    df = df.join(all_counts, on="all_key", how="left")

    ranker_stats = df.group_by("ranker_id").agg([
        pl.max("leg0_flight_view_count").alias("leg0_view_max"),
        pl.max("leg1_flight_view_count").alias("leg1_view_max"),
        pl.max("all_flight_view_count").alias("all_view_max"),
    ])

    df = df.join(ranker_stats, on="ranker_id", how="left")

    df = df.with_columns([
        (pl.col("leg0_flight_view_count") / (pl.col("leg0_view_max") + 1e-5)).alias("leg0_view_norm"),
        (pl.col("leg1_flight_view_count") / (pl.col("leg1_view_max") + 1e-5)).alias("leg1_view_norm"),
        (pl.col("all_flight_view_count") / (pl.col("all_view_max") + 1e-5)).alias("all_view_norm"),
    ])

    ranker_stats_mean = df.group_by("ranker_id").agg([
        pl.mean("leg0_flight_view_count").alias("leg0_view_mean"),
        pl.mean("leg1_flight_view_count").alias("leg1_view_mean"),
        pl.mean("all_flight_view_count").alias("all_view_mean"),
    ])

    df = df.join(ranker_stats_mean, on="ranker_id", how="left")

    df = df.with_columns([
        (pl.col("leg0_flight_view_count") - pl.col("leg0_view_mean")).alias("leg0_view_diff_mean"),
        (pl.col("leg1_flight_view_count") - pl.col("leg1_view_mean")).alias("leg1_view_diff_mean"),
        (pl.col("all_flight_view_count") - pl.col("all_view_mean")).alias("all_view_diff_mean"),
    ])

    columns_to_drop = [
        "leg0_view_max", "leg1_view_max", "all_view_max",
        "leg0_view_mean", "leg1_view_mean", "all_view_mean",
        "legs0_key", "legs1_key", "all_key"
    ] + [f"legs0_segments{i}_key" for i in range(4)] + [f"legs1_segments{i}_key" for i in range(4)]

    df = df.drop(columns_to_drop)

    rank_features = [
        "leg0_flight_view_count",
        "leg1_flight_view_count",
        "all_flight_view_count",
    ] + [f"legs0_segments{i}_key_view_count" for i in range(4)] + [f"legs1_segments{i}_key_view_count" for i in range(4)]

    rank_exprs = []
    for col in rank_features:
        rank_exprs.append(
            pl.col(col).rank(method="dense").over("ranker_id").alias(f"{col}_rank")
        )
        
    df = df.with_columns(rank_exprs)

    return df


In [6]:
train_filled["selected", "legs0_segments1_key","legs0_segments1_key_view_count","leg0_flight_view_count", "leg1_flight_view_count", "all_flight_view_count"]

selected,legs0_segments1_key,legs0_segments1_key_view_count,leg0_flight_view_count,leg1_flight_view_count,all_flight_view_count
i64,str,u32,u32,u32,u32
1,"""missing-missing""",0,19,7,1
0,"""OVB-KJA""",99061,93,24,24
0,"""OVB-KJA""",99061,93,24,24
0,"""OVB-KJA""",99061,93,24,24
0,"""OVB-KJA""",99061,93,24,24
…,…,…,…,…,…
0,"""SGC-SVX""",9089,1542,293,83
0,"""UFA-SVX""",13035,1654,701,232
0,"""UFA-SVX""",13035,1654,293,90
0,"""KUF-SVX""",14413,402,701,67


In [7]:
import polars as pl

# 確定新創造的欄位
new_cols = [
    col for col in train_filled.columns
    if col.endswith("_view_count") or col.endswith("_flight_view_count")
]

# 對 selected 分組統計
summary = (
    train_filled
    .group_by("selected")
    .agg([
        pl.col(c).mean().alias(f"{c}_mean") for c in new_cols
    ] + [
        pl.col(c).median().alias(f"{c}_median") for c in new_cols
    ])
)

# 顯示結果
print(summary)


shape: (2, 23)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ selected ┆ legs0_seg ┆ legs1_seg ┆ legs0_seg ┆ … ┆ legs1_seg ┆ leg0_flig ┆ leg1_flig ┆ all_fligh │
│ ---      ┆ ments0_ke ┆ ments0_ke ┆ ments1_ke ┆   ┆ ments3_ke ┆ ht_view_c ┆ ht_view_c ┆ t_view_co │
│ i64      ┆ y_view_co ┆ y_view_co ┆ y_view_co ┆   ┆ y_view_co ┆ ount_medi ┆ ount_medi ┆ unt_media │
│          ┆ unt…      ┆ unt…      ┆ unt…      ┆   ┆ unt…      ┆ an        ┆ an        ┆ n         │
│          ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0        ┆ 1.6158e6  ┆ 1.1866e6  ┆ 62546.798 ┆ … ┆ 0.0       ┆ 254143.0  ┆ 591002.0  ┆ 164134.0  │
│          ┆           ┆           ┆ 422       ┆   ┆           ┆           ┆

In [8]:
summary

selected,legs0_segments0_key_view_count_mean,legs1_segments0_key_view_count_mean,legs0_segments1_key_view_count_mean,legs1_segments1_key_view_count_mean,legs0_segments2_key_view_count_mean,legs1_segments2_key_view_count_mean,legs0_segments3_key_view_count_mean,legs1_segments3_key_view_count_mean,leg0_flight_view_count_mean,leg1_flight_view_count_mean,all_flight_view_count_mean,legs0_segments0_key_view_count_median,legs1_segments0_key_view_count_median,legs0_segments1_key_view_count_median,legs1_segments1_key_view_count_median,legs0_segments2_key_view_count_median,legs1_segments2_key_view_count_median,legs0_segments3_key_view_count_median,legs1_segments3_key_view_count_median,leg0_flight_view_count_median,leg1_flight_view_count_median,all_flight_view_count_median
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,1615800.0,1186600.0,62546.798422,63814.990028,1041.334794,579.357371,0.000518,8e-06,854642.840459,1534900.0,392850.936772,642527.0,215527.0,0.0,0.0,0.0,0.0,0.0,0.0,254143.0,591002.0,164134.0
1,457649.008461,168575.473086,40684.721999,20811.323814,23.279754,15.748169,0.0,0.0,215366.434967,2498900.0,84134.372033,46027.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6998.0,4387201.0,2289.0


# price rank

In [None]:
from scripts.feature_enigeer import build_price_features
# 執行價格特徵工程，並存檔
price_features = build_price_features(
    train_filled,
    output_dir="data/extra_features/train/"
)

# 查看結果
print(price_features)


In [None]:
from scripts.feature_enigeer import build_price_features
# 執行價格特徵工程，並存檔
price_features = build_price_features(
    test_filled,
    output_dir="data/extra_features/test/"
)

# 查看結果
print(price_features)


# 處理duration

In [None]:
import polars as pl
from scripts.feature_enigeer import build_duration_features
# 所需欄位
duration_cols = [
    "legs0_duration",
    "legs1_duration",
    "legs0_segments0_duration",
    "legs0_segments1_duration",
    "legs0_segments2_duration",
    "legs0_segments3_duration",
    "legs1_segments0_duration",
    "legs1_segments1_duration",
    "legs1_segments2_duration",
    "legs1_segments3_duration"
]

needed_cols = ["Id", "ranker_id", "totalPrice"] + duration_cols

# 篩選
train_filled = train_filled.select([c for c in needed_cols if c in train_filled.columns])

# 再交給 build_duration_features
duration_features = build_duration_features(
    train_filled,
    output_dir="data/extra_features/train/"
)


In [None]:
import polars as pl
from scripts.feature_enigeer import build_duration_features

# 再交給 build_duration_features
duration_features = build_duration_features(
    test_filled,
    output_dir="data/extra_features/test/"
)


# frequent_flyer_features code

In [3]:
import polars as pl
from scripts.feature_enigeer import build_frequent_flyer_match_features
# 所需欄位
needed_cols = [
        "Id", "ranker_id", "frequentFlyer",
        "legs0_duration", "legs1_duration",
        *[f"{s}_{t}" for s in [
            "legs0_segments0", "legs0_segments1", "legs0_segments2", "legs0_segments3",
            "legs1_segments0", "legs1_segments1", "legs1_segments2", "legs1_segments3"
        ] for t in ["marketingCarrier_code", "operatingCarrier_code", "duration"]]
    ]


# 篩選
train_filled = train_filled.select([c for c in needed_cols if c in train_filled.columns])

# 再交給 build_duration_features
ff_features = build_frequent_flyer_match_features(
    train_filled,
    output_dir="data/extra_features/train/"
)


✅ 已儲存 Parquet: data/extra_features/train/3_frequent_flyer_features.parquet
✅ 已完成 frequentFlyer 特徵 + match 特徵 + duration 特徵生成


In [9]:
import polars as pl
from scripts.feature_enigeer import build_frequent_flyer_match_features
# 所需欄位
needed_cols = [
        "Id", "ranker_id", "frequentFlyer",
        "legs0_duration", "legs1_duration",
        *[f"{s}_{t}" for s in [
            "legs0_segments0", "legs0_segments1", "legs0_segments2", "legs0_segments3",
            "legs1_segments0", "legs1_segments1", "legs1_segments2", "legs1_segments3"
        ] for t in ["marketingCarrier_code", "operatingCarrier_code", "duration"]]
    ]


# 篩選
test_filled = test_filled.select([c for c in needed_cols if c in test_filled.columns])

# 再交給 build_duration_features
ff_features = build_frequent_flyer_match_features(
    test_filled,
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/3_frequent_flyer_features.parquet
✅ 已完成 frequentFlyer 特徵 + match 特徵 + duration 特徵生成


# Bagges

In [None]:
from scripts.feature_enigeer import build_baggage_fee_features

baggage_fee_df = build_baggage_fee_features(
    df=train_filled.select([
        "Id",
        "totalPrice",
        'ranker_id',
        "legs0_segments0_baggageAllowance_quantity",
        "legs1_segments0_baggageAllowance_quantity",
        "miniRules0_monetaryAmount",
        "miniRules1_monetaryAmount"
    ]),
    output_dir="data/extra_features/train/"
)


In [3]:
from scripts.feature_enigeer import build_baggage_fee_features

baggage_fee_df = build_baggage_fee_features(
    df=test_filled.select([
        "Id",
        "totalPrice",
        'ranker_id',
        "legs0_segments0_baggageAllowance_quantity",
        "legs1_segments0_baggageAllowance_quantity",
        "miniRules0_monetaryAmount",
        "miniRules1_monetaryAmount"
    ]),
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/4_baggage_fee_features.parquet


# cabinClass

In [None]:
from scripts.feature_enigeer import build_cabin_features
train_filled = train_filled.select([
        "Id",
        *[f"legs{i}_segments{j}_cabinClass" for i in [0,1] for j in range(4)],
        *[f"legs{i}_segments{j}_duration" for i in [0,1] for j in range(4)]
    ])

cabin_features_df = build_cabin_features(
    df=train_filled,
    output_dir="data/extra_features/train/"
)


In [4]:
from scripts.feature_enigeer import build_cabin_features
test_filled = test_filled.select([
        "Id",
        *[f"legs{i}_segments{j}_cabinClass" for i in [0,1] for j in range(4)],
        *[f"legs{i}_segments{j}_duration" for i in [0,1] for j in range(4)]
    ])

cabin_features_df = build_cabin_features(
    df=test_filled,
    output_dir="data/extra_features/test/"
)


  pl.struct(all_duration_cols + all_cabin_cols)


✅ 已儲存 Parquet: data/extra_features/test/5_cabin_features.parquet


# add time

In [None]:
from scripts.feature_enigeer import build_time_features
train_filled = train_filled.select(["Id", "requestDate", "legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"])

df_time_features = build_time_features(
    df=train_filled,
    output_dir="data/extra_features/train/"
)


In [3]:
from scripts.feature_enigeer import build_time_features
test_filled = test_filled.select(["Id", "requestDate", "legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"])

df_time_features = build_time_features(
    df=test_filled,
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/6_time_features.parquet
✅ 所有時間特徵已生成完成


# build_corporate_access_route_features

In [None]:
from scripts.feature_enigeer import build_corporate_access_route_features
df_features = build_corporate_access_route_features(
    df=train_filled.select(["Id", "corporateTariffCode", "pricingInfo_isAccessTP", "searchRoute"]),
    output_dir="data/extra_features/train/"
)


In [3]:
from scripts.feature_enigeer import build_corporate_access_route_features
df_features = build_corporate_access_route_features(
    df=test_filled.select(["Id", "corporateTariffCode", "pricingInfo_isAccessTP", "searchRoute"]),
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/7_corporate_access_route_features.parquet
✅ 已完成 corporate/access/route 特徵生成


# 加上是否直飛跟轉機幾次

In [None]:
from scripts.feature_enigeer import build_transfer_features
transfer_features_df = build_transfer_features(
    df=train_filled.select([
        "Id", "ranker_id", "legs0_duration", "legs1_duration",
        "legs0_segments1_departureFrom_airport_iata",
        "legs0_segments2_departureFrom_airport_iata",
        "legs0_segments3_departureFrom_airport_iata",
        "legs1_segments1_departureFrom_airport_iata",
        "legs1_segments2_departureFrom_airport_iata",
        "legs1_segments3_departureFrom_airport_iata"
    ]),
    output_dir="data/extra_features/train/"
)


In [4]:
from scripts.feature_enigeer import build_transfer_features
transfer_features_df = build_transfer_features(
    df=test_filled.select([
        "Id", "ranker_id", "legs0_duration", "legs1_duration",
        "legs0_segments1_departureFrom_airport_iata",
        "legs0_segments2_departureFrom_airport_iata",
        "legs0_segments3_departureFrom_airport_iata",
        "legs1_segments1_departureFrom_airport_iata",
        "legs1_segments2_departureFrom_airport_iata",
        "legs1_segments3_departureFrom_airport_iata"
    ]),
    output_dir="data/extra_features/test/"
)


✅ 已儲存 Parquet: data/extra_features/test/8_transfer_features.parquet
✅ 已完成轉機特徵生成


# Carrier

In [3]:
from scripts.feature_enigeer import build_carrier_consistency_features
required_columns = [
    # 主鍵
    "Id",
    "ranker_id",
    # legs0 轉機判斷
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    # legs1 轉機判斷
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    # legs0 Marketing
    "legs0_segments0_marketingCarrier_code",
    "legs0_segments1_marketingCarrier_code",
    "legs0_segments2_marketingCarrier_code",
    "legs0_segments3_marketingCarrier_code",
    # legs1 Marketing
    "legs1_segments0_marketingCarrier_code",
    "legs1_segments1_marketingCarrier_code",
    "legs1_segments2_marketingCarrier_code",
    "legs1_segments3_marketingCarrier_code"
]

train_filled = train_filled.select(required_columns)
import pickle
with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
    config = pickle.load(f)

carrier_df = build_carrier_consistency_features(
    df=train_filled,
    output_dir="data/extra_features/train/",
    transform_config=config
)


✅ 正在共用carrier encoding處理 ['legs0_main_carrier', 'legs1_main_carrier']
✅ 已儲存 Parquet: data/extra_features/train/9_carrier_consistency_features.parquet
✅ 已完成主Carrier一致性與轉機次數特徵


In [3]:
from scripts.feature_enigeer import build_carrier_consistency_features
required_columns = [
    # 主鍵
    "Id",
    "ranker_id",
    # legs0 轉機判斷
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments3_departureFrom_airport_iata",
    # legs1 轉機判斷
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments3_departureFrom_airport_iata",
    # legs0 Marketing
    "legs0_segments0_marketingCarrier_code",
    "legs0_segments1_marketingCarrier_code",
    "legs0_segments2_marketingCarrier_code",
    "legs0_segments3_marketingCarrier_code",
    # legs1 Marketing
    "legs1_segments0_marketingCarrier_code",
    "legs1_segments1_marketingCarrier_code",
    "legs1_segments2_marketingCarrier_code",
    "legs1_segments3_marketingCarrier_code"
]

test_filled = test_filled.select(required_columns)
import pickle
with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
    config = pickle.load(f)

carrier_df = build_carrier_consistency_features(
    df=test_filled,
    output_dir="data/extra_features/test/",
    transform_config=config
)


✅ 正在共用carrier encoding處理 ['legs0_main_carrier', 'legs1_main_carrier']
✅ 已儲存 Parquet: data/extra_features/test/9_carrier_consistency_features.parquet
✅ 已完成主Carrier一致性與轉機次數特徵


# encoding category features.

In [4]:
from scripts.feature_enigeer import build_label_encoding_features
# ✅ 先把所有 columns 列出
all_cols = train_filled.columns

# ✅ Collect relevant columns
selected_cols = []
# ✅ Collect relevant columns + Id
selected_cols = ["Id"]  # 一定要先放Id

# Aircraft code
selected_cols += [c for c in all_cols if c.endswith("_aircraft_code")]

# Flight Number
selected_cols += [c for c in all_cols if c.endswith("_flightNumber")]

# Airport / City
selected_cols += [c for c in all_cols if "_arrivalTo_airport_" in c or "_departureFrom_airport_" in c]

# Carrier
selected_cols += [c for c in all_cols if c.endswith("_marketingCarrier_code") or c.endswith("_operatingCarrier_code")]

# searchRoute
selected_cols.append("searchRoute")

# frequentFlyer
selected_cols.append("frequentFlyer")

# 去除重複
selected_cols = list(dict.fromkeys(selected_cols))

print("✅ build_label_encoding_features 需要的欄位：", selected_cols)

train_filled = train_filled.select(selected_cols)
import pickle
with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
    config = pickle.load(f)

# 執行編碼
df_encoded, config = build_label_encoding_features(
    df=train_filled,
    output_dir="data/extra_features/train/",
    transform_config=config
)


✅ build_label_encoding_features 需要的欄位： ['Id', 'legs0_segments0_aircraft_code', 'legs0_segments1_aircraft_code', 'legs0_segments2_aircraft_code', 'legs0_segments3_aircraft_code', 'legs1_segments0_aircraft_code', 'legs1_segments1_aircraft_code', 'legs1_segments2_aircraft_code', 'legs1_segments3_aircraft_code', 'legs0_segments0_flightNumber', 'legs0_segments1_flightNumber', 'legs0_segments2_flightNumber', 'legs0_segments3_flightNumber', 'legs1_segments0_flightNumber', 'legs1_segments1_flightNumber', 'legs1_segments2_flightNumber', 'legs1_segments3_flightNumber', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments1_arrivalTo_airport_city_iata', 'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata', 'legs0_segments2_arrivalTo_airport_city_iata', 'legs0_segments2_arrivalTo_airport_iata', 'legs0_segments2_departureFrom_airport_iata', 'legs0_segments3_arrivalTo_ai

In [3]:
from scripts.feature_enigeer import build_label_encoding_features
import pickle

# ✅ 先把所有 columns 列出
all_cols = test_filled.columns

# ✅ Collect relevant columns
selected_cols = []
# ✅ Collect relevant columns + Id
selected_cols = ["Id"]  # 一定要先放Id

# Aircraft code
selected_cols += [c for c in all_cols if c.endswith("_aircraft_code")]

# Flight Number
selected_cols += [c for c in all_cols if c.endswith("_flightNumber")]

# Airport / City
selected_cols += [c for c in all_cols if "_arrivalTo_airport_" in c or "_departureFrom_airport_" in c]

# Carrier
selected_cols += [c for c in all_cols if c.endswith("_marketingCarrier_code") or c.endswith("_operatingCarrier_code")]

# searchRoute
selected_cols.append("searchRoute")

# frequentFlyer
selected_cols.append("frequentFlyer")

# 去除重複
selected_cols = list(dict.fromkeys(selected_cols))

print("✅ build_label_encoding_features 需要的欄位：", selected_cols)

test_filled = test_filled.select(selected_cols)
with open("data/extra_features/transform_config_rank.pkl", "rb") as f:
    config = pickle.load(f)

# 執行編碼
df_encoded, config = build_label_encoding_features(
    df=test_filled,
    output_dir="data/extra_features/test/",
    transform_config = config
)


✅ build_label_encoding_features 需要的欄位： ['Id', 'legs0_segments0_aircraft_code', 'legs0_segments1_aircraft_code', 'legs0_segments2_aircraft_code', 'legs0_segments3_aircraft_code', 'legs1_segments0_aircraft_code', 'legs1_segments1_aircraft_code', 'legs1_segments2_aircraft_code', 'legs1_segments3_aircraft_code', 'legs0_segments0_flightNumber', 'legs0_segments1_flightNumber', 'legs0_segments2_flightNumber', 'legs0_segments3_flightNumber', 'legs1_segments0_flightNumber', 'legs1_segments1_flightNumber', 'legs1_segments2_flightNumber', 'legs1_segments3_flightNumber', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments1_arrivalTo_airport_city_iata', 'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata', 'legs0_segments2_arrivalTo_airport_city_iata', 'legs0_segments2_arrivalTo_airport_iata', 'legs0_segments2_departureFrom_airport_iata', 'legs0_segments3_arrivalTo_ai

# remove time

In [1]:
from scripts.feature_enigeer import merge_original_with_extra_features, clean_fill_and_cast_columns

merged_df = merge_original_with_extra_features(
    base_parquet_path="data/train.parquet",
    extra_features_dir="data/extra_features/train/"
)

# 如果要儲存
merged_df  = clean_fill_and_cast_columns(merged_df)
merged_df.write_parquet("data/train_filled.parquet")

✅ 讀取原始資料: data/train.parquet
✅ 共找到 10 個 Parquet 要合併
🔹 合併第 1/10 個: data/extra_features/train/1_price_features.parquet
🔹 合併第 2/10 個: data/extra_features/train/2_duration_features.parquet
⚠️ 10 個特徵將被新檔案覆蓋: ['legs1_segments2_duration', 'legs1_segments0_duration', 'legs0_segments3_duration', 'legs0_duration', 'legs0_segments1_duration', 'legs0_segments2_duration', 'legs1_segments1_duration', 'legs1_duration', 'legs1_segments3_duration', 'legs0_segments0_duration']
🔹 合併第 3/10 個: data/extra_features/train/3_frequent_flyer_features.parquet
⚠️ 40 個特徵將被新檔案覆蓋: ['legs1_segments0_duration', 'legs1_segments3_duration_rank', 'legs0_segments3_duration_rank', 'legs0_segments0_duration_rank', 'legs1_segments3_operatingCarrier_code', 'legs0_segments3_marketingCarrier_code', 'total_duration_rank', 'legs0_segments1_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'legs1_duration_rank', 'legs1_segments0_duration_rank', 'legs1_segments1_duration', 'legs1_duration', 'legs1_segments1_duration_r

In [1]:
from scripts.feature_enigeer import merge_original_with_extra_features,clean_fill_and_cast_columns
merged_df = merge_original_with_extra_features(
    base_parquet_path="data/test.parquet",
    extra_features_dir="data/extra_features/test/")

# 如果要儲存
merged_df  = clean_fill_and_cast_columns(merged_df)

merged_df.write_parquet("data/test_filled.parquet")
merged_df

✅ 讀取原始資料: data/test.parquet
✅ 共找到 10 個 Parquet 要合併
🔹 合併第 1/10 個: data/extra_features/test/1_price_features.parquet
🔹 合併第 2/10 個: data/extra_features/test/2_duration_features.parquet
⚠️ 10 個特徵將被新檔案覆蓋: ['legs0_segments3_duration', 'legs1_segments1_duration', 'legs0_segments1_duration', 'legs1_segments2_duration', 'legs0_duration', 'legs1_segments0_duration', 'legs1_segments3_duration', 'legs0_segments2_duration', 'legs0_segments0_duration', 'legs1_duration']
🔹 合併第 3/10 個: data/extra_features/test/3_frequent_flyer_features.parquet
⚠️ 40 個特徵將被新檔案覆蓋: ['legs1_segments3_duration_rank', 'legs0_segments3_duration', 'legs0_segments1_duration', 'legs0_segments0_marketingCarrier_code', 'legs1_segments0_duration', 'legs1_segments1_operatingCarrier_code', 'legs0_segments2_duration', 'legs0_segments0_duration', 'legs1_segments0_duration_rank', 'legs1_segments1_duration_rank', 'legs1_segments0_marketingCarrier_code', 'legs0_segments3_duration_rank', 'legs0_segments2_duration_rank', 'legs0_duration_ran

Id,bySelf,companyID,nationality,isAccess3D,isVip,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_seatsAvailable,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_seatsAvailable,legs0_segments2_baggageAllowance_quantity,legs0_segments2_baggageAllowance_weightMeasurementType,legs0_segments2_seatsAvailable,legs0_segments3_baggageAllowance_quantity,legs0_segments3_baggageAllowance_weightMeasurementType,legs0_segments3_seatsAvailable,legs1_segments0_baggageAllowance_weightMeasurementType,legs1_segments0_seatsAvailable,legs1_segments1_baggageAllowance_quantity,legs1_segments1_baggageAllowance_weightMeasurementType,legs1_segments1_seatsAvailable,legs1_segments2_baggageAllowance_quantity,legs1_segments2_baggageAllowance_weightMeasurementType,legs1_segments2_seatsAvailable,legs1_segments3_baggageAllowance_quantity,legs1_segments3_baggageAllowance_weightMeasurementType,legs1_segments3_seatsAvailable,miniRules0_percentage,miniRules0_statusInfos,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_passengerCount,profileId,sex,taxes,__index_level_0__,…,legs1_segments2_departureFrom_airport_iata,legs1_segments3_arrivalTo_airport_city_iata,legs1_segments3_arrivalTo_airport_iata,legs1_segments3_departureFrom_airport_iata,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments2_marketingCarrier_code,legs0_segments2_operatingCarrier_code,legs0_segments3_marketingCarrier_code,legs0_segments3_operatingCarrier_code,legs1_segments0_marketingCarrier_code,legs1_segments0_operatingCarrier_code,legs1_segments1_marketingCarrier_code,legs1_segments1_operatingCarrier_code,legs1_segments2_marketingCarrier_code,legs1_segments2_operatingCarrier_code,legs1_segments3_marketingCarrier_code,legs1_segments3_operatingCarrier_code,legs0_segments0_aircraft_code,legs0_segments1_aircraft_code,legs0_segments2_aircraft_code,legs0_segments3_aircraft_code,legs1_segments0_aircraft_code,legs1_segments1_aircraft_code,legs1_segments2_aircraft_code,legs1_segments3_aircraft_code,legs0_segments0_flightNumber,legs0_segments1_flightNumber,legs0_segments2_flightNumber,legs0_segments3_flightNumber,legs1_segments0_flightNumber,legs1_segments1_flightNumber,legs1_segments2_flightNumber,legs1_segments3_flightNumber,searchRoute
i64,i8,i64,i64,i8,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i8,f64,i64,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
18144679,1,62840,36,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,3604015,0,1018.0,18144679,…,678,-1,-1,-1,184,184,233,233,233,233,-1,-1,184,184,233,233,233,233,-1,-1,12,118,118,-1,13,118,118,-1,456,8790,8790,-1,450,8790,8790,-1,3236
18144680,1,62840,36,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,3604015,0,1018.0,18144680,…,678,-1,-1,-1,184,184,233,233,233,233,-1,-1,184,184,233,233,233,233,-1,-1,12,118,118,-1,13,118,118,-1,456,8790,8790,-1,450,8790,8790,-1,3236
18144681,1,62840,36,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,3604015,0,1018.0,18144681,…,678,-1,-1,-1,184,184,233,233,233,233,-1,-1,184,184,233,233,233,233,-1,-1,12,118,118,-1,13,118,118,-1,456,8790,8790,-1,450,8790,8790,-1,3236
18144682,1,62840,36,0,0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,3604015,0,3284.0,18144682,…,678,-1,-1,-1,197,197,233,233,233,233,-1,-1,197,197,233,233,233,233,-1,-1,9,118,118,-1,16,118,118,-1,1867,8790,8790,-1,2132,8790,8790,-1,3236
18144683,1,62840,36,0,0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,3604015,0,3284.0,18144683,…,678,-1,-1,-1,197,197,233,233,233,233,-1,-1,197,197,233,233,233,233,-1,-1,9,118,118,-1,16,118,118,-1,1867,8790,8790,-1,2132,8790,8790,-1,3236
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
25043143,0,57320,36,1,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,3647580,1,566.0,25043143,…,678,-1,-1,-1,184,184,233,233,233,233,-1,-1,233,233,233,233,233,233,-1,-1,12,118,118,-1,118,118,118,-1,749,8790,8790,-1,8790,8790,8790,-1,2725
25043144,0,57320,36,1,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,3647580,1,566.0,25043144,…,678,-1,-1,-1,184,184,233,233,233,233,-1,-1,233,233,233,233,233,233,-1,-1,12,118,118,-1,118,118,118,-1,192,8790,8790,-1,8790,8790,8790,-1,2725
25043145,0,57320,36,1,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,3647580,1,566.0,25043145,…,678,-1,-1,-1,184,184,233,233,233,233,-1,-1,233,233,233,233,233,233,-1,-1,12,118,118,-1,118,118,118,-1,192,8790,8790,-1,8790,8790,8790,-1,2725
25043146,0,57320,36,1,0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,3647580,1,566.0,25043146,…,678,-1,-1,-1,184,184,233,233,233,233,-1,-1,233,233,233,233,233,233,-1,-1,13,118,118,-1,118,118,118,-1,194,8790,8790,-1,8790,8790,8790,-1,2725


In [3]:
# 找字串欄
import polars as pl
str_cols = [c for c in merged_df.columns if merged_df[c].dtype in (pl.Utf8, pl.String)]
# 找數值欄
numeric_cols = [c for c in merged_df.columns if merged_df[c].dtype.is_numeric()]
# 找布林欄
bool_cols = [c for c in merged_df.columns if merged_df[c].dtype == pl.Boolean]

print(f"✅ 共找到 {len(str_cols)} 個字串欄位")
print(f"✅ 共找到 {len(numeric_cols)} 個數值欄位")
print(f"✅ 共找到 {len(bool_cols)} 個布林欄位")


✅ 共找到 6 個字串欄位
✅ 共找到 228 個數值欄位
✅ 共找到 0 個布林欄位


In [None]:
merged_df[str_cols]

frequentFlyer,legs0_departureAt,legs0_arrivalAt,legs1_departureAt,legs1_arrivalAt,ranker_id
str,str,str,str,str,str
"""missing""","""2024-12-19T06:50:00""","""2024-12-19T11:20:00""","""2024-12-21T21:10:00""","""2024-12-21T21:55:00""","""c9373e5f772e43d593dd6ad2fa90f6…"
"""missing""","""2024-12-19T06:50:00""","""2024-12-19T11:20:00""","""2024-12-21T21:10:00""","""2024-12-21T21:55:00""","""c9373e5f772e43d593dd6ad2fa90f6…"
"""missing""","""2024-12-19T06:50:00""","""2024-12-19T11:20:00""","""2024-12-21T21:10:00""","""2024-12-21T21:55:00""","""c9373e5f772e43d593dd6ad2fa90f6…"
"""missing""","""2024-12-19T08:25:00""","""2024-12-19T12:45:00""","""2024-12-21T12:00:00""","""2024-12-21T12:35:00""","""c9373e5f772e43d593dd6ad2fa90f6…"
"""missing""","""2024-12-19T08:25:00""","""2024-12-19T12:45:00""","""2024-12-21T12:00:00""","""2024-12-21T12:35:00""","""c9373e5f772e43d593dd6ad2fa90f6…"
…,…,…,…,…,…
"""missing""","""2025-01-08T09:05:00""","""2025-01-08T12:50:00""","""missing""","""missing""","""c5622e0de0594bde95a4dd8c1fcff7…"
"""missing""","""2025-01-08T21:25:00""","""2025-01-09T01:10:00""","""missing""","""missing""","""c5622e0de0594bde95a4dd8c1fcff7…"
"""missing""","""2025-01-08T21:25:00""","""2025-01-09T01:10:00""","""missing""","""missing""","""c5622e0de0594bde95a4dd8c1fcff7…"
"""missing""","""2025-01-08T15:10:00""","""2025-01-08T18:50:00""","""missing""","""missing""","""c5622e0de0594bde95a4dd8c1fcff7…"


: 

In [2]:
import polars as pl
# 先讀 parquet
test = pl.read_parquet('data/test_filled.parquet')
train = pl.read_parquet('data/train_filled.parquet')  # 這裡你可能打錯了，train 路徑

# 取欄位名稱
train_cols = set(train.columns)
test_cols = set(test.columns)

# 列印比較
print("✅ Train columns:", len(train_cols))
print("✅ Test columns:", len(test_cols))

print("\n🎯 Train 中有但 Test 沒有的欄位:")
print(sorted(train_cols - test_cols))

print("\n🎯 Test 中有但 Train 沒有的欄位:")
print(sorted(test_cols - train_cols))

# 如果完全一樣
if train_cols == test_cols:
    print("\n✅ Train 和 Test 欄位完全一致！")


✅ Train columns: 236
✅ Test columns: 235

🎯 Train 中有但 Test 沒有的欄位:
['selected']

🎯 Test 中有但 Train 沒有的欄位:
[]
