In [1]:
import polars as pl

# Step 1: load the per-feature importance table.
importance_df = pl.read_csv(
    "model_output/all_features_xgb/v2/v1_model/model_importance/common_features_with_ranks.csv"
)

# Steps 2-3: features whose min_rank_overall equals 9999.0 are the ones this
# cell removes; collect their names as a plain Python list.
is_unimportant = pl.col("min_rank_overall") == 9999.0
unimportant_features_df = importance_df.filter(is_unimportant)
unimportant_features = unimportant_features_df.get_column("feature").to_list()

# Step 4: report what is about to be dropped, one feature per line.
print("✅ 以下特徵 min_rank_overall=9999.0，將被移除：")
for feature_name in unimportant_features:
    print(f"- {feature_name}")

# Step 5: load the training feature matrix.
df = pl.read_parquet("data/train_filled_group_aware_features.parquet")

# Step 6: only drop columns that are actually present in the frame, keeping
# the order in which they appear in the importance table.
present_columns = set(df.columns)
existing_cols_to_remove = [
    name for name in unimportant_features if name in present_columns
]
df_filtered = df.drop(existing_cols_to_remove)

# Step 7: persist the reduced frame and summarise the result.
output_path = "data/train_filled_group_aware_features_filter.parquet"
df_filtered.write_parquet(output_path)

print(f"✅ 已移除 {len(existing_cols_to_remove)} 個欄位")
print(f"✅ 已儲存: {output_path}")


✅ 以下特徵 min_rank_overall=9999.0，將被移除：
- legs1_segments3_marketingCarrier_code
- legs1_segments3_marketingCarrier_code_in_ff
- legs1_segments2_arrivalTo_airport_iata
- legs1_segments2_baggageAllowance_weightMeasurementType
- legs1_segments3_flightNumber
- legs1_segments2_arrivalTo_airport_city_iata
- legs1_segments3_operatingCarrier_code
- legs1_segments3_duration_rank
- legs1_segments3_arrivalTo_airport_iata
- legs1_segments3_departureFrom_airport_iata
- legs1_segments3_cabinClass
- legs1_segments3_seatsAvailable
- legs1_segments3_baggageAllowance_weightMeasurementType
- legs1_segments3_arrivalTo_airport_city_iata
- legs1_segments3_aircraft_code
- legs0_segments3_baggageAllowance_quantity
- legs1_segments3_baggageAllowance_quantity
- legs1_segments2_cabinClass
- legs1_segments3_duration
- legs0_segments3_cabinClass
- legs0_segments3_operatingCarrier_code
- legs0_segments3_duration
- legs0_all_segments_carrier_same
- pricingInfo_passengerCount
- is_round_trip
- bySelf
- has_corporate_tar

KeyboardInterrupt: 

In [1]:
import polars as pl

# Parameters.
# TOP_N: how many of the best-ranked features to keep.
TOP_N = 150
# RANK_COL: column of the importance table used for ordering. The table is
# sorted ascending on it and the first TOP_N rows are kept.
# NOTE(review): the original comments/messages mentioned "min_rank_overall"
# while the code sorted by "avg_rank"; the sort column is left unchanged and
# the message now reports the column actually used — confirm which is intended.
RANK_COL = "avg_rank"
# Identifier / label columns that are always kept in the training output.
KEY_COLS = ["Id", "ranker_id", "selected"]

# 1. Load the model-importance CSV.
importance_df = pl.read_csv(
    "model_output/all_features_xgb/v2/v1_model/model_importance/common_features_with_ranks.csv"
)

# 2. Sort ascending by RANK_COL.
importance_df = importance_df.sort(RANK_COL)

# 3. Take the first TOP_N feature names.
top_features = importance_df.head(TOP_N)["feature"].to_list()

# 4. Print the selection (the message names the real sort column).
print(f"✅ 保留前 {TOP_N} 個特徵 (依據 {RANK_COL})：")
for f in top_features:
    print(f"- {f}")

# 5. Load the training feature matrix.
df = pl.read_parquet("data/train_filled_group_aware_features.parquet")

# 6. Keep the key columns plus every selected feature that exists in the frame.
#    dict.fromkeys de-duplicates while preserving order, so a key column that
#    also shows up in the top-N list does not make select() fail on a
#    duplicated column name.
available_cols = set(df.columns)
cols_to_keep = list(
    dict.fromkeys(KEY_COLS + [c for c in top_features if c in available_cols])
)

df_filtered = df.select(cols_to_keep)

# 7. Save. The file name follows TOP_N so it cannot drift from the parameter.
output_path = f"data/train_filled_group_aware_features_top{TOP_N}.parquet"
df_filtered.write_parquet(output_path)

print(f"✅ 已保留 {len(cols_to_keep)} 個欄位")
print(f"✅ 已儲存: {output_path}")


✅ 保留前 150 個特徵 (依據 min_rank_overall)：
- total_is_min_transfers
- total_num_transfers_rank
- legs0_segments0_flightNumber
- price_per_duration
- duration_percentile
- companyID
- legs0_num_transfers_rank
- price_per_duration_rank
- has_baggage
- price_rank_times_duration_rank
- legs1_mean_cabin
- price_ratio_group_max
- price_from_median_zscore
- price_percentile
- legs0_weighted_mean_cabin
- has_access_tp
- price_per_fee_rank
- pricingInfo_isAccessTP
- days_before_departure
- price_minus_fee_rank
- total_weighted_mean_cabin
- legs0_arrivalAt_hour
- legs0_mean_cabin
- both_legs_carrier_all_same
- legs0_departureAt_hour
- legs0_segments0_baggageAllowance_quantity
- log_price_times_rank
- baggage_total
- price_per_fee
- duration_ratio_group_max
- legs1_weighted_mean_cabin
- price_per_tax
- legs1_arrivalAt_day_period
- legs1_departureAt_hour
- legs1_segments0_marketingCarrier_code
- total_fees
- legs1_arrivalAt_hour
- duration_ratio
- price_ratio_times_duration_ratio
- log_price_times_ratio

In [3]:
import polars as pl
from itertools import combinations, product
import os

# Every group-relative statistic below is computed within one ranker_id.
GROUP_COL = "ranker_id"

# Load data
df = pl.read_parquet("data/train_filled_v2_top150.parquet")

# Fail fast when a required column is missing.
# NOTE(review): "legs0_mean_cabin" and "isVip" are checked here but not used
# by any transform in this cell — confirm whether they are still required.
needed_cols = ["ranker_id", "totalPrice", "total_duration", "legs0_mean_cabin", "isVip"]
for c in needed_cols:
    if c not in df.columns:
        raise ValueError(f"❌ 缺少欄位: {c}")

# Columns that receive the transforms.
target_cols = ["totalPrice", "total_duration"]

# transform_exprs: all transform expressions, in creation order.
# col_transform_map: source column -> names of the features derived from it.
transform_exprs = []
col_transform_map = {col: [] for col in target_cols}

for col in target_cols:
    value = pl.col(col)
    group_mean = value.mean().over(GROUP_COL)
    # The sample standard deviation is undefined for a group with a single
    # value; treat that as zero spread so the z-score features become 0
    # instead of propagating a missing value into every interaction term.
    group_std = value.std().over(GROUP_COL).fill_nan(0.0).fill_null(0.0)
    group_min = value.min().over(GROUP_COL)
    group_rank = value.rank("average").over(GROUP_COL)
    # count() ignores nulls; cast to float so "count - 1" is ordinary float
    # arithmetic.
    group_count = value.count().over(GROUP_COL).cast(pl.Float64)

    # Rank rescaled to [-1, +1]. With one value in the group the original
    # formula is 0/0 (NaN); use the midpoint 0.0 there. Null inputs stay null.
    rank_normalized = (
        pl.when(group_count > 1)
        .then((group_rank - 1) / (group_count - 1) * 2 - 1)
        .when(value.is_not_null())
        .then(pl.lit(0.0))
        .otherwise(None)
    )

    # Insertion order defines the order of the generated columns.
    transforms = {
        # Smoothed z-score: +1 in the denominator damps small-spread groups.
        "smoothed_zscore": (value - group_mean) / (group_std + 1),
        # Within-group percentile: average rank divided by non-null count.
        "percentile": group_rank / group_count,
        # Square root of the raw value.
        "sqrt": value.sqrt(),
        # Relative difference versus the group minimum.
        "relative_diff": (value - group_min) / (group_min + 1),
        # Group standardisation with a small epsilon in the denominator.
        "standardized": (value - group_mean) / (group_std + 1e-3),
        # Normalised rank (-1 ~ +1).
        "rank_normalized": rank_normalized,
    }

    for suffix, expr in transforms.items():
        name = f"{col}_{suffix}"
        transform_exprs.append(expr.alias(name))
        col_transform_map[col].append(name)

# Apply the transforms.
df = df.with_columns(transform_exprs)

# Second-order interaction features.
pair_exprs = []

# 1. Same source column: transform x transform.
for col, trans_cols in col_transform_map.items():
    for a, b in combinations(trans_cols, 2):
        pair_exprs.append(
            (pl.col(a) * pl.col(b)).alias(f"{a}_X_{b}")
        )

# 2. Different source columns, same transform.
# Recover each transform suffix by stripping "<column>_" by length, so this
# keeps working when the first target column itself contains an underscore.
prefix_len = len(target_cols[0]) + 1
transform_types = [name[prefix_len:] for name in col_transform_map[target_cols[0]]]

for t in transform_types:
    col1 = f"{target_cols[0]}_{t}"
    col2 = f"{target_cols[1]}_{t}"
    pair_exprs.append(
        (pl.col(col1) * pl.col(col2)).alias(f"{col1}_X_{col2}")
    )

# Third-order interactions are intentionally not generated.

# Add the interaction features.
df = df.with_columns(pair_exprs)

output_path = "data/train_filled_v3.parquet"
df.write_parquet(output_path)

print(f"✅ 完成 {sum(len(v) for v in col_transform_map.values())} 個轉換特徵")
print(f"✅ 完成 {len(pair_exprs)} 個二階交互特徵 (僅同欄位轉換 & 不同欄位同轉換)")
print(f"✅ 已輸出到 {output_path}")


✅ 完成 12 個轉換特徵
✅ 完成 36 個二階交互特徵 (僅同欄位轉換 & 不同欄位同轉換)
✅ 已輸出到 data/train_filled_v3.parquet


In [1]:
import polars as pl

# Parameters.
# TOP_N: how many of the best-ranked features to keep.
TOP_N = 150
# RANK_COL: column of the importance table used for ordering. The table is
# sorted ascending on it and the first TOP_N rows are kept.
# NOTE(review): the original comments/messages mentioned "min_rank_overall"
# while the code sorted by "avg_rank"; the sort column is left unchanged and
# the message now reports the column actually used — confirm which is intended.
RANK_COL = "avg_rank"
# Identifier columns that are always kept in the test output.
KEY_COLS = ["Id", "ranker_id"]

# 1. Load the model-importance CSV.
importance_df = pl.read_csv(
    "model_output/all_features_xgb/v2/v1_model/model_importance/common_features_with_ranks.csv"
)

# 2. Sort ascending by RANK_COL.
importance_df = importance_df.sort(RANK_COL)

# 3. Take the first TOP_N feature names.
top_features = importance_df.head(TOP_N)["feature"].to_list()

# 4. Print the selection (the message names the real sort column).
print(f"✅ 保留前 {TOP_N} 個特徵 (依據 {RANK_COL})：")
for f in top_features:
    print(f"- {f}")

# 5. Load the test feature matrix.
df = pl.read_parquet("data/test_filled_group_aware_features.parquet")

# 6. Keep the key columns plus every selected feature that exists in the frame.
#    dict.fromkeys de-duplicates while preserving order, so a key column that
#    also shows up in the top-N list does not make select() fail on a
#    duplicated column name.
available_cols = set(df.columns)
cols_to_keep = list(
    dict.fromkeys(KEY_COLS + [c for c in top_features if c in available_cols])
)

df_filtered = df.select(cols_to_keep)

# 7. Save. The file name follows TOP_N so it cannot drift from the parameter.
output_path = f"data/test_filled_group_aware_features_top{TOP_N}.parquet"
df_filtered.write_parquet(output_path)

print(f"✅ 已保留 {len(cols_to_keep)} 個欄位")
print(f"✅ 已儲存: {output_path}")


✅ 保留前 150 個特徵 (依據 min_rank_overall)：
- total_is_min_transfers
- total_num_transfers_rank
- legs0_segments0_flightNumber
- price_per_duration
- duration_percentile
- companyID
- legs0_num_transfers_rank
- price_per_duration_rank
- has_baggage
- price_rank_times_duration_rank
- legs1_mean_cabin
- price_ratio_group_max
- price_from_median_zscore
- price_percentile
- legs0_weighted_mean_cabin
- has_access_tp
- price_per_fee_rank
- pricingInfo_isAccessTP
- days_before_departure
- price_minus_fee_rank
- total_weighted_mean_cabin
- legs0_arrivalAt_hour
- legs0_mean_cabin
- both_legs_carrier_all_same
- legs0_departureAt_hour
- legs0_segments0_baggageAllowance_quantity
- log_price_times_rank
- baggage_total
- price_per_fee
- duration_ratio_group_max
- legs1_weighted_mean_cabin
- price_per_tax
- legs1_arrivalAt_day_period
- legs1_departureAt_hour
- legs1_segments0_marketingCarrier_code
- total_fees
- legs1_arrivalAt_hour
- duration_ratio
- price_ratio_times_duration_ratio
- log_price_times_ratio

In [1]:
import polars as pl
from itertools import combinations, product
import os

# Every group-relative statistic below is computed within one ranker_id.
GROUP_COL = "ranker_id"

# Load data
df = pl.read_parquet("data/test_filled_v2_top150.parquet")

# Fail fast when a required column is missing.
# NOTE(review): "legs0_mean_cabin" and "isVip" are checked here but not used
# by any transform in this cell — confirm whether they are still required.
needed_cols = ["ranker_id", "totalPrice", "total_duration", "legs0_mean_cabin", "isVip"]
for c in needed_cols:
    if c not in df.columns:
        raise ValueError(f"❌ 缺少欄位: {c}")

# Columns that receive the transforms.
target_cols = ["totalPrice", "total_duration"]

# transform_exprs: all transform expressions, in creation order.
# col_transform_map: source column -> names of the features derived from it.
transform_exprs = []
col_transform_map = {col: [] for col in target_cols}

for col in target_cols:
    value = pl.col(col)
    group_mean = value.mean().over(GROUP_COL)
    # The sample standard deviation is undefined for a group with a single
    # value; treat that as zero spread so the z-score features become 0
    # instead of propagating a missing value into every interaction term.
    group_std = value.std().over(GROUP_COL).fill_nan(0.0).fill_null(0.0)
    group_min = value.min().over(GROUP_COL)
    group_rank = value.rank("average").over(GROUP_COL)
    # count() ignores nulls; cast to float so "count - 1" is ordinary float
    # arithmetic.
    group_count = value.count().over(GROUP_COL).cast(pl.Float64)

    # Rank rescaled to [-1, +1]. With one value in the group the original
    # formula is 0/0 (NaN); use the midpoint 0.0 there. Null inputs stay null.
    rank_normalized = (
        pl.when(group_count > 1)
        .then((group_rank - 1) / (group_count - 1) * 2 - 1)
        .when(value.is_not_null())
        .then(pl.lit(0.0))
        .otherwise(None)
    )

    # Insertion order defines the order of the generated columns.
    transforms = {
        # Smoothed z-score: +1 in the denominator damps small-spread groups.
        "smoothed_zscore": (value - group_mean) / (group_std + 1),
        # Within-group percentile: average rank divided by non-null count.
        "percentile": group_rank / group_count,
        # Square root of the raw value.
        "sqrt": value.sqrt(),
        # Relative difference versus the group minimum.
        "relative_diff": (value - group_min) / (group_min + 1),
        # Group standardisation with a small epsilon in the denominator.
        "standardized": (value - group_mean) / (group_std + 1e-3),
        # Normalised rank (-1 ~ +1).
        "rank_normalized": rank_normalized,
    }

    for suffix, expr in transforms.items():
        name = f"{col}_{suffix}"
        transform_exprs.append(expr.alias(name))
        col_transform_map[col].append(name)

# Apply the transforms.
df = df.with_columns(transform_exprs)

# Second-order interaction features.
pair_exprs = []

# 1. Same source column: transform x transform.
for col, trans_cols in col_transform_map.items():
    for a, b in combinations(trans_cols, 2):
        pair_exprs.append(
            (pl.col(a) * pl.col(b)).alias(f"{a}_X_{b}")
        )

# 2. Different source columns, same transform.
# Recover each transform suffix by stripping "<column>_" by length, so this
# keeps working when the first target column itself contains an underscore.
prefix_len = len(target_cols[0]) + 1
transform_types = [name[prefix_len:] for name in col_transform_map[target_cols[0]]]

for t in transform_types:
    col1 = f"{target_cols[0]}_{t}"
    col2 = f"{target_cols[1]}_{t}"
    pair_exprs.append(
        (pl.col(col1) * pl.col(col2)).alias(f"{col1}_X_{col2}")
    )

# Third-order interactions are intentionally not generated.

# Add the interaction features.
df = df.with_columns(pair_exprs)

output_path = "data/test_filled_v3.parquet"
df.write_parquet(output_path)

print(f"✅ 完成 {sum(len(v) for v in col_transform_map.values())} 個轉換特徵")
print(f"✅ 完成 {len(pair_exprs)} 個二階交互特徵 (僅同欄位轉換 & 不同欄位同轉換)")
print(f"✅ 已輸出到 {output_path}")


✅ 完成 12 個轉換特徵
✅ 完成 36 個二階交互特徵 (僅同欄位轉換 & 不同欄位同轉換)
✅ 已輸出到 data/test_filled_v3.parquet
