In [4]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import time
import xgboost as xgb

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [5]:
# Load data
test = pl.read_parquet('data/test.parquet').drop('__index_level_0__').with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))



# 刪除空字串

In [6]:
# 先強制指定 dtype
test = test.with_columns(
    pl.col("legs0_segments3_aircraft_code").cast(pl.Utf8),
    pl.col("legs1_segments3_aircraft_code").cast(pl.Utf8),
    pl.col("legs0_segments3_flightNumber").cast(pl.Utf8),
    pl.col("legs1_segments3_flightNumber").cast(pl.Utf8),
    pl.col("legs0_segments3_arrivalTo_airport_city_iata").cast(pl.Utf8),
    pl.col("legs1_segments3_arrivalTo_airport_city_iata").cast(pl.Utf8),
    pl.col("legs0_segments3_arrivalTo_airport_iata").cast(pl.Utf8),
    pl.col("legs1_segments3_arrivalTo_airport_iata").cast(pl.Utf8),
    pl.col("legs0_segments3_departureFrom_airport_iata").cast(pl.Utf8),
    pl.col("legs1_segments3_departureFrom_airport_iata").cast(pl.Utf8),
    pl.col("legs0_segments3_marketingCarrier_code").cast(pl.Utf8),
    pl.col("legs1_segments3_marketingCarrier_code").cast(pl.Utf8),
    pl.col("legs0_segments3_operatingCarrier_code").cast(pl.Utf8),
    pl.col("legs1_segments3_operatingCarrier_code").cast(pl.Utf8),
)

# 確認字串欄位
str_cols = [c for c in test.columns if test[c].dtype == pl.Utf8 or test[c].dtype == pl.String]

# 確認數值欄位
numeric_cols = [c for c in test.columns if test[c].dtype.is_numeric()]

# 將空字串視為 null
string_exprs = [
    pl.when(pl.col(c).str.strip_chars() == "")
      .then(None)
      .otherwise(pl.col(c))
      .alias(c)
    for c in str_cols
]

# 先把空字串改成 null
test = test.with_columns(string_exprs)

# 再一次性做 fill_null
test = test.with_columns(
    [pl.col(c).fill_null("missing") for c in str_cols] +
    [pl.col(c).fill_null(0) for c in numeric_cols]
)

print("✅ 所有空字串與null已處理完畢，並已確保 legs0_segments3_aircraft_code 是字串")


✅ 所有空字串與null已處理完畢，並已確保 legs0_segments3_aircraft_code 是字串


# 處理duration

In [8]:
duration_cols = [
    "legs0_duration",
    "legs1_duration",
    "legs0_segments0_duration",
    "legs0_segments1_duration",
    "legs0_segments2_duration",
    "legs0_segments3_duration",
    "legs1_segments0_duration",
    "legs1_segments1_duration",
    "legs1_segments2_duration",
    "legs1_segments3_duration"
]
duration_exprs = [
    (
        # 先把 null 轉空字串，避免 str.extract 爆炸
        pl.when(
            pl.col(c).fill_null("")
             .is_in(["", "missing"])
        )
        .then(0)
        .otherwise(
            # 先把 null 轉空字串，確保 extract 不錯誤
            pl.col(c).fill_null("")
             .str.extract(r"^(\d+):", 1).cast(pl.Int64) * 60 +
            pl.col(c).fill_null("")
             .str.extract(r":(\d+):", 1).cast(pl.Int64)
        )
        .alias(c)
    )
    for c in duration_cols if c in test.columns
]


train_filled = test.with_columns(duration_exprs)


In [13]:
train_filled

Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,legs1_segments2_baggageAllowance_quantity,legs1_segments2_baggageAllowance_weightMeasurementType,legs1_segments2_cabinClass,legs1_segments2_departureFrom_airport_iata,legs1_segments2_duration,legs1_segments2_flightNumber,legs1_segments2_marketingCarrier_code,legs1_segments2_operatingCarrier_code,legs1_segments2_seatsAvailable,legs1_segments3_aircraft_code,legs1_segments3_arrivalTo_airport_city_iata,legs1_segments3_arrivalTo_airport_iata,legs1_segments3_baggageAllowance_quantity,legs1_segments3_baggageAllowance_weightMeasurementType,legs1_segments3_cabinClass,legs1_segments3_departureFrom_airport_iata,legs1_segments3_duration,legs1_segments3_flightNumber,legs1_segments3_marketingCarrier_code,legs1_segments3_operatingCarrier_code,legs1_segments3_seatsAvailable,miniRules0_monetaryAmount,miniRules0_percentage,miniRules0_statusInfos,miniRules1_monetaryAmount,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_isAccessTP,pricingInfo_passengerCount,profileId,ranker_id,requestDate,searchRoute,sex,taxes,totalPrice,selected
i64,bool,i64,i64,str,i64,bool,bool,str,str,i64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,…,f64,f64,f64,str,i64,str,str,str,f64,null,null,null,f64,f64,f64,null,i64,null,null,null,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,datetime[ns],str,bool,f64,f64,i64
18144679,true,62840,0,"""missing""",36,false,false,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,"""32A""","""SVX""","""SVX""",0.0,0.0,1.0,"""SVO""",150,"""1410""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,2800.0,0.0,1.0,0.0,0.0,0.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",false,1018.0,9818.0,0
18144680,true,62840,0,"""missing""",36,false,false,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,"""32A""","""SVX""","""SVX""",1.0,0.0,1.0,"""SVO""",150,"""1410""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,2800.0,0.0,1.0,2800.0,0.0,1.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",false,1018.0,14018.0,0
18144681,true,62840,0,"""missing""",36,false,false,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,"""32A""","""SVX""","""SVX""",2.0,0.0,1.0,"""SVO""",150,"""1410""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",false,1018.0,22418.0,0
18144682,true,62840,0,"""missing""",36,false,false,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,"""320""","""SVX""","""SVX""",0.0,0.0,1.0,"""DME""",140,"""273""","""U6""","""U6""",7.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",false,3284.0,12974.0,0
18144683,true,62840,0,"""missing""",36,false,false,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,"""320""","""SVX""","""SVX""",1.0,0.0,1.0,"""DME""",140,"""273""","""U6""","""U6""",7.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,0.0,0.0,1.0,1500.0,0.0,1.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",false,3284.0,16974.0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
25043143,false,57320,65,"""missing""",36,true,false,"""2025-01-08T12:50:00""","""2025-01-08T09:05:00""",165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1678""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,16486.0,0
25043144,false,57320,65,"""missing""",36,true,false,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,"""32A""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,2800.0,0.0,1.0,2800.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,11701.0,0
25043145,false,57320,65,"""missing""",36,true,false,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,16486.0,0
25043146,false,57320,65,"""missing""",36,true,false,"""2025-01-08T18:50:00""","""2025-01-08T15:10:00""",160,"""32B""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",160,"""1174""","""SU""","""SU""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,,,,0.0,0.0,0.0,,0,,,,0.0,2800.0,0.0,1.0,2800.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,14431.0,0


# add time

In [9]:
time_exprs = []
for col in ("legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"):
    if col in train_filled.columns:
        # 把 "missing" 統一變 null
        cleaned_col = (
            pl.when(pl.col(col) == "missing")
              .then(None)
              .otherwise(pl.col(col))
        )

        # 轉 datetime
        dt = cleaned_col.str.to_datetime(strict=False)

        # 小時
        h = dt.dt.hour()

        # 四段時段
        period = (
            pl.when(h.is_between(0,5))
              .then(0)
              .when(h.is_between(6,11))
              .then(1)
              .when(h.is_between(12,17))
              .then(2)
              .when(h.is_between(18,23))
              .then(3)
        )

        time_exprs.extend([
            h.fill_null(-1).alias(f"{col}_hour"),
            dt.dt.weekday().fill_null(-1).alias(f"{col}_weekday"),
            (
                ((h >= 6) & (h <= 9)) | ((h >= 17) & (h <= 20))
            ).cast(pl.Int32).fill_null(-1).alias(f"{col}_business_time"),
            period.fill_null(-1).alias(f"{col}_day_period")
        ])

train_filled = train_filled.with_columns(time_exprs)

In [13]:
train_filled

Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,legs1_segments3_duration,legs1_segments3_flightNumber,legs1_segments3_marketingCarrier_code,legs1_segments3_operatingCarrier_code,legs1_segments3_seatsAvailable,miniRules0_monetaryAmount,miniRules0_percentage,miniRules0_statusInfos,miniRules1_monetaryAmount,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_isAccessTP,pricingInfo_passengerCount,profileId,ranker_id,requestDate,searchRoute,sex,taxes,totalPrice,selected,legs0_departureAt_hour,legs0_departureAt_weekday,legs0_departureAt_business_time,legs0_departureAt_day_period,legs0_arrivalAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_business_time,legs0_arrivalAt_day_period,legs1_departureAt_hour,legs1_departureAt_weekday,legs1_departureAt_business_time,legs1_departureAt_day_period,legs1_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_business_time,legs1_arrivalAt_day_period
i64,bool,i64,i64,str,i64,bool,bool,str,str,i64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,…,i64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,datetime[ns],str,bool,f64,f64,i64,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32
0,true,57323,0,"""S7/SU/UT""",36,false,false,"""2024-06-15T16:20:00""","""2024-06-15T15:40:00""",160,"""YK2""","""KJA""","""KJA""",1.0,0.0,1.0,"""TLK""",160,"""216""","""KV""","""KV""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,370.0,16884.0,1,15,6,0,2,16,6,0,2,9,2,1,1,14,2,0,2
1,true,57323,123,"""S7/SU/UT""",36,true,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,2300.0,0.0,1.0,3500.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,51125.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1
2,true,57323,0,"""S7/SU/UT""",36,false,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,2300.0,0.0,1.0,3500.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,53695.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1
3,true,57323,123,"""S7/SU/UT""",36,true,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,81880.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1
4,true,57323,0,"""S7/SU/UT""",36,false,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,86070.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
18146427,true,54154,44,"""missing""",36,true,false,"""2024-11-06T09:45:00""","""2024-11-05T20:50:00""",655,"""738""","""SGC""","""SGC""",0.0,0.0,1.0,"""VKO""",200,"""247""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""SGC""",150,"""111""","""UT""","""UT""",5.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5560.0,30730.0,0,20,2,1,3,9,3,1,1,19,6,1,3,8,7,1,1
18146428,true,54154,44,"""missing""",36,true,false,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""",1060,"""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""",125,"""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""",90,"""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5560.0,27660.0,0,0,2,0,0,20,2,1,3,21,6,0,3,8,7,1,1
18146429,true,54154,44,"""missing""",36,true,false,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""",1060,"""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""",125,"""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""",90,"""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5460.0,24460.0,0,0,2,0,0,20,2,1,3,19,6,1,3,8,7,1,1
18146430,true,54154,44,"""missing""",36,true,false,"""2024-11-06T17:10:00""","""2024-11-05T19:30:00""",1180,"""738""","""KUF""","""KUF""",0.0,0.0,1.0,"""VKO""",105,"""357""","""UT""","""UT""",4.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""KUF""",125,"""282""","""UT""","""UT""",2.0,"""missing""","""missing""",…,0,"""missing""","""missing""","""missing""",0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5560.0,25360.0,0,19,2,1,3,17,3,1,2,21,6,0,3,8,7,1,1


In [10]:
# 時間欄位
time_cols = ["legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]

# 來回標記
round_trip_flag = (
    (
        (
            pl.col("legs1_departureAt").is_not_null() & (pl.col("legs1_departureAt") != "missing")
        )
        |
        (
            pl.col("legs1_arrivalAt").is_not_null() & (pl.col("legs1_arrivalAt") != "missing")
        )
    )
    .cast(pl.Int8)
    .alias("is_round_trip")
)



# 加入DataFrame
train_filled = train_filled.with_columns([round_trip_flag])


In [11]:
# 篩選單程資料
one_way_rows = train_filled.filter(pl.col("is_round_trip") == 0)

# 印出前幾筆看看
one_way_rows


Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,legs1_segments3_flightNumber,legs1_segments3_marketingCarrier_code,legs1_segments3_operatingCarrier_code,legs1_segments3_seatsAvailable,miniRules0_monetaryAmount,miniRules0_percentage,miniRules0_statusInfos,miniRules1_monetaryAmount,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_isAccessTP,pricingInfo_passengerCount,profileId,ranker_id,requestDate,searchRoute,sex,taxes,totalPrice,selected,legs0_departureAt_hour,legs0_departureAt_weekday,legs0_departureAt_business_time,legs0_departureAt_day_period,legs0_arrivalAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_business_time,legs0_arrivalAt_day_period,legs1_departureAt_hour,legs1_departureAt_weekday,legs1_departureAt_business_time,legs1_departureAt_day_period,legs1_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_business_time,legs1_arrivalAt_day_period,is_round_trip
i64,bool,i64,i64,str,i64,bool,bool,str,str,i64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,…,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,datetime[ns],str,bool,f64,f64,i64,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8
18145792,true,28626,0,"""missing""",36,false,false,"""2024-11-02T09:45:00""","""2024-11-02T09:10:00""",275,"""73H""","""MOW""","""SVO""",0.0,0.0,1.0,"""OVB""",275,"""6526""","""DP""","""DP""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3292775,"""8c387f33dc824a89ba7a6f2cb5908e…",2024-10-29 12:53:16,"""OVBMOW""",true,285.0,7384.0,0,9,6,1,1,9,6,1,1,-1,-1,-1,-1,-1,-1,-1,-1,0
18145793,true,28626,0,"""missing""",36,false,false,"""2024-11-02T09:45:00""","""2024-11-02T09:10:00""",275,"""73H""","""MOW""","""SVO""",1.0,0.0,1.0,"""OVB""",275,"""6526""","""DP""","""DP""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3292775,"""8c387f33dc824a89ba7a6f2cb5908e…",2024-10-29 12:53:16,"""OVBMOW""",true,285.0,9084.0,0,9,6,1,1,9,6,1,1,-1,-1,-1,-1,-1,-1,-1,-1,0
18145794,true,28626,0,"""missing""",36,false,false,"""2024-11-02T09:45:00""","""2024-11-02T09:10:00""",275,"""73H""","""MOW""","""SVO""",1.0,0.0,1.0,"""OVB""",275,"""6526""","""DP""","""DP""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3292775,"""8c387f33dc824a89ba7a6f2cb5908e…",2024-10-29 12:53:16,"""OVBMOW""",true,285.0,11934.0,0,9,6,1,1,9,6,1,1,-1,-1,-1,-1,-1,-1,-1,-1,0
18145795,true,28626,0,"""missing""",36,false,false,"""2024-11-02T20:30:00""","""2024-11-02T20:10:00""",260,"""738""","""MOW""","""VKO""",0.0,0.0,1.0,"""OVB""",260,"""543""","""6R""","""6R""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3292775,"""8c387f33dc824a89ba7a6f2cb5908e…",2024-10-29 12:53:16,"""OVBMOW""",true,185.0,10685.0,0,20,6,1,3,20,6,1,3,-1,-1,-1,-1,-1,-1,-1,-1,0
18145796,true,28626,0,"""missing""",36,false,false,"""2024-11-02T20:30:00""","""2024-11-02T20:10:00""",260,"""738""","""MOW""","""VKO""",1.0,0.0,1.0,"""OVB""",260,"""543""","""6R""","""6R""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3292775,"""8c387f33dc824a89ba7a6f2cb5908e…",2024-10-29 12:53:16,"""OVBMOW""",true,185.0,13685.0,0,20,6,1,3,20,6,1,3,-1,-1,-1,-1,-1,-1,-1,-1,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
25043143,false,57320,65,"""missing""",36,true,false,"""2025-01-08T12:50:00""","""2025-01-08T09:05:00""",165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1678""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,16486.0,0,9,3,1,1,12,3,0,2,-1,-1,-1,-1,-1,-1,-1,-1,0
25043144,false,57320,65,"""missing""",36,true,false,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,"""32A""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,2800.0,0.0,1.0,2800.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,11701.0,0,21,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0
25043145,false,57320,65,"""missing""",36,true,false,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,16486.0,0,21,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0
25043146,false,57320,65,"""missing""",36,true,false,"""2025-01-08T18:50:00""","""2025-01-08T15:10:00""",160,"""32B""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",160,"""1174""","""SU""","""SU""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,"""missing""","""missing""","""missing""",0.0,2800.0,0.0,1.0,2800.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",true,566.0,14431.0,0,15,3,0,2,18,3,1,3,-1,-1,-1,-1,-1,-1,-1,-1,0


In [12]:
# 把 legs0_departureAt 轉成 datetime
depart_dt = pl.col("legs0_departureAt").str.to_datetime(strict=False)

# requestDate 已經是 datetime
request_dt = pl.col("requestDate")

# 計算相減（Duration）
duration_ms = (depart_dt - request_dt).dt.total_milliseconds()

# 轉成天數
days_diff = (
    (duration_ms / (1000 * 60 * 60 * 24))
    .floor()
    .cast(pl.Int32)
    .fill_null(-1)
    .alias("days_before_departure")
)

# 加到 DataFrame
train_filled = train_filled.with_columns([days_diff])


In [13]:
train_filled

Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,legs1_segments3_seatsAvailable,miniRules0_monetaryAmount,miniRules0_percentage,miniRules0_statusInfos,miniRules1_monetaryAmount,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_isAccessTP,pricingInfo_passengerCount,profileId,ranker_id,requestDate,searchRoute,sex,taxes,totalPrice,selected,legs0_departureAt_hour,legs0_departureAt_weekday,legs0_departureAt_business_time,legs0_departureAt_day_period,legs0_arrivalAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_business_time,legs0_arrivalAt_day_period,legs1_departureAt_hour,legs1_departureAt_weekday,legs1_departureAt_business_time,legs1_departureAt_day_period,legs1_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_business_time,legs1_arrivalAt_day_period,is_round_trip,days_before_departure,legs0_segments_total_duration,legs1_segments_total_duration
i64,bool,i64,i64,str,i64,bool,bool,str,str,i64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,…,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,datetime[ns],str,bool,f64,f64,i64,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i32,i64,i64
0,true,57323,0,"""S7/SU/UT""",36,false,false,"""2024-06-15T16:20:00""","""2024-06-15T15:40:00""",160,"""YK2""","""KJA""","""KJA""",1.0,0.0,1.0,"""TLK""",160,"""216""","""KV""","""KV""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,370.0,16884.0,1,15,6,0,2,16,6,0,2,9,2,1,1,14,2,0,2,1,29,160,155
1,true,57323,123,"""S7/SU/UT""",36,true,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,2300.0,0.0,1.0,3500.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,51125.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1,1,29,250,245
2,true,57323,0,"""S7/SU/UT""",36,false,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,2300.0,0.0,1.0,3500.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,53695.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1,1,29,250,245
3,true,57323,123,"""S7/SU/UT""",36,true,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,81880.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1,1,29,250,245
4,true,57323,0,"""S7/SU/UT""",36,false,false,"""2024-06-15T14:50:00""","""2024-06-15T09:25:00""",445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,"""missing""","""missing""",…,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,2087645,"""98ce0dabf6964640b63079fbafd42c…",2024-05-17 03:03:08,"""TLKKJA/KJATLK""",true,2240.0,86070.0,0,9,6,1,1,14,6,0,2,22,2,0,3,8,3,1,1,1,29,250,245
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
18146427,true,54154,44,"""missing""",36,true,false,"""2024-11-06T09:45:00""","""2024-11-05T20:50:00""",655,"""738""","""SGC""","""SGC""",0.0,0.0,1.0,"""VKO""",200,"""247""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""SGC""",150,"""111""","""UT""","""UT""",5.0,"""missing""","""missing""",…,0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5560.0,30730.0,0,20,2,1,3,9,3,1,1,19,6,1,3,8,7,1,1,1,7,350,230
18146428,true,54154,44,"""missing""",36,true,false,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""",1060,"""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""",125,"""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""",90,"""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5560.0,27660.0,0,0,2,0,0,20,2,1,3,21,6,0,3,8,7,1,1,1,6,215,360
18146429,true,54154,44,"""missing""",36,true,false,"""2024-11-05T20:00:00""","""2024-11-05T00:20:00""",1060,"""738""","""UFA""","""UFA""",0.0,0.0,1.0,"""VKO""",125,"""363""","""UT""","""UT""",3.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""UFA""",90,"""106""","""UT""","""UT""",9.0,"""missing""","""missing""",…,0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5460.0,24460.0,0,0,2,0,0,20,2,1,3,19,6,1,3,8,7,1,1,1,6,215,230
18146430,true,54154,44,"""missing""",36,true,false,"""2024-11-06T17:10:00""","""2024-11-05T19:30:00""",1180,"""738""","""KUF""","""KUF""",0.0,0.0,1.0,"""VKO""",105,"""357""","""UT""","""UT""",4.0,"""AT7""","""SVX""","""SVX""",0.0,0.0,1.0,"""KUF""",125,"""282""","""UT""","""UT""",2.0,"""missing""","""missing""",…,0.0,24000.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3046852,"""88f8c53a28bf4f438941fd67338009…",2024-10-29 12:46:20,"""MOWSVX/SVXMOW""",true,5560.0,25360.0,0,19,2,1,3,17,3,1,2,21,6,0,3,8,7,1,1,1,7,230,360


# convert bool

In [13]:
# 先找出所有布林欄位
bool_cols = [c for c in train_filled.columns if train_filled[c].dtype == pl.Boolean]

print("✅ Boolean 欄位：")
print(bool_cols)

# 把所有布林欄位轉成 0/1
train_filled = train_filled.with_columns([
    pl.col(c).cast(pl.Int8) for c in bool_cols
])


✅ Boolean 欄位：
['bySelf', 'isAccess3D', 'isVip', 'sex']


# 加上是否直飛跟轉機幾次

In [14]:
# Legs0 segments1~3 有多少存在
legs0_segment_cols = [
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments3_departureFrom_airport_iata"
]

legs1_segment_cols = [
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments2_departureFrom_airport_iata",
    "legs1_segments3_departureFrom_airport_iata"
]

# legs0轉機次數
legs0_num_segments = (
    pl.sum_horizontal([
        ((pl.col(c).is_not_null()) & (pl.col(c) != "missing")).cast(pl.Int8)
        for c in legs0_segment_cols
    ])
    .alias("legs0_num_transfers")
)

# legs1轉機次數
legs1_num_segments = (
    pl.sum_horizontal([
        ((pl.col(c).is_not_null()) & (pl.col(c) != "missing")).cast(pl.Int8)
        for c in legs1_segment_cols
    ])
    .alias("legs1_num_transfers")
)

# 寫入轉機次數
train_filled = train_filled.with_columns([
    legs0_num_segments,
    legs1_num_segments
])

# 是否直飛
train_filled = train_filled.with_columns([
    (pl.col("legs0_num_transfers") == 0).cast(pl.Int8).alias("legs0_is_direct"),
    (pl.col("legs1_num_transfers") == 0).cast(pl.Int8).alias("legs1_is_direct")
])

# 全程直飛
train_filled = train_filled.with_columns([
    (
        (pl.col("legs0_is_direct") == 1) & (pl.col("legs1_is_direct") == 1)
    ).cast(pl.Int8).alias("both_legs_direct")
])


In [11]:
train_filled

Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,miniRules0_statusInfos,miniRules1_monetaryAmount,miniRules1_percentage,miniRules1_statusInfos,pricingInfo_isAccessTP,pricingInfo_passengerCount,profileId,ranker_id,requestDate,searchRoute,sex,taxes,totalPrice,selected,legs0_departureAt_hour,legs0_departureAt_weekday,legs0_departureAt_business_time,legs0_departureAt_day_period,legs0_arrivalAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_business_time,legs0_arrivalAt_day_period,legs1_departureAt_hour,legs1_departureAt_weekday,legs1_departureAt_business_time,legs1_departureAt_day_period,legs1_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_business_time,legs1_arrivalAt_day_period,is_round_trip,days_before_departure,legs0_num_transfers,legs1_num_transfers,legs0_is_direct,legs1_is_direct,both_legs_direct
i64,i8,i64,i64,str,i64,i8,i8,str,str,i64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,…,f64,f64,f64,f64,f64,i64,i64,str,datetime[ns],str,i8,f64,f64,i64,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i32,i8,i8,i8,i8,i8
18144679,1,62840,0,"""missing""",36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,"""32A""","""SVX""","""SVX""",0.0,0.0,1.0,"""SVO""",150,"""1410""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,0.0,0.0,0.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",0,1018.0,9818.0,0,6,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1
18144680,1,62840,0,"""missing""",36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,"""32A""","""SVX""","""SVX""",1.0,0.0,1.0,"""SVO""",150,"""1410""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,2800.0,0.0,1.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",0,1018.0,14018.0,0,6,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1
18144681,1,62840,0,"""missing""",36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,"""32A""","""SVX""","""SVX""",2.0,0.0,1.0,"""SVO""",150,"""1410""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,0.0,0.0,1.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",0,1018.0,22418.0,0,6,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1
18144682,1,62840,0,"""missing""",36,0,0,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,"""320""","""SVX""","""SVX""",0.0,0.0,1.0,"""DME""",140,"""273""","""U6""","""U6""",7.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,0.0,0.0,0.0,0.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",0,3284.0,12974.0,0,8,4,1,1,12,4,0,2,12,6,0,2,12,6,0,2,1,50,0,0,1,1,1
18144683,1,62840,0,"""missing""",36,0,0,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,"""320""","""SVX""","""SVX""",1.0,0.0,1.0,"""DME""",140,"""273""","""U6""","""U6""",7.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,1500.0,0.0,1.0,1.0,1,3604015,"""c9373e5f772e43d593dd6ad2fa90f6…",2024-10-29 12:50:42,"""MOWSVX/SVXMOW""",0,3284.0,16974.0,0,8,4,1,1,12,4,0,2,12,6,0,2,12,6,0,2,1,50,0,0,1,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
25043143,0,57320,65,"""missing""",36,1,0,"""2025-01-08T12:50:00""","""2025-01-08T09:05:00""",165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1678""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,0.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",1,566.0,16486.0,0,9,3,1,1,12,3,0,2,-1,-1,-1,-1,-1,-1,-1,-1,0,7,0,0,1,1,1
25043144,0,57320,65,"""missing""",36,1,0,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,"""32A""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,2800.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",1,566.0,11701.0,0,21,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0,8,0,0,1,1,1
25043145,0,57320,65,"""missing""",36,1,0,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,0.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",1,566.0,16486.0,0,21,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0,8,0,0,1,1,1
25043146,0,57320,65,"""missing""",36,1,0,"""2025-01-08T18:50:00""","""2025-01-08T15:10:00""",160,"""32B""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",160,"""1174""","""SU""","""SU""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,"""missing""","""missing""",…,1.0,2800.0,0.0,1.0,1.0,1,3647580,"""c5622e0de0594bde95a4dd8c1fcff7…",2024-12-31 18:54:00,"""MOWASF""",1,566.0,14431.0,0,15,3,0,2,18,3,1,3,-1,-1,-1,-1,-1,-1,-1,-1,0,7,0,0,1,1,1


In [18]:
# 先找出所有布林欄位
string_cols = [c for c in train_filled.columns if train_filled[c].dtype == pl.String]

print("✅ String 欄位：")
print(string_cols)


✅ String 欄位：
['frequentFlyer', 'legs0_arrivalAt', 'legs0_departureAt', 'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_flightNumber', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata', 'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata', 'legs0_segments1_flightNumber', 'legs0_segments1_marketingCarrier_code', 'legs0_segments1_operatingCarrier_code', 'legs0_segments2_aircraft_code', 'legs0_segments2_arrivalTo_airport_city_iata', 'legs0_segments2_arrivalTo_airport_iata', 'legs0_segments2_departureFrom_airport_iata', 'legs0_segments2_flightNumber', 'legs0_segments2_marketingCarrier_code', 'legs0_segments2_operatingCarrier_code', 'legs0_segments3_aircraft_code', 'legs0_segments3_arrivalTo_airport_city_iata', 'le

In [19]:
train_filled[string_cols]

frequentFlyer,legs0_arrivalAt,legs0_departureAt,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_departureFrom_airport_iata,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_departureFrom_airport_iata,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,legs0_segments2_arrivalTo_airport_iata,legs0_segments2_departureFrom_airport_iata,legs0_segments2_flightNumber,legs0_segments2_marketingCarrier_code,legs0_segments2_operatingCarrier_code,legs0_segments3_aircraft_code,legs0_segments3_arrivalTo_airport_city_iata,legs0_segments3_arrivalTo_airport_iata,legs0_segments3_departureFrom_airport_iata,legs0_segments3_flightNumber,legs0_segments3_marketingCarrier_code,legs0_segments3_operatingCarrier_code,legs1_arrivalAt,legs1_departureAt,legs1_segments0_aircraft_code,legs1_segments0_arrivalTo_airport_city_iata,legs1_segments0_arrivalTo_airport_iata,legs1_segments0_departureFrom_airport_iata,legs1_segments0_flightNumber,legs1_segments0_marketingCarrier_code,legs1_segments0_operatingCarrier_code,legs1_segments1_aircraft_code,legs1_segments1_arrivalTo_airport_city_iata,legs1_segments1_arrivalTo_airport_iata,legs1_segments1_departureFrom_airport_iata,legs1_segments1_flightNumber,legs1_segments1_marketingCarrier_code,legs1_segments1_operatingCarrier_code,legs1_segments2_aircraft_code,legs1_segments2_arrivalTo_airport_city_iata,legs1_segments2_arrivalTo_airport_iata,legs1_segments2_departureFrom_airport_iata,legs1_segments2_flightNumber,legs1_segments2_marketingCarrier_code,legs1_segments2_operatingCarrier_code,legs1_segments3_aircraft_code,legs1_segments3_arrivalTo_airport_city_iata,legs1_segments3_arrivalTo_airport_iata,legs1_segments3_departureFrom_airport_iata,legs1_segments3_flightNumber,legs1_segments3_marketingCarrier_code,legs1_segments3_operatingCarrier_code,ranker_id,searchRoute
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""S7/SU/UT""","""2024-06-15T16:20:00""","""2024-06-15T15:40:00""","""YK2""","""KJA""","""KJA""","""TLK""","""216""","""KV""","""KV""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-07-09T14:20:00""","""2024-07-09T09:45:00""","""YK2""","""TLK""","""TLK""","""KJA""","""215""","""KV""","""KV""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""98ce0dabf6964640b63079fbafd42c…","""TLKKJA/KJATLK"""
"""S7/SU/UT""","""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""E70""","""OVB""","""OVB""","""TLK""","""5358""","""S7""","""S7""","""E70""","""KJA""","""KJA""","""OVB""","""5311""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-07-10T08:30:00""","""2024-07-09T22:05:00""","""E70""","""OVB""","""OVB""","""KJA""","""5338""","""S7""","""S7""","""E70""","""TLK""","""TLK""","""OVB""","""5357""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""98ce0dabf6964640b63079fbafd42c…","""TLKKJA/KJATLK"""
"""S7/SU/UT""","""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""E70""","""OVB""","""OVB""","""TLK""","""5358""","""S7""","""S7""","""E70""","""KJA""","""KJA""","""OVB""","""5311""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-07-10T08:30:00""","""2024-07-09T22:05:00""","""E70""","""OVB""","""OVB""","""KJA""","""5338""","""S7""","""S7""","""E70""","""TLK""","""TLK""","""OVB""","""5357""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""98ce0dabf6964640b63079fbafd42c…","""TLKKJA/KJATLK"""
"""S7/SU/UT""","""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""E70""","""OVB""","""OVB""","""TLK""","""5358""","""S7""","""S7""","""E70""","""KJA""","""KJA""","""OVB""","""5311""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-07-10T08:30:00""","""2024-07-09T22:05:00""","""E70""","""OVB""","""OVB""","""KJA""","""5338""","""S7""","""S7""","""E70""","""TLK""","""TLK""","""OVB""","""5357""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""98ce0dabf6964640b63079fbafd42c…","""TLKKJA/KJATLK"""
"""S7/SU/UT""","""2024-06-15T14:50:00""","""2024-06-15T09:25:00""","""E70""","""OVB""","""OVB""","""TLK""","""5358""","""S7""","""S7""","""E70""","""KJA""","""KJA""","""OVB""","""5311""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-07-10T08:30:00""","""2024-07-09T22:05:00""","""E70""","""OVB""","""OVB""","""KJA""","""5338""","""S7""","""S7""","""E70""","""TLK""","""TLK""","""OVB""","""5357""","""S7""","""S7""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""98ce0dabf6964640b63079fbafd42c…","""TLKKJA/KJATLK"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""missing""","""2024-11-06T09:45:00""","""2024-11-05T20:50:00""","""738""","""SGC""","""SGC""","""VKO""","""247""","""UT""","""UT""","""AT7""","""SVX""","""SVX""","""SGC""","""111""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-11-10T08:10:00""","""2024-11-09T19:25:00""","""AT7""","""UFA""","""UFA""","""SVX""","""1105""","""UT""","""UT""","""738""","""MOW""","""VKO""","""UFA""","""364""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""88f8c53a28bf4f438941fd67338009…","""MOWSVX/SVXMOW"""
"""missing""","""2024-11-05T20:00:00""","""2024-11-05T00:20:00""","""738""","""UFA""","""UFA""","""VKO""","""363""","""UT""","""UT""","""AT7""","""SVX""","""SVX""","""UFA""","""106""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-11-10T08:35:00""","""2024-11-09T21:10:00""","""AT7""","""SGC""","""SGC""","""SVX""","""112""","""UT""","""UT""","""738""","""MOW""","""VKO""","""SGC""","""248""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""88f8c53a28bf4f438941fd67338009…","""MOWSVX/SVXMOW"""
"""missing""","""2024-11-05T20:00:00""","""2024-11-05T00:20:00""","""738""","""UFA""","""UFA""","""VKO""","""363""","""UT""","""UT""","""AT7""","""SVX""","""SVX""","""UFA""","""106""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-11-10T08:10:00""","""2024-11-09T19:25:00""","""AT7""","""UFA""","""UFA""","""SVX""","""1105""","""UT""","""UT""","""738""","""MOW""","""VKO""","""UFA""","""364""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""88f8c53a28bf4f438941fd67338009…","""MOWSVX/SVXMOW"""
"""missing""","""2024-11-06T17:10:00""","""2024-11-05T19:30:00""","""738""","""KUF""","""KUF""","""VKO""","""357""","""UT""","""UT""","""AT7""","""SVX""","""SVX""","""KUF""","""282""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""2024-11-10T08:35:00""","""2024-11-09T21:10:00""","""AT7""","""SGC""","""SGC""","""SVX""","""112""","""UT""","""UT""","""738""","""MOW""","""VKO""","""SGC""","""248""","""UT""","""UT""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""missing""","""88f8c53a28bf4f438941fd67338009…","""MOWSVX/SVXMOW"""


In [19]:
import polars as pl
import pickle

# 假設 test 已經存在

# 載入 transform_config
with open("transform_config_rank.pkl", "rb") as f:
    transform_config = pickle.load(f)

label_encoders = transform_config["label_encoders"]
aircraft_cols = transform_config["aircraft_cols"]


# frequentFlyer 衍生特徵
ff_has = (
    (pl.col("frequentFlyer") != "")
    & (pl.col("frequentFlyer") != "missing")
).cast(pl.Int8).alias("has_frequentFlyer")

ff_count = (
    pl.col("frequentFlyer").map_elements(
        lambda s: 0 if s in ("", "missing") else s.count("/") + 1,
        return_dtype=pl.Int32
    )
    .alias("n_ff_programs")
)

# 先加上 frequentFlyer 衍生欄
train_filled = train_filled.with_columns([ff_has, ff_count])

# 再做 Label Encoding
for col in label_encoders:
    enc = label_encoders[col]
    mapping_df = pl.DataFrame({
        col: enc[col],
        f"{col}_rank": enc["rank_id"]
    })

    # 確保型別
    train_filled = train_filled.with_columns(
        pl.col(col).cast(pl.Utf8)
    )

    train_filled = train_filled.join(
        mapping_df,
        on=col,
        how="left"
    )

    train_filled = train_filled.with_columns(
        pl.col(f"{col}_rank").fill_null(-1).cast(pl.Int32).alias(col)
    ).drop(f"{col}_rank")

# Aircraft 是否缺失
aircraft_missing_exprs = [
    pl.col(c).is_null().cast(pl.Int8).alias(c + "_is_missing")
    for c in aircraft_cols
]

# 加上 aircraft 缺失
train_filled = train_filled.with_columns(aircraft_missing_exprs)

print("✅ 已完成 test 的轉換（先處理frequentFlyer再做Label Encoding）")


✅ 已完成 test 的轉換（先處理frequentFlyer再做Label Encoding）


In [20]:
train_filled

Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,legs0_departureAt_weekday,legs0_departureAt_business_time,legs0_departureAt_day_period,legs0_arrivalAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_business_time,legs0_arrivalAt_day_period,legs1_departureAt_hour,legs1_departureAt_weekday,legs1_departureAt_business_time,legs1_departureAt_day_period,legs1_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_business_time,legs1_arrivalAt_day_period,is_round_trip,days_before_departure,legs0_num_transfers,legs1_num_transfers,legs0_is_direct,legs1_is_direct,both_legs_direct,legs0_main_carrier,legs1_main_carrier,legs0_all_segments_carrier_same,legs1_all_segments_carrier_same,both_legs_carrier_all_same,has_frequentFlyer,n_ff_programs,legs0_segments0_aircraft_code_is_missing,legs0_segments1_aircraft_code_is_missing,legs0_segments2_aircraft_code_is_missing,legs0_segments3_aircraft_code_is_missing,legs1_segments0_aircraft_code_is_missing,legs1_segments1_aircraft_code_is_missing,legs1_segments2_aircraft_code_is_missing,legs1_segments3_aircraft_code_is_missing
i64,i8,i64,i64,i32,i64,i8,i8,str,str,i64,i32,i32,i32,f64,f64,f64,i32,i64,i32,i32,i32,f64,i32,i32,i32,f64,f64,f64,i32,i64,i32,i32,i32,f64,i32,i32,…,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i32,i8,i8,i8,i8,i8,str,str,i8,i8,i8,i8,i32,i8,i8,i8,i8,i8,i8,i8,i8
18144679,1,62840,0,371,36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,12,399,424,0.0,0.0,1.0,303,150,441,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1,"""SU""","""SU""",1,1,1,0,0,0,0,0,0,0,0,0,0
18144680,1,62840,0,371,36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,12,399,424,1.0,0.0,1.0,303,150,441,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1,"""SU""","""SU""",1,1,1,0,0,0,0,0,0,0,0,0,0
18144681,1,62840,0,371,36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,12,399,424,2.0,0.0,1.0,303,150,441,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1,"""SU""","""SU""",1,1,1,0,0,0,0,0,0,0,0,0,0
18144682,1,62840,0,371,36,0,0,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,9,399,424,0.0,0.0,1.0,78,140,1743,137,179,7.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,12,4,0,2,12,6,0,2,12,6,0,2,1,50,0,0,1,1,1,"""U6""","""U6""",1,1,1,0,0,0,0,0,0,0,0,0,0
18144683,1,62840,0,371,36,0,0,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,9,399,424,1.0,0.0,1.0,78,140,1743,137,179,7.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,12,4,0,2,12,6,0,2,12,6,0,2,1,50,0,0,1,1,1,"""U6""","""U6""",1,1,1,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
25043143,0,57320,65,371,36,1,0,"""2025-01-08T12:50:00""","""2025-01-08T09:05:00""",165,12,15,16,2.0,0.0,1.0,303,165,725,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,1,1,12,3,0,2,-1,-1,-1,-1,-1,-1,-1,-1,0,7,0,0,1,1,1,"""SU""","""missing""",1,1,0,0,0,0,0,0,0,0,0,0,0
25043144,0,57320,65,371,36,1,0,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,12,15,16,1.0,0.0,1.0,303,165,183,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0,8,0,0,1,1,1,"""SU""","""missing""",1,1,0,0,0,0,0,0,0,0,0,0,0
25043145,0,57320,65,371,36,1,0,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,12,15,16,2.0,0.0,1.0,303,165,183,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0,8,0,0,1,1,1,"""SU""","""missing""",1,1,0,0,0,0,0,0,0,0,0,0,0
25043146,0,57320,65,371,36,1,0,"""2025-01-08T18:50:00""","""2025-01-08T15:10:00""",160,13,15,16,1.0,0.0,1.0,303,160,185,127,167,9.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,0,2,18,3,1,3,-1,-1,-1,-1,-1,-1,-1,-1,0,7,0,0,1,1,1,"""SU""","""missing""",1,1,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# legs0所有marketingCarrier
legs0_marketing_cols = [
    f"legs0_segments{s}_marketingCarrier_code"
    for s in range(4) if f"legs0_segments{s}_marketingCarrier_code" in train_filled.columns
]
legs1_marketing_cols = [
    f"legs1_segments{s}_marketingCarrier_code"
    for s in range(4) if f"legs1_segments{s}_marketingCarrier_code" in train_filled.columns
]

# legs0 主carrier (取第一個不為null)
legs0_main_carrier = (
    pl.coalesce([pl.col(c) for c in legs0_marketing_cols])
    .alias("legs0_main_carrier")
)

# legs1 主carrier
legs1_main_carrier = (
    pl.coalesce([pl.col(c) for c in legs1_marketing_cols])
    .alias("legs1_main_carrier")
)

train_filled = train_filled.with_columns([
    legs0_main_carrier,
    legs1_main_carrier
])

# legs0是否一致
legs0_all_same = (
    pl.when(pl.col("legs0_num_transfers") == 0)
    .then(1)
    .otherwise(
        pl.all_horizontal([
            (pl.col(c) == pl.col("legs0_main_carrier")) & pl.col(c).is_not_null()
            for c in legs0_marketing_cols
        ]).cast(pl.Int8)
    )
    .alias("legs0_all_segments_carrier_same")
)

# legs1是否一致
legs1_all_same = (
    pl.when(pl.col("legs1_num_transfers") == 0)
    .then(1)
    .otherwise(
        pl.all_horizontal([
            (pl.col(c) == pl.col("legs1_main_carrier")) & pl.col(c).is_not_null()
            for c in legs1_marketing_cols
        ]).cast(pl.Int8)
    )
    .alias("legs1_all_segments_carrier_same")
)

# 加入腿一致性
train_filled = train_filled.with_columns([
    legs0_all_same,
    legs1_all_same
])

# 最後判斷兩腿是否都是同一家
both_legs_all_same = (
    (
        (pl.col("legs0_all_segments_carrier_same") == 1) &
        (pl.col("legs1_all_segments_carrier_same") == 1) &
        (pl.col("legs0_main_carrier") == pl.col("legs1_main_carrier")) &
        pl.col("legs0_main_carrier").is_not_null() &
        pl.col("legs1_main_carrier").is_not_null()
    ).cast(pl.Int8)
    .alias("both_legs_carrier_all_same")
)

train_filled = train_filled.with_columns([
    both_legs_all_same
])

print("✅ 完成 legs0/legs1 主Carrier一致判斷！")


✅ 完成 legs0/legs1 主Carrier一致判斷！


In [22]:
train_filled

Id,bySelf,companyID,corporateTariffCode,frequentFlyer,nationality,isAccess3D,isVip,legs0_arrivalAt,legs0_departureAt,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs0_segments2_aircraft_code,legs0_segments2_arrivalTo_airport_city_iata,…,legs0_departureAt_weekday,legs0_departureAt_business_time,legs0_departureAt_day_period,legs0_arrivalAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_business_time,legs0_arrivalAt_day_period,legs1_departureAt_hour,legs1_departureAt_weekday,legs1_departureAt_business_time,legs1_departureAt_day_period,legs1_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_business_time,legs1_arrivalAt_day_period,is_round_trip,days_before_departure,legs0_num_transfers,legs1_num_transfers,legs0_is_direct,legs1_is_direct,both_legs_direct,legs0_main_carrier,legs1_main_carrier,legs0_all_segments_carrier_same,legs1_all_segments_carrier_same,both_legs_carrier_all_same,has_frequentFlyer,n_ff_programs,legs0_segments0_aircraft_code_is_missing,legs0_segments1_aircraft_code_is_missing,legs0_segments2_aircraft_code_is_missing,legs0_segments3_aircraft_code_is_missing,legs1_segments0_aircraft_code_is_missing,legs1_segments1_aircraft_code_is_missing,legs1_segments2_aircraft_code_is_missing,legs1_segments3_aircraft_code_is_missing
i64,i8,i64,i64,i32,i64,i8,i8,str,str,i64,i32,i32,i32,f64,f64,f64,i32,i64,i32,i32,i32,f64,i32,i32,i32,f64,f64,f64,i32,i64,i32,i32,i32,f64,i32,i32,…,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i8,i32,i32,i8,i32,i8,i8,i8,i8,i8,i32,i32,i8,i8,i8,i8,i32,i8,i8,i8,i8,i8,i8,i8,i8
18144679,1,62840,0,371,36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,12,399,424,0.0,0.0,1.0,303,150,441,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1,127,114,1,1,0,0,0,0,0,0,0,0,0,0,0
18144680,1,62840,0,371,36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,12,399,424,1.0,0.0,1.0,303,150,441,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1,127,114,1,1,0,0,0,0,0,0,0,0,0,0,0
18144681,1,62840,0,371,36,0,0,"""2024-12-19T11:20:00""","""2024-12-19T06:50:00""",150,12,399,424,2.0,0.0,1.0,303,150,441,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,11,4,0,1,21,6,0,3,21,6,0,3,1,50,0,0,1,1,1,127,114,1,1,0,0,0,0,0,0,0,0,0,0,0
18144682,1,62840,0,371,36,0,0,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,9,399,424,0.0,0.0,1.0,78,140,1743,137,179,7.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,12,4,0,2,12,6,0,2,12,6,0,2,1,50,0,0,1,1,1,137,121,1,1,0,0,0,0,0,0,0,0,0,0,0
18144683,1,62840,0,371,36,0,0,"""2024-12-19T12:45:00""","""2024-12-19T08:25:00""",140,9,399,424,1.0,0.0,1.0,78,140,1743,137,179,7.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,4,1,1,12,4,0,2,12,6,0,2,12,6,0,2,1,50,0,0,1,1,1,137,121,1,1,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
25043143,0,57320,65,371,36,1,0,"""2025-01-08T12:50:00""","""2025-01-08T09:05:00""",165,12,15,16,2.0,0.0,1.0,303,165,725,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,1,1,12,3,0,2,-1,-1,-1,-1,-1,-1,-1,-1,0,7,0,0,1,1,1,127,140,1,1,0,0,0,0,0,0,0,0,0,0,0
25043144,0,57320,65,371,36,1,0,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,12,15,16,1.0,0.0,1.0,303,165,183,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0,8,0,0,1,1,1,127,140,1,1,0,0,0,0,0,0,0,0,0,0,0
25043145,0,57320,65,371,36,1,0,"""2025-01-09T01:10:00""","""2025-01-08T21:25:00""",165,12,15,16,2.0,0.0,1.0,303,165,183,127,167,4.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,0,3,1,4,0,0,-1,-1,-1,-1,-1,-1,-1,-1,0,8,0,0,1,1,1,127,140,1,1,0,0,0,0,0,0,0,0,0,0,0
25043146,0,57320,65,371,36,1,0,"""2025-01-08T18:50:00""","""2025-01-08T15:10:00""",160,13,15,16,1.0,0.0,1.0,303,160,185,127,167,9.0,102,424,475,0.0,0.0,0.0,453,0,6576,149,200,0.0,81,242,…,3,0,2,18,3,1,3,-1,-1,-1,-1,-1,-1,-1,-1,0,7,0,0,1,1,1,127,140,1,1,0,0,0,0,0,0,0,0,0,0,0


# remove time

In [23]:
# 先找出所有布林欄位
string_cols = [c for c in train_filled.columns if train_filled[c].dtype == pl.String]

print("✅ String 欄位：")
print(string_cols)


✅ String 欄位：
['legs0_arrivalAt', 'legs0_departureAt', 'legs1_arrivalAt', 'legs1_departureAt', 'ranker_id']


In [21]:
# 要刪除的時間欄位
drop_cols = [
    "legs0_arrivalAt",
    "legs0_departureAt",
    "legs1_arrivalAt",
    "legs1_departureAt"
]

# 刪除
train_filled = train_filled.drop(drop_cols)

print("✅ 已刪除時間欄位")


✅ 已刪除時間欄位


In [24]:
# 儲存成 Parquet 格式
train_filled.write_parquet("data/test_filled.parquet")

print("✅ 已將 test_filled 儲存為 data/test_filled.parquet")


✅ 已將 test_filled 儲存為 data/test_filled.parquet


In [4]:
# 先讀 parquet
test = pl.read_parquet('data/test_filled.parquet')
train = pl.read_parquet('data/train_filled.parquet')  # 這裡你可能打錯了，train 路徑

# 取欄位名稱
train_cols = set(train.columns)
test_cols = set(test.columns)

# 列印比較
print("✅ Train columns:", len(train_cols))
print("✅ Test columns:", len(test_cols))

print("\n🎯 Train 中有但 Test 沒有的欄位:")
print(sorted(train_cols - test_cols))

print("\n🎯 Test 中有但 Train 沒有的欄位:")
print(sorted(test_cols - train_cols))

# 如果完全一樣
if train_cols == test_cols:
    print("\n✅ Train 和 Test 欄位完全一致！")


✅ Train columns: 164
✅ Test columns: 164

🎯 Train 中有但 Test 沒有的欄位:
[]

🎯 Test 中有但 Train 沒有的欄位:
[]

✅ Train 和 Test 欄位完全一致！
