In [1]:
import polars as pl
# Read the train/test tables from their parquet files.
TRAIN_PARQUET = 'data/recall/top30/data/train_all_data.parquet'
TEST_PARQUET = 'data/recall/top30/data/test_all_data.parquet'

train_sample = pl.read_parquet(TRAIN_PARQUET)
test_sample = pl.read_parquet(TEST_PARQUET)


In [6]:
# Columns that must NOT be z-scored.
exclude_cols = ["ranker_id", "companyID", "Id", "selected"]

# Columns to transform: everything in each frame that is not excluded.
# NOTE: no dtype check happens here; every non-excluded column is treated
# as numeric by the z-score step that consumes these lists.
train_numeric_cols, test_numeric_cols = (
    [name for name in frame.columns if name not in exclude_cols]
    for frame in (train_sample, test_sample)
)

print("Train numeric columns:", train_numeric_cols)
print("Test numeric columns:", test_numeric_cols)

# Fail-safe group-wise z-score
def zscore_expr(colname: str, group_col: str = "ranker_id") -> "pl.Expr":
    """Build an expression that z-scores ``colname`` within each group.

    The result is ``(value - group_mean) / group_std``, computed as window
    expressions over ``group_col`` and aliased back to ``colname`` so that it
    replaces the original column when passed to ``with_columns``.

    Degenerate groups are divided by 1.0 instead of their std, so their
    z-score is just the (zero or near-zero) deviation from the group mean:

    * std == 0    -- every value in the group is identical;
    * std is null -- the sample std (polars default ``ddof=1``) is undefined,
      e.g. a group that has a single row.

    Args:
        colname: Name of the column to standardize.
        group_col: Column whose values define the groups. Defaults to
            ``"ranker_id"``.

    Returns:
        A polars expression named ``colname``.
    """
    col = pl.col(colname)
    mean = col.mean().over(group_col)
    std = col.std().over(group_col)
    # `null == 0` evaluates to null, and a null predicate in when/then falls
    # through to `otherwise`, so the null case has to be tested explicitly.
    # Without it, a one-row group would get null for every feature.
    degenerate = std.is_null() | (std == 0)
    safe_std = pl.when(degenerate).then(1.0).otherwise(std)
    return ((col - mean) / safe_std).alias(colname)

# Build one z-score expression per selected column, then apply them all in a
# single with_columns call per frame.
train_z_exprs = [zscore_expr(name) for name in train_numeric_cols]
train_sample_z = train_sample.with_columns(train_z_exprs)

test_z_exprs = [zscore_expr(name) for name in test_numeric_cols]
test_sample_z = test_sample.with_columns(test_z_exprs)

# Print the shapes of the standardized frames.
print("✅ Train 標準化後 shape:", train_sample_z.shape)
print("✅ Test 標準化後 shape:", test_sample_z.shape)


Train numeric columns: ['total_num_transfers_rank', 'legs0_segments0_flightNumber', 'price_per_duration_rank', 'price_percentile', 'price_from_median_zscore', 'price_per_duration', 'legs0_segments0_baggageAllowance_quantity', 'price_minus_fee_rank', 'legs0_arrivalAt_hour', 'pricingInfo_isAccessTP', 'legs0_departureAt_hour', 'both_legs_carrier_all_same', 'total_weighted_mean_cabin', 'price_per_fee_rank', 'totalPrice_rank', 'days_before_departure', 'legs0_segments0_seatsAvailable', 'price_per_fee', 'price_per_tax', 'legs1_main_carrier', 'isVip', 'leg0_view_diff_mean', 'legs1_departureAt_hour', 'baggage_total', 'duration_ratio', 'total_fees', 'all_view_diff_mean', 'legs1_arrivalAt_hour', 'log_price', 'legs1_segments0_flightNumber', 'legs1_weighted_mean_cabin', 'companyID_loo_mean_legs0_departureAt_hour', 'legs0_segments0_cabinClass', 'price_minus_fee', 'legs0_main_carrier', 'companyID_loo_mean_legs0_arrivalAt_hour', 'baggage_total_rank', 'legs0_max_duration_cabin', 'legs0_segments0_key_vi

In [7]:
# Display the training frame as loaded from parquet (before z-scoring).
train_sample

total_num_transfers_rank,legs0_segments0_flightNumber,price_per_duration_rank,price_percentile,price_from_median_zscore,price_per_duration,legs0_segments0_baggageAllowance_quantity,price_minus_fee_rank,legs0_arrivalAt_hour,pricingInfo_isAccessTP,legs0_departureAt_hour,both_legs_carrier_all_same,total_weighted_mean_cabin,price_per_fee_rank,totalPrice_rank,days_before_departure,legs0_segments0_seatsAvailable,price_per_fee,price_per_tax,legs1_main_carrier,isVip,leg0_view_diff_mean,legs1_departureAt_hour,baggage_total,duration_ratio,total_fees,all_view_diff_mean,legs1_arrivalAt_hour,log_price,legs1_segments0_flightNumber,legs1_weighted_mean_cabin,companyID_loo_mean_legs0_departureAt_hour,legs0_segments0_cabinClass,price_minus_fee,legs0_main_carrier,companyID_loo_mean_legs0_arrivalAt_hour,baggage_total_rank,…,legs1_segments2_departureFrom_airport_iata,legs1_segments2_duration_rank,legs1_segments2_key_view_count_rank,legs1_num_transfers_rank,legs1_segments2_duration,legs1_segments1_operatingCarrier_code_in_ff,legs0_segments2_flightNumber,legs0_segments2_aircraft_code,legs1_segments1_marketingCarrier_code_in_ff,legs0_segments2_seatsAvailable,legs1_segments2_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs1_segments2_cabinClass,legs1_segments2_operatingCarrier_code_in_ff,legs0_segments2_key_view_count_rank,is_cheapest,legs0_segments2_marketingCarrier_code,legs0_segments2_operatingCarrier_code_in_ff,legs0_segments2_duration,legs1_segments2_arrivalTo_airport_iata,legs1_segments2_aircraft_code,legs0_segments2_duration_rank,legs1_segments2_seatsAvailable,legs0_segments2_baggageAllowance_quantity,legs0_segments2_arrivalTo_airport_iata,legs0_segments2_arrivalTo_airport_city_iata,legs1_segments2_flightNumber,legs0_segments2_departureFrom_airport_iata,legs0_segments2_key_view_count,legs0_segments2_operatingCarrier_code,legs1_segments2_marketingCarrier_code,legs1_segments2_key_view_count,legs0_segments2_marketingCarrier_code_in_ff,ranker_id,compa
nyID,Id,selected
u32,i32,u32,f64,f64,f64,f64,i32,i8,f64,i8,i8,f64,i32,u32,i32,f64,f64,f64,i32,i8,f64,i8,f64,f64,f64,f64,i8,f64,i32,f64,f64,i64,f64,i32,f64,i32,…,i32,i32,u32,u32,i64,i8,i32,i32,i8,f64,f64,f64,i64,i8,u32,i8,i32,i8,i64,i32,i32,i32,f64,f64,i32,i32,i32,i32,u32,i32,i32,u32,i8,str,i64,i64,i64
1,1274,9,0.04,-1.947024,53.43038,1.0,5,16,1.0,15,1,1.0,3,1,29,9.0,16884.0,45.509434,130,0,-71.04,9,2.0,1.025641,0.0,-22.08,14,9.734181,1263,1.0,12.454769,1,16884.0,130,12.44708,1,…,678,1,1,1,0,0,8790,118,0,0.0,0.0,0.0,0,0,1,1,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""98ce0dabf6964640b63079fbafd42c…",57323,0,1
2,4360,10,0.18,-0.135934,53.759201,1.0,4,14,1.0,9,0,1.0,5,2,29,4.0,8.813136,22.813476,177,0,2.96,22,2.0,0.879447,5800.0,0.92,8,10.842048,4338,1.0,12.454769,1,45325.0,177,12.44708,1,…,678,1,1,2,0,1,8790,118,1,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""98ce0dabf6964640b63079fbafd42c…",57323,1,0
2,4360,11,0.42,0.0,56.461619,1.0,3,14,1.0,9,0,1.0,4,3,29,4.0,9.256163,23.960286,177,0,2.96,22,2.0,0.879447,5800.0,0.92,8,10.891094,4338,1.0,12.454769,1,47895.0,177,12.44708,1,…,678,1,1,2,0,1,8790,118,1,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""98ce0dabf6964640b63079fbafd42c…",57323,2,0
2,4360,20,0.66,1.490774,86.098843,1.0,2,14,1.0,9,0,1.0,2,4,29,4.0,81880.0,36.53726,177,0,2.96,22,2.0,0.879447,0.0,0.92,8,11.313022,4338,1.0,12.454769,1,81880.0,177,12.44708,1,…,678,1,1,2,0,1,8790,118,1,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""98ce0dabf6964640b63079fbafd42c…",57323,3,0
2,4360,21,0.9,1.712393,90.504732,1.0,1,14,1.0,9,0,1.0,1,5,29,4.0,86070.0,38.406961,177,0,2.96,22,2.0,0.879447,0.0,0.92,8,11.362928,4338,1.0,12.454769,1,86070.0,177,12.44708,1,…,678,1,1,2,0,1,8790,118,1,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""98ce0dabf6964640b63079fbafd42c…",57323,4,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1,449,133,0.516102,0.032936,67.228296,1.0,87,20,0.0,15,1,1.0,113,61,7,6.0,3.732905,20.518155,184,0,112696.345763,18,2.0,0.931677,5600.0,103395.833898,19,9.947935,448,1.0,12.729904,1,15308.0,184,14.419614,2,…,678,1,1,1,0,0,8790,118,0,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""88f8c53a28bf4f438941fd67338009…",54154,18146378,0
1,460,94,0.392373,-0.114798,58.257235,0.0,86,22,0.0,17,1,1.0,103,40,7,9.0,6.468404,17.780177,184,0,112696.345763,18,0.0,0.931677,2800.0,103395.833898,19,9.804716,448,1.0,12.729904,1,15318.0,184,14.419614,3,…,678,1,1,1,0,0,8790,118,0,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""88f8c53a28bf4f438941fd67338009…",54154,18146380,0
1,460,133,0.516102,0.032936,67.228296,1.0,87,22,0.0,17,1,1.0,113,61,7,9.0,3.732905,20.518155,184,0,112696.345763,18,2.0,0.931677,5600.0,103395.833898,19,9.947935,448,1.0,12.729904,1,15308.0,184,14.419614,2,…,678,1,1,1,0,0,8790,118,0,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""88f8c53a28bf4f438941fd67338009…",54154,18146381,0
1,447,120,0.5,-0.000953,64.139241,0.0,73,17,0.0,12,1,1.0,101,57,7,7.0,7.235987,19.890088,184,0,112696.345763,18,0.0,0.962733,2800.0,103395.833898,19,9.916848,448,1.0,12.729904,1,17468.0,184,14.419614,3,…,678,1,1,1,0,0,8790,118,0,0.0,0.0,0.0,0,0,1,0,233,0,0,678,118,1,0.0,0.0,678,678,8790,678,0,233,233,0,0,"""88f8c53a28bf4f438941fd67338009…",54154,18146409,0


In [8]:
# Display the training frame after the group-wise z-score transform.
train_sample_z

total_num_transfers_rank,legs0_segments0_flightNumber,price_per_duration_rank,price_percentile,price_from_median_zscore,price_per_duration,legs0_segments0_baggageAllowance_quantity,price_minus_fee_rank,legs0_arrivalAt_hour,pricingInfo_isAccessTP,legs0_departureAt_hour,both_legs_carrier_all_same,total_weighted_mean_cabin,price_per_fee_rank,totalPrice_rank,days_before_departure,legs0_segments0_seatsAvailable,price_per_fee,price_per_tax,legs1_main_carrier,isVip,leg0_view_diff_mean,legs1_departureAt_hour,baggage_total,duration_ratio,total_fees,all_view_diff_mean,legs1_arrivalAt_hour,log_price,legs1_segments0_flightNumber,legs1_weighted_mean_cabin,companyID_loo_mean_legs0_departureAt_hour,legs0_segments0_cabinClass,price_minus_fee,legs0_main_carrier,companyID_loo_mean_legs0_arrivalAt_hour,baggage_total_rank,…,legs1_segments2_departureFrom_airport_iata,legs1_segments2_duration_rank,legs1_segments2_key_view_count_rank,legs1_num_transfers_rank,legs1_segments2_duration,legs1_segments1_operatingCarrier_code_in_ff,legs0_segments2_flightNumber,legs0_segments2_aircraft_code,legs1_segments1_marketingCarrier_code_in_ff,legs0_segments2_seatsAvailable,legs1_segments2_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs1_segments2_cabinClass,legs1_segments2_operatingCarrier_code_in_ff,legs0_segments2_key_view_count_rank,is_cheapest,legs0_segments2_marketingCarrier_code,legs0_segments2_operatingCarrier_code_in_ff,legs0_segments2_duration,legs1_segments2_arrivalTo_airport_iata,legs1_segments2_aircraft_code,legs0_segments2_duration_rank,legs1_segments2_seatsAvailable,legs0_segments2_baggageAllowance_quantity,legs0_segments2_arrivalTo_airport_iata,legs0_segments2_arrivalTo_airport_city_iata,legs1_segments2_flightNumber,legs0_segments2_departureFrom_airport_iata,legs0_segments2_key_view_count,legs0_segments2_operatingCarrier_code,legs1_segments2_marketingCarrier_code,legs1_segments2_key_view_count,legs0_segments2_marketingCarrier_code_in_ff,ranker_id,compa
nyID,Id,selected
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64,i64,i64
-4.8,-4.8,-0.377715,-1.676233,-2.605417,-0.378079,0.0,1.959592,0.319291,0.0,4.8,4.8,0.0,0.0,-1.959592,0.0,4.8,-0.569717,1.880001,-4.8,0.0,-4.8,-3.403867,0.0,0.951477,-0.941357,-4.8,4.8,-3.628222,-4.798833,0.0,3.5527e-15,0.0,-2.201036,-4.8,5.3291e-15,0.0,…,0.0,0.0,0.0,-4.8,0.0,-4.8,0.0,0.0,-4.8,0.0,0.0,0.0,0.0,0.0,0.0,4.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""98ce0dabf6964640b63079fbafd42c…",57323,0,1
0.2,0.2,-0.214907,-1.187332,-0.794231,-0.356943,0.0,1.143095,-1.011089,0.0,-0.2,-0.2,0.0,1.264911,-1.143095,0.0,-0.2,-0.968581,-1.067377,0.2,0.0,0.2,1.056846,0.0,0.187177,1.019804,0.2,-0.2,-0.567703,0.226097,0.0,3.5527e-15,0.0,-0.854,0.2,5.3291e-15,0.0,…,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,-0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""98ce0dabf6964640b63079fbafd42c…",57323,1,0
0.2,0.2,-0.052099,-0.349215,-0.65829,-0.183243,0.0,0.326599,-1.011089,0.0,-0.2,-0.2,0.0,0.632456,-0.326599,0.0,-0.2,-0.96857,-0.918448,0.2,0.0,0.2,1.056846,0.0,0.187177,1.019804,0.2,-0.2,-0.432214,0.226097,0.0,3.5527e-15,0.0,-0.732279,0.2,5.3291e-15,0.0,…,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,-0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""98ce0dabf6964640b63079fbafd42c…",57323,2,0
0.2,0.2,1.413174,0.488901,0.832563,1.721714,0.0,-0.489898,-1.011089,0.0,-0.2,-0.2,0.0,-0.632456,0.489898,0.0,-0.2,0.966534,0.714843,0.2,0.0,0.2,1.056846,0.0,0.187177,-0.941357,0.2,-0.2,0.733377,0.226097,0.0,3.5527e-15,0.0,0.877335,0.2,5.3291e-15,0.0,…,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,-0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""98ce0dabf6964640b63079fbafd42c…",57323,3,0
0.2,0.2,1.575982,1.327018,1.054194,2.004906,0.0,-1.306395,-1.011089,0.0,-0.2,-0.2,0.0,-1.264911,1.306395,0.0,-0.2,1.06557,0.957649,0.2,0.0,0.2,1.056846,0.0,0.187177,-0.941357,0.2,-0.2,0.871243,0.226097,0.0,3.5527e-15,0.0,1.075783,0.2,5.3291e-15,0.0,…,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,-0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""98ce0dabf6964640b63079fbafd42c…",57323,4,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,-0.342884,1.807001,1.668461,1.71751,1.779671,1.782187,-1.155181,1.266484,-0.983192,1.258163,0.0,0.0,0.522779,1.844747,1.501851,0.220594,-0.327832,1.159311,-0.327731,0.0,0.327731,0.218258,1.782187,-0.495322,1.516885,0.327731,0.31508,1.679288,-0.334095,0.0,1.7764e-15,0.0,1.070255,-0.327731,1.0658e-14,-1.782187,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""88f8c53a28bf4f438941fd67338009…",54154,18146378,0
0.0,-0.316279,0.751489,0.868389,0.615994,0.695241,-0.542405,-1.23271,1.773078,-0.983192,1.768229,0.0,0.0,0.091917,0.631604,1.501851,1.388444,-0.327205,0.561147,-0.327731,0.0,0.327731,0.218258,-0.542405,-0.495322,-0.233367,0.327731,0.31508,0.692817,-0.334095,0.0,1.7764e-15,0.0,1.075819,-0.327731,1.0658e-14,0.542405,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""88f8c53a28bf4f438941fd67338009…",54154,18146380,0
0.0,-0.316279,1.807001,1.668461,1.71751,1.779671,1.782187,-1.155181,1.773078,-0.983192,1.768229,0.0,0.0,0.522779,1.844747,1.501851,1.388444,-0.327832,1.159311,-0.327731,0.0,0.327731,0.218258,1.782187,-0.495322,1.516885,0.327731,0.31508,1.679288,-0.334095,0.0,1.7764e-15,0.0,1.070255,-0.327731,1.0658e-14,-1.782187,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""88f8c53a28bf4f438941fd67338009…",54154,18146381,0
0.0,-0.347722,1.455164,1.564342,1.464832,1.406263,-0.542405,-2.240586,0.506594,-0.983192,0.493064,0.0,0.0,0.005745,1.613672,1.501851,0.609877,-0.327029,1.022097,-0.327731,0.0,0.327731,0.218258,-0.542405,0.114323,-0.233367,0.327731,0.31508,1.465165,-0.334095,0.0,1.7764e-15,0.0,2.272059,-0.327731,1.0658e-14,0.542405,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""88f8c53a28bf4f438941fd67338009…",54154,18146409,0


In [9]:
import os

# Output directory for the standardized tables.
save_dir = "data/recall/top30/data"

# Create the directory if it does not exist yet.
os.makedirs(save_dir, exist_ok=True)

# Destination parquet paths.
train_sample_path = os.path.join(save_dir, "train_all_data_z-score.parquet")
test_sample_path = os.path.join(save_dir, "test_all_data_z-score.parquet")

# Write train first, then test; report only after both writes succeed.
for frame, target in (
    (train_sample_z, train_sample_path),
    (test_sample_z, test_sample_path),
):
    frame.write_parquet(target)

print(f"✅ Train sample 已儲存到: {train_sample_path}")
print(f"✅ Test sample 已儲存到: {test_sample_path}")


✅ Train sample 已儲存到: data/recall/top30/data/train_all_data_z-score.parquet
✅ Test sample 已儲存到: data/recall/top30/data/test_all_data_z-score.parquet
