In [8]:
import pandas as pd 
import numpy as np 

df = pd.read_csv('encoded_v3.csv')
encoded = df.copy()

target = 'is_canceled'

In [9]:
# Lead Time Transformations 
encoded['log_lead_time'] = np.log1p(encoded['lead_time'])

# Guest Composition 
encoded['family_size'] = encoded['adults'] + encoded['children'] + encoded['babies']
encoded['is_family_trip'] = (encoded['family_size'] > 1).astype(int)

# Customer Intent and Stability 
encoded['customer_intent_score'] = encoded['total_of_special_requests'] - encoded['previous_cancellations']
encoded['booking_stability'] = encoded['total_of_special_requests'] / (1 + encoded['previous_cancellations'])

# Room Behavior 
encoded['room_mismatch'] = (encoded['assigned_room_type'] != encoded['reserved_room_type']).astype(int)

# Price and Value Features 
encoded['price_segment'] = pd.qcut(encoded['adr'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
encoded['adr_log'] = np.log1p(encoded['adr'])

# Segment Risk Encoding 
if 'market_segment' in encoded.columns:
    segment_cxl = encoded.groupby('market_segment')[target].mean()
    encoded['segment_cancel_risk'] = encoded['market_segment'].map(segment_cxl)

# Temporal Features 
encoded['arrival_month_num'] = encoded['arrival_date_month']
encoded['month_sin'] = np.sin(2 * np.pi * encoded['arrival_month_num'] / 12)
encoded['month_cos'] = np.cos(2 * np.pi * encoded['arrival_month_num'] / 12)

# Pace Feature: relates lead time to stay duration
encoded['stay_duration'] = encoded['stays_in_week_nights'] + encoded['stays_in_weekend_nights']
encoded['booking_pace'] = encoded['lead_time'] / (1 + encoded['stay_duration'])

# ADR Deviation by Segment
if 'market_segment' in encoded.columns:
    seg_mean_adr = encoded.groupby('market_segment')['adr'].mean()
    encoded['adr_dev_by_segment'] = encoded['adr'] - encoded['market_segment'].map(seg_mean_adr)

# Seasonal Cancellation Rate (Contextual Feature)
if 'arrival_date_month' in encoded.columns:
    month_cancel = encoded.groupby('arrival_date_month')['is_canceled'].mean()
    encoded['seasonal_cxl_rate'] = encoded['arrival_date_month'].map(month_cancel)

In [10]:
encoded['lead_time'] = pd.to_numeric(encoded['lead_time'], errors='coerce').fillna(0).clip(lower=0, upper=3650)

if 'lead_time_bucket' in encoded.columns:
    encoded.drop(columns=['lead_time_bucket'], inplace=True)

bins = [-0.1, 30, 90, 180, 365, np.inf]
labels = ['<1M', '1-3M', '3-6M', '6-12M', '>1Y']

encoded['lead_time_bucket'] = pd.cut(encoded['lead_time'].astype(float),
                                     bins=bins,
                                     labels=labels,
                                     include_lowest=True, right=True)

print("After re-binning:")
print(encoded['lead_time_bucket'].isna().sum())
print(encoded['lead_time_bucket'].value_counts(dropna=False))


After re-binning:
0
lead_time_bucket
<1M      37249
1-3M     29295
3-6M     26304
6-12M    21422
>1Y       3129
Name: count, dtype: int64


In [11]:
# Drop potential leakage features
encoded.drop(['reservation_status'], axis=1, errors='ignore', inplace=True)

In [12]:
# encoded.to_csv("engineered_features_v4.csv", index=False)

In [14]:
print(encoded.shape)
encoded.head()

(117399, 48)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,adr_log,segment_cancel_risk,arrival_month_num,month_sin,month_cos,stay_duration,booking_pace,adr_dev_by_segment,seasonal_cxl_rate,lead_time_bucket
0,1,0,7,0,5,27,1,0,1,1,...,4.330733,0.153952,5,0.5,-0.866025,1,3.5,-42.702005,0.378203,<1M
1,1,0,13,0,5,27,1,0,1,1,...,4.330733,0.189407,5,0.5,-0.866025,1,6.5,4.524343,0.378203,<1M
2,1,0,14,0,5,27,1,0,2,2,...,4.59512,0.369395,5,0.5,-0.866025,2,4.666667,-19.985256,0.378203,<1M
3,1,0,14,0,5,27,1,0,2,2,...,4.59512,0.369395,5,0.5,-0.866025,2,4.666667,-19.985256,0.378203,<1M
4,1,0,0,0,5,27,1,0,2,2,...,4.682131,0.153952,5,0.5,-0.866025,2,0.0,-10.702005,0.378203,<1M
