## Load hotel_bookings Dataset

In [28]:
import pandas as pd

raw_data = pd.read_csv('../data/hotel_bookings.csv')
print("Hotel bookings dataset loaded successfully")
print(f"Dataset shape: {raw_data.shape}")
display(raw_data.head())
print(raw_data.info())


Hotel bookings dataset loaded successfully
Dataset shape: (119390, 32)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

## Data Cleaning

In [29]:
# Check for missing values
print("=== Missing Values ===")
print(raw_data.isnull().sum())

hotel_df = raw_data.copy()
# Fill missing values in 'agent' and 'company' columns with 0
hotel_df['agent'] = hotel_df['agent'].fillna(0)
hotel_df['company'] = hotel_df['company'].fillna(0)
# Drop rows that contain missing values in 'children' and 'country'
hotel_df = hotel_df.dropna(subset=['children', 'country'])
print(f"Shape of data cleaning: {hotel_df.shape}")

=== Missing Values ===
hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                             

In [30]:
# Drop unnormally values in 'adr' column
original_count = len(hotel_df)
hotel_df = hotel_df[hotel_df['adr'] > 0]
print(f"Dropped {original_count - len(hotel_df)} rows with non-positive adr values.")

Dropped 1939 rows with non-positive adr values.


In [31]:
# Check duplicates
hotel_df.duplicated().sum()

hotel_df = hotel_df.drop_duplicates().reset_index(drop=True)
print(f"Total rows after dropping duplicates: {len(hotel_df)}")

Total rows after dropping duplicates: 85177


## Data Conversion

In [32]:
# Country name to ISO3 code conversion
import pycountry

def get_country_iso_code(country_name):
    # Some commeon discrepancies in country names
    mapping = {
        'USA': 'USA',
        'UK': 'GBR',
        'South Korea': 'KOR',
        'Vietnam': 'VNM'
    }
    if country_name in mapping:
        return mapping[country_name]
    try:
        return pycountry.countries.lookup(country_name).alpha_3
    except (LookupError, AttributeError):
        try:
            result = pycountry.countries.search_fuzzy(country_name)
            return result[0].alpha_3
        except:
            return None

In [33]:
# calculate lead time
from datetime import datetime, date

def calculate_lead_time(arrival_date):
    today = date.today()
    
    arrival_date = datetime.strptime(arrival_date, '%Y-%m-%d').date()
    
    delta = arrival_date - today
    lead_time = delta.days
    
    return max(0, lead_time)

In [34]:
# Month name to number conversion
import calendar

month_map = {name: i for i, name in enumerate(calendar.month_name) if name}

hotel_df['arrival_date_month_num'] = hotel_df['arrival_date_month'].map(month_map)

## Plot Price & Climate Trend Figure

In [35]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

country_monthly_stats = hotel_df.groupby(['country', 'arrival_date_month_num'])['adr'].agg(
    avg_adr = 'mean',
    max_adr = 'max',
    min_adr = 'min',
    count = 'size'
).reset_index()

def get_price_trend(country):
    country_data = hotel_df[hotel_df['country'] == country]
    
    price_trend = country_data.groupby('arrival_date_month_num')['adr'].agg(['mean', 'max', 'min']).reset_index()
    price_trend.columns = ['month', 'avg_adr', 'max_adr', 'min_adr']
    
    print(f"=== Average Daily Rate Trend for {country} ===")
    print(price_trend)
    
    return price_trend
    
def plot_price_trend(country):
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(12, 6))
    
    country_data = hotel_df[hotel_df['country'] == country]
    sns.lineplot(
        data=country_data,
        x='arrival_date_month_num',
        y='adr',
        marker='o',
        color='teal',
        linewidth=2.5
    )
    
    plt.xticks(range(1, 13), [calendar.month_name[i] for i in range(1, 13)], rotation=45)
    plt.title(f'Average Daily Rate (ADR) Trend in {country}', fontsize=16, pad=20)
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('Average Daily Rate (ADR)', fontsize=12)
    
    plt.tight_layout()
    plt.show()

def get_climate(climate_calendar):
    rows= []
    for climate_type, months_list in climate_calendar.items():
        for item in months_list:
            rows.append({
                'month': item['month'],
                'avg_temp': item['temp'],
                'climate': climate_type
            })
    
    climate_df = pd.DataFrame(rows)
    return climate_df.sort_values(by='month')

def plot_price_climate_trend(rec_output):
    country_iso = get_country_iso_code(rec_output['country'])
    country_price = get_price_trend(country_iso)
    climate_df = get_climate(rec_output['climate_calendar'])
    
    combined_df = pd.merge(country_price, climate_df, on='month', how='left')
    combined_df['month_name'] = combined_df['month'].apply(lambda x: calendar.month_name[x])
    combined_df = combined_df.sort_values(by='month')
    
    fig = go.Figure()
    
    fig.add_trace(go.Scatter(
        x=combined_df['month'],
        y=combined_df['avg_adr'],
        mode='lines',
        line=dict(color='lightgrey', width=2),
        name='Price Trend',
        showlegend=False,
        hoverinfo='skip' # Disable hover for this trace
    ))
    
    palette = {
        'Cold': '#a5c8ff',
        'Cool': '#95e1d3',
        'Pleasant': '#fce38a',
        'Hot': '#f38181'
    }
    
    for climate_type in combined_df['climate'].unique():
        df_sub = combined_df[combined_df['climate'] == climate_type]
        if not df_sub.empty:
            fig.add_trace(go.Scatter(
                x=df_sub['month'],
                y=df_sub['avg_adr'],
                mode='markers',
                name=climate_type,
                marker=dict(
                    size=14,
                    color=palette[climate_type],
                    line=dict(width=1, color='black')
                ),
                # Define detailed hover information
                customdata=df_sub[['month_name', 'avg_temp', 'max_adr', 'min_adr']],
                hovertemplate=(
                    "<b>Month: %{customdata[0]}</b><br>" +
                    "Avg Temp: %{customdata[1]}°C<br>" +
                    "Avg Price: €%{y:.2f}<br>" +
                    "Max Price: €%{customdata[2]:.2f}<br>" +
                    "Min Price: €%{customdata[3]:.2f}<br>" +
                    "<extra></extra>"
                )
            ))
    
    fig.update_layout(
        title=f"Travel Insights: Hotel Price & Climate in {rec_output['country']}",
        xaxis=dict(
            title='Month',
            tickmode='array',
            tickvals=list(range(1, 13)),
            ticktext=[calendar.month_abbr[i] for i in range(1, 13)]
        ),
        yaxis=dict(
            title='Average Daily Reate (€)'
        ),
        hovermode='closest',
        template='plotly_white',
        legend_title='Climate Type'
    )
    
    fig.show()

plot_price_climate_trend({'city': 'Detroit',
  'country': 'United States',
  'region': 'north_america',
  'short_description': 'Industrial charm meets vibrant arts and music scenes, where historic architecture and modern innovation create a dynamic and inviting urban experience.',
  'budget_level': 'Mid-range',
  'climate_calendar': {'Cold': [{'month': 1, 'temp': -3.1},
    {'month': 2, 'temp': -2.1},
    {'month': 3, 'temp': 2.6},
    {'month': 12, 'temp': 0.9}],
   'Cool': [{'month': 4, 'temp': 8.3},
    {'month': 5, 'temp': 15.4},
    {'month': 10, 'temp': 12.3},
    {'month': 11, 'temp': 5.4}],
   'Pleasant': [{'month': 6, 'temp': 20.8},
    {'month': 7, 'temp': 23.1},
    {'month': 8, 'temp': 22.3},
    {'month': 9, 'temp': 18.8}],
   'Hot': []}})

=== Average Daily Rate Trend for USA ===
    month     avg_adr  max_adr  min_adr
0       1   72.099342   168.33     4.00
1       2   81.910290   223.00    36.96
2       3   98.416850   285.00    33.30
3       4  118.205215   253.33    37.10
4       5  127.778553   290.00     6.00
5       6  137.245188   311.00    12.00
6       7  146.640000   287.00    61.68
7       8  166.210076   328.33    64.80
8       9  123.296726   232.59     2.00
9      10  110.395000   278.33     6.00
10     11  100.313438   220.00     6.40
11     12   98.482208   215.25    30.00


## Predict Cancellation Risk

### Build a model

In [44]:
# Build a modle for predicting cancelation probability
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define features for prediction
features = [
    'lead_time', 'arrival_date_month_num', 'arrival_date_week_number', 'stays_in_weekend_nights', 'stays_in_week_nights', 
    'adults', 'children', 'babies', 
    'country', 'market_segment', 'deposit_type', 'customer_type',
    'required_car_parking_spaces', 'total_of_special_requests'
]

X = hotel_df[features].copy()
y = hotel_df['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the pipeline
categorical_features = ['market_segment', 'deposit_type', 'customer_type']
ordinal_features = ['country']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features)
    ],
    remainder='passthrough' # Remain other columns
)

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clasifier', RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42))
])

# Find the best parameters
param_distributions = {
    'clasifier__n_estimators': [100, 200, 300],
    'clasifier__max_depth': [None, 10, 20, 30],
    'clasifier__min_samples_split': [2, 5, 10],
    'clasifier__min_samples_leaf': [1, 2, 4],
    'clasifier__bootstrap': [True, False]
}

random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_distributions,
    n_iter=10, # 10 random groups
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best Score: {random_search.best_score_:.4f}")

best_model = random_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print(classification_report(y_test, y_pred_best))

y_probs = best_model.predict_proba(X_test)
cancel_risks = y_probs[:, 1]
print(cancel_risks[:5] * 100)

# ohe_feature_names = clf.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out()
# all_feature_names = list(ohe_feature_names) + ['country'] + ['lead_time', 'arrival_date_month_num', 'adults', 'children', 'babies']

# importances = clf.named_steps['clasifier'].feature_importances_
# feat_imp = pd.Series(importances, index=all_feature_names).sort_values(ascending=False)

# print("=== First 5 important Features ===")
# print(feat_imp.head())

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters: {'clasifier__n_estimators': 200, 'clasifier__min_samples_split': 10, 'clasifier__min_samples_leaf': 2, 'clasifier__max_depth': 30, 'clasifier__bootstrap': False}
Best Score: 0.8114
              precision    recall  f1-score   support

           0       0.85      0.90      0.87     12297
           1       0.70      0.57      0.63      4739

    accuracy                           0.81     17036
   macro avg       0.77      0.74      0.75     17036
weighted avg       0.81      0.81      0.81     17036

[74.98771166 27.26121235  0.60555162 31.28787308 39.715593  ]


## Build price predict model

In [51]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, r2_score

y_price = y = hotel_df['adr']

X_price_train, X_price_test, y_price_train, y_price_test = train_test_split(X, y_price, test_size=0.2, random_state=42)


price_lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(random_state=42))
])

# Find the best parameters
price_lgbm_param_distributions = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__num_leaves': [20, 31, 50],
    'regressor__max_depth': [5, 10, 15]
}

price_lgbm_search = RandomizedSearchCV(
    price_lgbm_pipeline,
    param_distributions=price_lgbm_param_distributions,
    n_iter=10,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

price_lgbm_search.fit(X_price_train, y_price_train)
print(f"Best parameters: {price_lgbm_search.best_params_}")
print(f"Best Score: {price_lgbm_search.best_score_:.4f}")

best_price_lgbm_model = price_lgbm_search.best_estimator_

y_price_lgbm_pred = best_price_lgbm_model.predict(X_price_test)
print(f"MAE: {mean_absolute_error(y_price_test, y_price_lgbm_pred):.2f}")
print(f"R2 Score: {r2_score(y_price_test, y_price_lgbm_pred):.4f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhea



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.948696
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.948696




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.769080




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.948696




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.769080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.948696
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.769080












[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.948696




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.769080
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012920 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.948696




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.769080




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.948696








[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 45427, number of used features: 25
[LightGBM] [Info] Start training from score 108.769080




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 533
[LightGBM] [Info] Number of data points in the train set: 45428, number of used features: 25
[LightGBM] [Info] Start training from score 108.893207
















[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 543
[LightGBM] [Info] Number of data points in the train set: 68141, number of used features: 25
[LightGBM] [Info] Start training from score 108.870328
Best parameters: {'regressor__num_leaves': 31, 'regressor__n_estimators': 500, 'regressor__max_depth': 10, 'regressor__learning_rate': 0.1}
Best Score: -22.4062
MAE: 21.92
R2 Score: 0.6291



X does not have valid feature names, but LGBMRegressor was fitted with feature names



In [50]:
import joblib
import os

os.makedirs('../../ml_logic/models/cancellation_predict/', exist_ok=True)

joblib.dump(best_model, '../../ml_logic/models/cancellation_predict/cancel_pipeline.pkl')

joblib.dump(best_price_lgbm_model, '../../ml_logic/models/cancellation_predict/price_pipeline.pkl')

print("Cancellation risk components are saved successfully!")

Cancellation risk components are saved successfully!


In [None]:
# mapping customer_type
def determine_customer_type(babies, children, adults):
    total_people = babies + children + adults
    
    if total_people > 5:
        return 'Transient-Party'
    
    return 'Transient'

In [None]:
def draw_risk_donut(input_data):
    risk_percent = best_model.predict_proba(input_data)[0][1] * 100
    
    if risk_percent >= 70:
        risk_color = '#dc3545'
    elif risk_percent >= 30:
        risk_color = '#ffc107'
    else:
        risk_color = '#28a745'
        
    values = [risk_percent, 100 - risk_percent]
    colors = [risk_color, '#e9ecef']
    
    fig = go.Figure(data=[go.Pie(
        values=values,
        labels=['Risk', 'Remaining'],
        hole=0.8,
        marker_colors=colors,
        sort=False,
        direction='clockwise',
        rotation=0,
        showlegend=False,
        textinfo='none',
        hoverinfo='none'
    )])
    
    fig.add_annotation(
        text=f'<b>{risk_percent:.1f}%</b>',
        x=0.5,
        y=0.5,
        font_size=32,
        font_color=risk_color,
        showarrow=False
    )
    
    fig.update_layout(
        title={
            'text': "Booking Cancellation Risk",
            'x': 0.5,
            'y': 0.9,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        margin=dict(t=80, b=10, l=10, r=10),
        height=300,
        width=300,
        paper_bgcolor='rgba(0, 0, 0, 0)',
        plot_bgcolor='rgba(0, 0, 0, 0)'
    )
    
    fig.show()

In [None]:
lead_time_config = {
    'Last Minute': 14,
    'Short-term': 30,
    'Medium-term': 90,
    'Half-year': 180,
    'Long-term': 365,
    'Very Long-term': 730   # Maximum of lead_time is 709
}

bins = [0] + list(lead_time_config.values())
labels = list(lead_time_config.keys())
hotel_df['lead_time_bucket'] = pd.cut(hotel_df['lead_time'],
                                      bins=bins,
                                      labels=labels,
                                      include_lowest=True)

price_lookup = hotel_df.groupby(['country', 'arrival_date_month_num', 'lead_time_bucket'], observed=True)['adr'].mean().reset_index()

def get_lt_bucket_name(lt):
    return pd.cut([lt], bins=bins, labels=labels, include_lowest=True)[0]

def get_complete_risk_price_report(input_data):
    test_lead_times = list(lead_time_config.values())
    test_months = range(1, 13)
    
    results = []
    for m in test_months:
        for lt in test_lead_times:
            temp_df = input_data.copy()
            temp_df['arrival_date_month_num'] = m
            temp_df['lead_time'] = lt
            # Predict risk
            prob = best_model.predict_proba(temp_df)[0][1]
            # Look up price
            bucket = get_lt_bucket_name(lt)
            price_row = price_lookup[(price_lookup['country'] == temp_df['country']) &
                                     (price_lookup['arrival_date_month_num'] == m) &
                                     (price_lookup['lead_time_bucket'] == bucket)]
            avg_price = price_row['adr'].values[0] if not price_row.empty else 100
            
            results.append({
                'month': m,
                'lt': lt,
                'risk': prob,
                'price': avg_price
            })
    
    res_df = pd.DataFrame(results)
    
    return res_df

In [None]:
def get_advice_by_weight(candidates, weight_risk=1.5, weight_price=1.2):
    if candidates.empty:
        return None
    
    candidates['total_score'] = (candidates['risk_norm'] * weight_risk) + (candidates['price_norm'] * weight_price)
    
    advice = candidates.sort_values('total_score').iloc[0]
    
    return advice

def get_stategic_advice(input_data, isThisYear=True):
    res_df = get_complete_risk_price_report(input_data)
    user_lt = input_data['lead_time'].iloc[0]
    user_month = input_data['arrival_date_month_num'].iloc[0]
    yearly_avg_price = res_df['price'].mean()
    
    # normalization
    risk_min, risk_max = res_df['risk'].min(), res_df['risk'].max()
    res_df['risk_norm'] = (res_df['risk'] - risk_min) / (risk_max - risk_min)
    
    price_min, price_max = res_df['price'].min(), res_df['price'].max()
    res_df['price_norm'] = (res_df['price'] - price_min) / (price_max - price_min)
        
    # 1. Best CP value
    # risk < 30% & price < yearly_avg_price
    cp_candidates = res_df[(res_df['risk'] < 0.3) & (res_df['price'] < yearly_avg_price)].copy()
    cp_advice = get_advice_by_weight(cp_candidates)
    
    # 2. Same or longer lead time
    target_months = [(int(user_month) + i - 1) % 12 + 1 for i in range(0, 4)]
    lt_priority = res_df[(res_df['lt'] >= user_lt) & (res_df['month'].isin(target_months))].copy()
    lt_advice = get_advice_by_weight(lt_priority)
    
    # 3. Same month
    if isThisYear:
        month_priority = res_df[(res_df['month'] == user_month) & (res_df['lt'] <= user_lt)].copy()
    else:
        month_priority = res_df[res_df['month'] == user_month].copy()
    month_advice = get_advice_by_weight(month_priority)
    
    # Plot
    plot_bubble_recommendation(res_df, cp_advice, lt_advice, month_advice)
    
    return cp_advice, lt_advice, month_advice

In [None]:
def plot_bubble_recommendation(res_df, cp_advice, lt_advice, month_advice):
    fig = go.Figure()
    
    # Draw all the points
    fig.add_trace(go.Scatter(
        x=res_df['price'],
        y=res_df['risk']*100,
        mode='markers',
        marker=dict(color='lightgrey', size=8, opacity=0.5),
        name='Others',
        hoverinfo='text',
        text=[f"Month: {m}<br>Lead time: {lt}<br>Price: ${p:.1f}"
              for m, lt, p in zip(res_df['month'], res_df['lt'], res_df['price'])]
    ))
    
    advices = []
    if cp_advice is not None: advices.append({'data': cp_advice,
                                              'label': '🌟 Best CP Value',
                                              'color': 'gold',
                                              'symbol': 'star'})
    if lt_advice is not None: advices.append({'data': lt_advice,
                                              'label': '🛡️ Planning Priority',
                                              'color': 'royalblue',
                                              'symbol': 'diamond'})
    if month_advice is not None: advices.append({'data': month_advice,
                                              'label': '📅 Month Priority',
                                              'color': 'forestgreen',
                                              'symbol': 'square'})
    advices.sort(key=lambda x: x['data']['risk'], reverse=True)
    
    offsets = [
        {'ay': -50, 'ax': 0},
        {'ay': 0, 'ax': 90},
        {'ay': 50, 'ax': 0},
    ]
    
    for i, adv_item in enumerate(advices):
        advice = adv_item['data']
        label = adv_item['label']
        color = adv_item['color']
        symbol = adv_item['symbol']
        month_name = calendar.month_abbr[int(advice['month'])]
        
        offset = offsets[i] if i < len(offsets) else {'ay': 40, 'ax': 40}

        fig.add_trace(go.Scatter(
            x=[advice['price']],
            y=[advice['risk']*100],
            mode='markers',
            marker=dict(
                color=color,
                size=15,
                symbol=symbol,
                line=dict(width=2, color='white')
            ),
            name=label
        ))
        
        fig.add_annotation(
            x=advice['price'],
            y=advice['risk']*100,
            text=f"<span style='color:{color}'><b>{label}</b></span><br>{month_name} / {int(advice['lt'])} days prep",
            ax=offset['ax'],
            ay=offset['ay'],
            bgcolor="rgba(255, 255, 255, 0.9)",
            bordercolor=color,
            borderwidth=1,
            font=dict(size=11)
        )
    
    fig.update_layout(
        title="<b>AI Recommendation: Risk vs. Price Strategy</b>",
        xaxis_title="Estimated Price (ADR)",
        yaxis_title="Cancellation Risk (%)",
        template="plotly_white",
        hovermode="closest"
    )
    
    fig.add_shape(
        type='rect',
        x0=res_df['price'].min(),
        y0=0,
        x1=res_df['price'].mean(),
        y1=30,
        line=dict(color='Green', width=1, dash='dot'),
        fillcolor='LightGreen',
        opacity=0.3
    )
    
    fig.add_vline(
        x=res_df['price'].mean(),
        line_dash="dash",
        line_color="grey",
        annotation_text="Average Price"
    )
    
    fig.show()

In [None]:
def get_recommendation(user_input):
    current_input = pd.DataFrame([user_input]) if isinstance(user_input, dict) else user_input.copy()
    current_input['lead_time'] = calculate_lead_time(user_input['arrival_date'])
    current_input['country'] = get_country_iso_code(user_input['country_name'])
    current_input['deposit_type'] = 'No Deposit' # Default
    current_input['customer_type'] = determine_customer_type(
        user_input['babies'],
        user_input['children'],
        user_input['adults']
    )
    
    current_input = current_input[features]
    draw_risk_donut(current_input)
    
    get_stategic_advice(current_input)
    
    print(current_input)



get_recommendation({
    'arrival_date': '2026-05-16',
    'arrival_date_month_num': 5,
    'adults': 2,
    'children': 0,
    'babies': 0, 
    'country_name': 'United States'
})

In [None]:
import joblib
import os

os.makedirs('../../ml_logic/models/cancellation_predict/', exist_ok=True)

joblib.dump(best_model, '../../ml_logic/models/cancellation_predict/cancel_pipeline.pkl')

price_lookup.to_csv('../../ml_logic/models/cancellation_predict/price_lookup_reference.csv', index=False)

print("Cancellation risk components are saved successfully!")

In [None]:
country_monthly_stats = hotel_df.groupby(['country', 'arrival_date_month_num'])['adr'].agg(
    avg_adr = 'mean',
    max_adr = 'max',
    min_adr = 'min',
    count = 'size'
).reset_index()

country_monthly_stats.to_csv('../../ml_logic/models/cancellation_predict/country_monthly_stats.csv', index=False)

In [None]:
def get_user_friendly_risk(input_data):
    risk_percent = best_model.predict_proba(input_data)[0][1] * 100
    
    if risk_percent >= 70:
        level = 'High Risk'
        color = 'red'
        advice = f'⚠️ This schedule has a high chance ({risk_percent}%) to be changed, due to personal reasons or 旺季時 hotel isseus, we strongly suggest choose "Free for cancelation / adjustion" 方案'
    elif 40 <= risk_percent < 70:
        level = 'Moderate Risk'
        color = 'orange'
        advice = f'💡 This schedule has a {risk_percent}% probability to be changed.'
    else:
        level= = 'Low Risk'
        color = 'green'
        advice = '✅ This schedule is stable, you can book your accommodation now!'

In [None]:
def get_recommendation(user_input):
    # 預設場景：假設 User 訂的是一般的 No Deposit 房間
    current_input = user_input.copy()
    current_input['deposit_type'] = 'No Deposit'
    
    # 進行模型預測
    risk_score = rf_model.predict_proba(current_input)[0][1]
    
    # 根據風險值給建議
    if risk_score > 0.5:
        message = "💡 此時段行程取消機率較高，建議訂購「可免費取消」之方案。"
    else:
        message = "✅ 此時段行程相對穩定。"
        
    return risk_score, message