In [1]:
import warnings

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

from scipy import stats
from scipy.stats.contingency import association

from sklearn.preprocessing import (
    LabelEncoder, 
    PowerTransformer, 
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.feature_selection import SelectFromModel, RFE, SelectKBest, mutual_info_classif

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    fbeta_score,
    roc_auc_score, 
    average_precision_score,
    confusion_matrix, 
    classification_report, 
    ConfusionMatrixDisplay, 
    RocCurveDisplay, 
    precision_recall_curve,
)

from yellowbrick.model_selection import (
    ValidationCurve,
    LearningCurve
)

from sklearn.svm import SVC

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from typing import Any, Dict, Union, Sequence

from loguru import logger
from omegaconf import OmegaConf

pd.set_option('display.max_columns', None)
pd.options.plotting.backend = 'matplotlib'
warnings.filterwarnings('ignore')

In [2]:
def pipe_logger(df, addon_msg=''):
    if addon_msg != '':
        logger.info(addon_msg)
    logger.info(f"Shape of DataFrame: {df.shape[0]} x {df.shape[1]}")
    return df

In [3]:
hotel = pd.read_csv('../../data/raw/hotel_reservations.csv')
hotel

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.00,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.00,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.00,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.50,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,INN36271,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.80,1,Not_Canceled
36271,INN36272,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,Canceled
36272,INN36273,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,Not_Canceled
36273,INN36274,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.50,0,Canceled


In [4]:
(hotel
 .info()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

In [5]:
(hotel
 .describe()
)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
count,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0
mean,1.844962,0.105279,0.810724,2.2043,0.030986,85.232557,2017.820427,7.423653,15.596995,0.025637,0.023349,0.153411,103.423539,0.619655
std,0.518715,0.402648,0.870644,1.410905,0.173281,85.930817,0.383836,3.069894,8.740447,0.158053,0.368331,1.754171,35.089424,0.786236
min,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,2018.0,5.0,8.0,0.0,0.0,0.0,80.3,0.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,2018.0,8.0,16.0,0.0,0.0,0.0,99.45,0.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,2018.0,10.0,23.0,0.0,0.0,0.0,120.0,1.0
max,4.0,10.0,7.0,17.0,1.0,443.0,2018.0,12.0,31.0,1.0,13.0,58.0,540.0,5.0


In [6]:
(hotel
 ['booking_status']
 .value_counts(normalize=True, dropna=False))

booking_status
Not_Canceled    0.672364
Canceled        0.327636
Name: proportion, dtype: float64

In [7]:
def tweak_hotel(df: pd.DataFrame) -> pd.DataFrame:
    
    leap_year_error = (df
                       .loc[(df['arrival_year'] == 2018) & (df['arrival_month'] == 2) & (df['arrival_date'] == 29)]
                       .index)
    return (df
            .pipe(pipe_logger)
            .astype({**{k: 'int8' 
                        for k in ['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights', 'required_car_parking_space', 'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled', 'no_of_special_requests']},
                     **{k: 'int16'
                        for k in ['lead_time']},
                     **{k: 'category'
                        for k in ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type',]},
                     **{k: 'str'
                        for k in ['arrival_date', 'arrival_month', 'arrival_year']},
                     'avg_price_per_room': 'float16'})
            .drop(columns=['Booking_ID'])
            .drop(leap_year_error)
            .pipe(pipe_logger, "After dropping leap year error...")
            )

In [8]:
tweaked_hotel = tweak_hotel(hotel)
tweaked_hotel

[32m2024-03-27 21:50:56.064[0m | [1mINFO    [0m | [36m__main__[0m:[36mpipe_logger[0m:[36m4[0m - [1mShape of DataFrame: 36275 x 19[0m
[32m2024-03-27 21:50:56.091[0m | [1mINFO    [0m | [36m__main__[0m:[36mpipe_logger[0m:[36m3[0m - [1mAfter dropping leap year error...[0m
[32m2024-03-27 21:50:56.093[0m | [1mINFO    [0m | [36m__main__[0m:[36mpipe_logger[0m:[36m4[0m - [1mShape of DataFrame: 36238 x 18[0m


Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0000,0,Not_Canceled
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.6875,1,Not_Canceled
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0000,0,Canceled
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0000,0,Canceled
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5000,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.7500,1,Not_Canceled
36271,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.9375,2,Canceled
36272,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.3750,2,Not_Canceled
36273,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.5000,0,Canceled


In [9]:
X = tweaked_hotel.drop(columns=['booking_status'])
y = tweaked_hotel['booking_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
label_encoder = LabelEncoder()
y_train = (pd
           .DataFrame(label_encoder.fit_transform(y_train), 
                      columns=['booking_status'])
           .astype({'booking_status': 'int8'})
           )

y_test = (pd
          .DataFrame(label_encoder.transform(y_test), 
                     columns=['booking_status'])
          .astype({'booking_status': 'int8'})
          )

In [11]:
class TweakFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    
        return (X
                .assign(lead_time_category=lambda df_: np.where(df_['lead_time'] < 90, 'Within 3 months', np.where(df_['lead_time'] < 180, 'Within 6 months', 'Beyond 6 months')),
                        is_weekend_arrival=lambda df_: np.where(pd.to_datetime(df_['arrival_year'] + '-' + df_['arrival_month'] + '-' + df_['arrival_date']).dt.weekday > 5, 1, 0),
                        total_people=lambda df_: df_['no_of_adults'] + df_['no_of_children'],
                        is_alone=lambda df_: np.where(df_['total_people'] == 1, 1, 0))
                .astype({'lead_time_category': 'category',
                         'is_weekend_arrival': 'int8',
                         'total_people': 'int8',
                         'is_alone': 'int8',
                         'arrival_year': 'int16',
                         'arrival_month': 'int8', 
                         'arrival_date': 'int8'})
                .drop(columns=['arrival_year'])
                )

In [12]:
TweakFeatures().fit_transform(X_train)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,lead_time_category,is_weekend_arrival,total_people,is_alone
10409,2,0,0,2,Not Selected,0,Room_Type 1,68,3,11,Online,0,0,0,67.5000,1,Within 3 months,1,2,0
6307,2,1,0,3,Meal Plan 1,0,Room_Type 1,149,7,20,Online,0,0,0,127.5000,0,Within 6 months,0,3,0
19004,2,0,0,3,Meal Plan 1,0,Room_Type 1,72,3,31,Online,0,0,0,67.2500,0,Within 3 months,0,2,0
9609,2,0,0,3,Meal Plan 1,0,Room_Type 1,259,5,5,Offline,0,0,0,90.0000,0,Beyond 6 months,0,2,0
32308,2,0,2,0,Meal Plan 1,1,Room_Type 1,47,12,6,Online,0,0,0,115.1875,1,Within 3 months,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18283,1,0,0,1,Not Selected,0,Room_Type 1,0,8,20,Online,0,0,0,92.1875,0,Within 3 months,0,1,1
7308,2,0,0,1,Not Selected,0,Room_Type 1,143,8,20,Online,0,0,0,94.5000,0,Within 6 months,0,2,0
22902,1,0,3,7,Meal Plan 1,0,Room_Type 1,29,4,15,Online,0,0,0,99.5000,1,Within 3 months,1,1,1
20224,2,1,1,3,Meal Plan 1,0,Room_Type 1,83,8,1,Online,0,0,0,135.8750,1,Within 3 months,0,3,0


In [13]:
ohe_categorical_features = ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'lead_time_category']
ohe_categorical_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first').set_output(transform='pandas'))
])

standard_numerical_features = ['lead_time', 'avg_price_per_room']
standard_numerical_transformer = Pipeline(steps=[
    ('scale', StandardScaler().set_output(transform='pandas'))
])

poly = PolynomialFeatures(interaction_only=False).set_output(transform='pandas')

# sel_ = SelectFromModel(LogisticRegression(solver= 'liblinear', C=1, penalty='l1', random_state=42), max_features=12).set_output(transform='pandas')
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=100, max_depth=5, max_features='sqrt', n_jobs=-1, random_state=42), max_features=10).set_output(transform='pandas')
# sel_ = RFE(LogisticRegression(solver= 'liblinear', C=1, penalty='l1', random_state=42), n_features_to_select=10).set_output(transform='pandas')
# sel_ = SelectKBest(mutual_info_classif, k=10).set_output(transform='pandas')

col_trans = ColumnTransformer(
    transformers=[
        ('ohe_categorical_features', ohe_categorical_transformer, ohe_categorical_features),
        ('standard_numerical_features', standard_numerical_transformer, standard_numerical_features)
    ],
    remainder='passthrough', 
    verbose=0, 
    verbose_feature_names_out=False).set_output(transform='pandas')

In [14]:
pipeline_oob_svc = Pipeline(steps = [
       ('tweak_features', TweakFeatures()),
       ('col_trans', col_trans),
    #    ('poly', poly),
    #    ('select', sel_),
])

In [15]:
X_train_transformed = pipeline_oob_svc.fit_transform(X_train, y_train.values.ravel())
X_train_transformed

Unnamed: 0,type_of_meal_plan_Meal Plan 2,type_of_meal_plan_Meal Plan 3,type_of_meal_plan_Not Selected,room_type_reserved_Room_Type 2,room_type_reserved_Room_Type 3,room_type_reserved_Room_Type 4,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,lead_time_category_Within 3 months,lead_time_category_Within 6 months,lead_time,avg_price_per_room,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,is_weekend_arrival,total_people,is_alone
10409,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.201827,-1.018437,2,0,0,2,0,3,11,0,0,0,1,1,2,0
6307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.738327,0.681051,2,1,0,3,0,7,20,0,0,0,0,0,3,0
19004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.155400,-1.025518,2,0,0,3,0,3,31,0,0,0,0,0,2,0
9609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.015079,-0.381129,2,0,0,3,0,5,5,0,0,0,0,0,2,0
32308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.445571,0.332302,2,0,2,0,1,12,6,0,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18283,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.991092,-0.319168,1,0,0,1,0,8,20,0,0,0,0,0,1,1
7308,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.668686,-0.253667,2,0,0,1,0,8,20,0,0,0,0,0,2,0
22902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.654494,-0.112043,1,0,3,7,0,4,15,0,0,0,1,1,1,1
20224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.027725,0.918271,2,1,1,3,0,8,1,0,0,0,1,0,3,0


In [16]:
svc_oob = SVC(probability=True, 
              random_state=42,)

svc_oob.fit(X_train_transformed, y_train)
y_pred = svc_oob.predict(pipeline_oob_svc.transform(X_test))

In [18]:
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'ROC-AUC score: {roc_auc_score(y_test, svc_oob.predict_proba(pipeline_oob_svc.transform(X_test))[:, 1])}')
print(f'Average Precision Score: {average_precision_score(y_test, svc_oob.predict_proba(pipeline_oob_svc.transform(X_test))[:, 1])}')

Accuracy score: 0.8029801324503312
Precision score: 0.8197177868548088
Recall score: 0.9061986863711001
F1 score: 0.8607915773055177
ROC-AUC score: 0.8687123527281573
Average Precision Score: 0.9288819576437058


In [19]:
print(classification_report(label_encoder.inverse_transform(y_test), label_encoder.inverse_transform(y_pred)))

              precision    recall  f1-score   support

    Canceled       0.75      0.59      0.66      2376
Not_Canceled       0.82      0.91      0.86      4872

    accuracy                           0.80      7248
   macro avg       0.79      0.75      0.76      7248
weighted avg       0.80      0.80      0.80      7248



In [21]:
# Define the space over which to search
space = {
    'C': hp.loguniform('C', -4, 3),  # loguniform distribution for C on a log scale from exp(-4) to exp(4)
    'kernel': hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    # Conditional parameters for 'poly' kernel
    'degree': hp.choice('degree', [2, 3, 4]),
    'gamma': hp.choice('gamma', ['scale']),
    'coef0': hp.uniform('coef0', 0.0, 1.0)  # coef0 only for 'poly' and 'sigmoid'
}

In [24]:
# Define the objective function
def objective(params):
    # Conditional parameter logic: 'degree' is only relevant if kernel is 'poly'
    if params['kernel'] == 'poly':
        clf = SVC(C=params['C'], kernel=params['kernel'],
                  degree=params['degree'], gamma=params['gamma'], coef0=params['coef0'])
    else:
        # Set 'degree' and 'coef0' to their default values for non-poly kernels
        clf = SVC(C=params['C'], kernel=params['kernel'],
                  gamma=params['gamma'], coef0=0.0)

    # Perform cross validation
    score = cross_val_score(clf, 
                            X_train_transformed, 
                            y_train.values.ravel(), 
                            cv=5, 
                            scoring='accuracy',
                            n_jobs=-1).mean()
    
    # Hyperopt minimizes the objective function, so return the negative accuracy
    return {'loss': -score, 'status': STATUS_OK}

In [25]:
# Run the hyperparameter search using the Tree of Parzen Estimators (TPE) algorithm
trials = Trials()
SEED = 101

best = fmin(fn=objective, 
            space=space, 
            algo=tpe.suggest, 
            max_evals=100, 
            trials=trials,
            rstate=np.random.default_rng(SEED))

100%|██████████| 100/100 [38:51<00:00, 23.32s/trial, best loss: -0.834115212142118]  


In [26]:
# hp.choice returns an index, so retrieve the actual values from the space definition
best['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid'][best['kernel']]
best['gamma'] = ['scale'][best['gamma']]
if best['kernel'] == 'poly':
    best['degree'] = [2, 3, 4][best['degree']]

print(f"Best hyperparameters: {best}")

Best hyperparameters: {'C': 19.30231150166292, 'coef0': 0.9011064460064164, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}


In [27]:
best

{'C': 19.30231150166292,
 'coef0': 0.9011064460064164,
 'degree': 4,
 'gamma': 'scale',
 'kernel': 'poly'}

In [None]:
svc_tuned = SVC(**best, 
                probability=True, 
                random_state=42)

svc_tuned.fit(X_train_transformed, y_train)
y_pred_top = svc_tuned.predict(pipeline_oob_svc.transform(X_test))

In [None]:
print(classification_report(label_encoder.inverse_transform(y_test), label_encoder.inverse_transform(y_pred_top)))