# **HOTEL BOOKING CANCELLATION PREDICTION**

![hotel booking](../images/image-hotel.png)

## **Read and Pre-Clean the Data**

### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load data

Helper functions for loading and pre-cleaning the data.

In [3]:
# Modular data loading and initial cleaning
def inicial_clean(df):
    """Return a cleaned copy of the raw bookings frame.

    The input frame is left untouched, so the function can safely be called
    more than once on the same object (e.g. when a cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings data. Must contain the personal-information and
        reservation-status columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns, whose ``object`` columns
        have been cast to ``str``.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    """
    # Personal information about customers.
    personal_columns = ['name', 'email', 'phone-number', 'credit_card']
    # Data leakage: these columns describe the final reservation status.
    leakage_columns = ['reservation_status', 'reservation_status_date']

    # `drop` without `inplace` returns a new frame instead of mutating `df`.
    cleaned = df.drop(columns=personal_columns + leakage_columns)

    # Convert objects to strings.
    # NOTE(review): `astype(str)` also turns missing values into literal
    # strings such as 'nan' -- confirm that this is intended downstream.
    obj_columns = cleaned.select_dtypes('object').columns
    cleaned[obj_columns] = cleaned[obj_columns].astype(str)
    return cleaned

def load_data(file_path):
    """Read the bookings CSV at ``file_path`` and return it after the initial clean.

    The file is parsed with ``pd.read_csv`` and passed straight through
    ``inicial_clean`` (drops unnecessary columns and corrects the data types).
    """
    return inicial_clean(pd.read_csv(file_path))

In [4]:
# Relative path to the bookings CSV.
file_path = "../data/dataset.csv"
# Read the CSV and apply the initial clean in one step.
df_hotel_booking = load_data(file_path)
# Preview the first rows of the cleaned frame.
df_hotel_booking.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,,,0,Transient,0.0,0,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,,,0,Transient,0.0,0,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,,,0,Transient,75.0,0,0
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,304.0,,0,Transient,75.0,0,0
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,240.0,,0,Transient,98.0,0,1


In [5]:
# Summarize column dtypes and non-null counts of the cleaned frame.
df_hotel_booking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 30 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

## **Split the Data**

In [6]:
from sklearn.model_selection import train_test_split

def split_data(df, target: str, train_size=0.8, val_test_proportion=0.5,
               random_state=42, stratify=False):
    """Split ``df`` into train, test and validation sets.

    The data is split twice: first into train / rest, then the rest into
    test / validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the target column.
    train_size : float or int, default 0.8
        Passed as ``train_size`` to the first ``train_test_split`` call.
    val_test_proportion : float or int, default 0.5
        Passed as ``train_size`` to the second split, i.e. the share of the
        remaining rows that goes to the *test* set; what is left becomes the
        validation set.
    random_state : int or None, default 42
        Seed used for both splits so that repeated runs produce the same
        partitions. Pass ``None`` for a different shuffle on every call.
    stratify : bool, default False
        When True, both splits are stratified on the target.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``.
    """
    X = df.drop(columns=[target])  # independent variables
    y = df[target]  # dependent variable (target)

    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y,
        train_size=train_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest,
        train_size=val_test_proportion,
        random_state=random_state,
        stratify=y_rest if stratify else None,
    )

    return X_train, y_train, X_test, y_test, X_val, y_val

In [7]:
# Split the cleaned data into train / test / validation sets, with 'is_canceled' as the target.
X_train, y_train, X_test, y_test, X_val, y_val = split_data(df_hotel_booking, 'is_canceled', train_size=0.8, val_test_proportion=0.5)
# Report the shape and the row count of each feature split.
print("X_train, X_test, X_val")
print(X_train.shape, X_test.shape, X_val.shape)
print(len(X_train), len(X_test), len(X_val))

X_train, X_test, X_val
(95512, 29) (11939, 29) (11939, 29)
95512 11939 11939


## **Pipeline for Pre Processing**

### Building Pipeline

In [8]:
from sklearn.preprocessing import OneHotEncoder, Binarizer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

def build_pipeline():
    """Build the (unfitted) feature-engineering pipeline.

    Four independent branches each select their own columns from the input
    frame and are concatenated side by side by a ``FeatureUnion``:

    * ``categories``  -- one-hot encoding of the categorical columns
    * ``binaries``    -- ``Binarizer`` on the count columns, then one-hot encoding
    * ``scaled``      -- ``RobustScaler`` on ``adr``
    * ``passthrough`` -- the stay-length columns, forwarded unchanged

    Returns
    -------
    sklearn.pipeline.Pipeline
        A pipeline with a single ``"features"`` step holding the union.
    """
    # --- Column groups ---------------------------------------------------
    categorical_columns = [
        "hotel",
        "meal",
        "distribution_channel",
        "reserved_room_type",
        "assigned_room_type",
        "customer_type",
    ]
    count_columns = [
        "total_of_special_requests",
        "required_car_parking_spaces",
        "booking_changes",
        "previous_bookings_not_canceled",
        "previous_cancellations",
    ]
    scaled_columns = ["adr"]
    untouched_columns = [
        "stays_in_week_nights",
        "stays_in_weekend_nights",
    ]

    # --- Branch 1: one-hot encode the categorical columns ------------------
    categories_branch = ColumnTransformer([
        (
            "one_hot_encode",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_columns,
        ),
    ])

    # --- Branch 2: binarize the count columns, then one-hot encode ---------
    binaries_branch = Pipeline([
        (
            "binarizer",
            ColumnTransformer([("binarizer", Binarizer(), count_columns)]),
        ),
        (
            "one_hot_encoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ])

    # --- Branch 3: scale the price column ----------------------------------
    scaled_branch = ColumnTransformer([
        ("scaler", RobustScaler(), scaled_columns),
    ])

    # --- Branch 4: forward the stay-length columns as they are -------------
    passthrough_branch = ColumnTransformer([
        ("pass_columns", "passthrough", untouched_columns),
    ])

    # --- Full preprocessing pipeline ---------------------------------------
    all_features = FeatureUnion([
        ("categories", categories_branch),
        ("binaries", binaries_branch),
        ("scaled", scaled_branch),
        ("passthrough", passthrough_branch),
    ])
    return Pipeline([("features", all_features)])

### Show Pipeline

In [9]:
from sklearn import set_config

# Build a fresh, unfitted preprocessing pipeline.
preprocessing_pipeline = build_pipeline()
# Ask scikit-learn to render estimators as a diagram when displayed.
set_config(display="diagram")
preprocessing_pipeline 

### Applying Pipeline

In [10]:
# Training set: the preprocessing pipeline is fitted here and applied in the same step.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
# Show the first two transformed rows.
X_train_transformed[:2,:]

array([[ 0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        , -0.77235056,  3.        ,
         1.        ],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.     

In [11]:
# Validation set: transform only, reusing the pipeline fitted on the training set.
X_eval_transformed = preprocessing_pipeline.transform(X_val)
# Show the first two transformed rows.
X_eval_transformed[:2,:]

array([[1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        , 0.        , 1.        ,
        0.        , 0.        , 1.        , 0.00352671, 2.        ,
        0.        ],
       [1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.     

In [12]:
# Test set: transform only, reusing the pipeline fitted on the training set.
X_test_transformed = preprocessing_pipeline.transform(X_test)
# Show the first two transformed rows.
X_test_transformed[:2,:]

array([[1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 1.        ,
        1.        , 0.        , 1.        , 0.        , 1.        ,
        0.        , 1.        , 0.        , 1.18497619, 1.        ,
        2.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.     

## **Modeling (For Multiple Algorithms)**

### Select the Algorithms

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Models
# The stochastic estimators get a fixed `random_state` so that the reported
# metrics are reproducible from one run to the next.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoostClassifier": XGBClassifier(random_state=42),
}

### **Training Pipeline**

In [25]:
def create_model_pipeline(models, preprocessing_pipeline):
    """Create one independent training pipeline per model.

    Each pipeline receives its own unfitted copy of the preprocessing
    pipeline and of the estimator. ``Pipeline`` does not clone its steps, so
    without the copies every pipeline would share (and refit) the very same
    objects, and a later training run would silently overwrite the fitted
    state held by previously trained pipelines.

    Parameters
    ----------
    models : dict
        Maps a model name to a scikit-learn compatible estimator.
    preprocessing_pipeline : estimator
        Feature-engineering transformer placed in front of every model.

    Returns
    -------
    list of (str, Pipeline)
        One ``(pipeline_name, pipeline)`` tuple per entry of ``models``, in
        the same order.
    """
    # Local import: `clone` is only needed by this function.
    from sklearn.base import clone

    model_pipeline_list = []

    for model_name, model in models.items():
        # Build the pipeline for this model from fresh, unfitted copies.
        model_pipeline = Pipeline([
            ("feature_engineering", clone(preprocessing_pipeline)),  # preprocessing pipeline
            (f'{model_name}_model', clone(model)),  # model
        ])
        # NOTE(review): the pipeline name starts with two spaces; it is kept
        # as-is because it is used as a dictionary key and as a label downstream.
        model_pipeline_list.append((f'  {model_name}_p', model_pipeline))

    return model_pipeline_list

In [26]:
def train_models_pipeline(models, preprocessing_pipeline, X_train, y_train):
    """Fit one pipeline per model and return them keyed by pipeline name.

    The pipelines come from ``create_model_pipeline``; each one is fitted on
    ``(X_train, y_train)`` in turn, with a progress message printed before
    and after every fit.
    """
    fitted_by_name = {}
    named_pipelines = create_model_pipeline(models, preprocessing_pipeline)

    for label, estimator in named_pipelines:
        print(f"{label} running...!")
        estimator.fit(X_train, y_train)
        # Only reached when the fit did not raise.
        fitted_by_name[label] = estimator
        print("success! \n")

    return fitted_by_name

In [27]:
# Fit every candidate model on the training split; the result maps pipeline name -> fitted pipeline.
trained_models=train_models_pipeline(models, preprocessing_pipeline, X_train, y_train)

  LogisticRegression_p training...!
success! 

  DecisionTree_p training...!
success! 

  RandomForest_p training...!
success! 

  XGBoostClassifier_p training...!
success! 



###  **Evaluation**

In [28]:
def eval_models(models, X_true, y_true):
    """Score every fitted model on ``(X_true, y_true)``.

    Parameters
    ----------
    models : dict
        Maps a model name to a fitted estimator exposing ``predict``.
    X_true : array-like
        Features to predict on.
    y_true : array-like
        Ground-truth labels matching ``X_true``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1-Score``.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # Result column -> metric function, in the order the columns appear.
    scorers = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1-Score": f1_score,
    }

    def _score_row(label, estimator):
        # Predict once, then reuse the predictions for every metric.
        predictions = estimator.predict(X_true)
        row = {"Model": label}
        for column, scorer in scorers.items():
            row[column] = scorer(y_true, predictions)
        return row

    return pd.DataFrame([_score_row(label, estimator) for label, estimator in models.items()])

In [93]:
# def model_train_eval(training_pipeline, X_train, y_train, X_val, y_val):
#     # train
#     print(f" training...!")
#     training_pipeline.fit(X_train, y_train)
#     print(f" success! \n")
#     model_dict = {"dt_model":training_pipeline}
#     result = eval_models(model_dict, X_val, y_val)
#     print(result)

#     return training_pipeline

## **Full Training**

In [31]:
#from joblib import dump

def full_training_run(file_path):
    """Run the whole workflow: load, split, preprocess, train and evaluate.

    Parameters
    ----------
    file_path : str
        Path of the bookings CSV.

    Returns
    -------
    tuple
        ``(trained_models_pipeline, results)``: the dictionary of pipelines
        fitted during *this* run, and the DataFrame of validation metrics
        produced by ``eval_models``.
    """
    # load data
    raw_dataset = load_data(file_path)
    # split data (the test split is held out and not used in this function)
    X_train, y_train, X_test, y_test, X_val, y_val = split_data(raw_dataset, 'is_canceled', train_size=0.8, val_test_proportion=0.5)
    # build preprocessing pipeline
    preprocessing_pipeline = build_pipeline()
    # training pipeline (uses the module-level `models` dictionary)
    trained_models_pipeline = train_models_pipeline(models, preprocessing_pipeline, X_train, y_train)
    # evaluation on the validation split
    results = eval_models(trained_models_pipeline, X_val, y_val)

    #dump(trained_models_pipeline, "inference_pipeline.joblib")
    # Return the pipelines trained above, not a same-named global left over
    # from an earlier cell.
    return trained_models_pipeline, results

In [32]:
# Relative path to the bookings CSV.
file_path = "../data/dataset.csv"
# Run the whole workflow end to end; keep the returned pipelines and the metrics table.
trained_pipeline, results = full_training_run(file_path)

  LogisticRegression_p training...!
success! 

  DecisionTree_p training...!
success! 

  RandomForest_p training...!
success! 

  XGBoostClassifier_p training...!
success! 



SHOW RESULTS

In [33]:
# Display the per-model metrics returned by the full training run.
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,LogisticRegression_p,0.774269,0.728867,0.626688,0.673926
1,DecisionTree_p,0.806684,0.750117,0.720747,0.735139
2,RandomForest_p,0.817573,0.76989,0.727273,0.747975
3,XGBoostClassifier_p,0.812045,0.803532,0.655266,0.721864
