# **HOTEL BOOKING CANCELLATION PREDICTION**

![hotel booking](../images/image-hotel.png)

## **Read and Pre-Clean the Data**

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load data

Helper functions for loading and pre-cleaning the data

In [2]:
# Modularize the data loading and the initial cleaning
def inicial_clean(df):
    """Return a pre-cleaned copy of the raw bookings frame.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely and the raw data stays available for inspection.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings data. Must contain the personal-information columns and
        the two ``reservation_status*`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        New frame without the dropped columns and with every ``object``
        column cast to ``str``.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    """
    # Personal information about customers
    personal_columns = ['name', 'email', 'phone-number', 'credit_card']
    # Data leakage: these columns are removed so they cannot reach the model
    leakage_columns = ['reservation_status', 'reservation_status_date']

    # drop() without inplace returns a new frame, so the caller's df is not mutated
    df_clean = df.drop(columns=personal_columns + leakage_columns)

    # Convert objects to strings
    obj_columns = df_clean.select_dtypes('object').columns
    df_clean[obj_columns] = df_clean[obj_columns].astype(str)
    return df_clean

def load_data(file_path):
    """Read the bookings CSV at ``file_path`` and return it after the initial clean.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``inicial_clean`` returns for the loaded frame.
    """
    raw_bookings = pd.read_csv(file_path)
    # Drop unnecessary columns and correct the data types
    return inicial_clean(raw_bookings)

In [3]:
# Relative path: resolved against the directory the notebook kernel runs in
file_path = "../data/dataset.csv"
# Load the CSV and apply the initial clean in one step
df_hotel_booking = load_data(file_path)
# Preview the first rows of the cleaned frame
df_hotel_booking.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,,,0,Transient,0.0,0,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,,,0,Transient,0.0,0,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,,,0,Transient,75.0,0,0
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,304.0,,0,Transient,75.0,0,0
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,240.0,,0,Transient,98.0,0,1


In [4]:
# Column dtypes and non-null counts of the cleaned frame
df_hotel_booking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 30 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

## **Split the Data**

In [None]:
from sklearn.model_selection import train_test_split

def split_data(df, train_size=0.6, random_state=None):
    """Split the bookings frame into train, test and validation sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the ``is_canceled`` target column.
    train_size : float, default 0.6
        Share of the rows used for training. The remaining rows are divided
        evenly between the test and the validation set.
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` calls. Pass an integer to
        get the same split on every run; ``None`` keeps the previous
        (unseeded) behaviour.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``. Note that this
        order groups features with their target per subset, unlike the order
        returned by ``train_test_split`` itself.
    """
    X = df.drop(['is_canceled'], axis=1).copy()  # independent variables
    y = df['is_canceled'].copy()  # dependent variable (target)

    # First cut: training set versus everything else
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )
    # Second cut: halve the remainder into test and validation
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, train_size=0.5, random_state=random_state
    )

    return X_train, y_train, X_test, y_test, X_val, y_val

In [8]:
# Build the three subsets through split_data: X and y are not defined at
# notebook level, so splitting them directly here fails on a fresh kernel.
X_train, y_train, X_test, y_test, X_val, y_val = split_data(df_hotel_booking)

# Sanity check: sizes of the train / test / validation sets
print(len(X_train), len(X_test), len(X_val))

71634 23878 23878


## **Pipeline for Pre Processing**

### Building Pipeline

In [9]:
from sklearn.preprocessing import OneHotEncoder, Binarizer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline


# -----------------------------------------------------------------------------------------------------
# One-hot encoder
# -----------------------------------------------------------------------------------------------------
# Dense output (sparse_output=False); categories not seen during fit are
# ignored at transform time instead of raising (handle_unknown="ignore").
internal_ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# Categorical columns that get one-hot encoded
columns_to_encode = [
    "hotel",
    "meal", 
    "distribution_channel", 
    "reserved_room_type", 
    "assigned_room_type", 
    "customer_type"
]

# Applies the encoder to the listed columns only
one_hot_encoding = ColumnTransformer([
    (
        'one_hot_encode', internal_ohe, columns_to_encode
    )
])


# -----------------------------------------------------------------------------------------------------
# Binarizer
# -----------------------------------------------------------------------------------------------------
# Binarizer with its default settings
internal_binarizer = Binarizer()
# Columns that are binarized before being encoded
columns_to_binarize = [
    "total_of_special_requests", 
    "required_car_parking_spaces", 
    "booking_changes", 
    "previous_bookings_not_canceled", 
    "previous_cancellations",
]
# First, binarize the selected columns
binarizer = ColumnTransformer([
    (
        'binarizer', internal_binarizer, columns_to_binarize
    )
])
# Then, one-hot encode the binarized values
internal_encoder_binarizer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# Two-step pipeline: binarize, then one-hot encode the result
one_hot_binarized = Pipeline([
    ("binarizer", binarizer),
    ("one_hot_encoder", internal_encoder_binarizer),
])


# -----------------------------------------------------------------------------------------------------
# Scaler
# -----------------------------------------------------------------------------------------------------
# RobustScaler with its default settings
internal_scaler = RobustScaler()
# Only the "adr" column is scaled
columns_to_scale = ["adr"]

# Applies the scaler to the listed column only
scaler = ColumnTransformer([
    ("scaler", internal_scaler, columns_to_scale)
])


# -----------------------------------------------------------------------------------------------------
# Passthrough columns
# -----------------------------------------------------------------------------------------------------
# Columns forwarded to the model without any transformation
pass_columns = [
    "stays_in_week_nights",
    "stays_in_weekend_nights",
]

# "passthrough" tells ColumnTransformer to emit the listed columns unchanged
passthrough = ColumnTransformer([
    ("pass_columns", "passthrough", pass_columns)
])


# -----------------------------------------------------------------------------------------------------
# Full Preprocessing Pipeline
# -----------------------------------------------------------------------------------------------------
# Single-step pipeline whose only step is a FeatureUnion of the four
# transformers defined above: one-hot categories, binarized + encoded columns,
# scaled "adr", and the passthrough columns, combined in that listed order.
feature_engineering_pipeline  = Pipeline([
    (
        "features",
        FeatureUnion([
            ('categories', one_hot_encoding),
            ('binaries', one_hot_binarized),
            ('scaled', scaler),
            ('passthrough', passthrough)
        ])
    )
])

### Show Pipeline

In [10]:
from sklearn import set_config

# Ask scikit-learn to display estimators as a diagram
set_config(display="diagram")
# Last expression of the cell: displays the pipeline
feature_engineering_pipeline 

### Applying Pipeline

In [11]:
# Training set: fit the preprocessing pipeline on X_train and transform it
X_train_transformed = feature_engineering_pipeline.fit_transform(X_train)
# Show the first two transformed rows
X_train_transformed[:2,:]

array([[ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        , -0.17192982,  3.        ,
         0.        ],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.     

In [12]:
# Test set: transform only, reusing the pipeline already fitted on X_train
X_test_transformed = feature_engineering_pipeline.transform(X_test)
# Show the first two transformed rows
X_test_transformed[:2,:]

array([[ 0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  2.33438596,  4.        ,
         2.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.     

## **Modeling**

### Select the Algorithms

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# Fixed seed so the tree-based models are built the same way on every run
RANDOM_STATE = 42

lr_model = LogisticRegression(max_iter=1000)
dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
# SVM is currently disabled
#svc_model = SVC(kernel='linear')

# Dictionary of models: display name -> unfitted estimator
models = {
    "Logistic Regression": lr_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    #"SVM": svc_model
}


### Train the Models

In [14]:
# Function to train the models
def train_models(models, X_train, y_train):
    """Fit every estimator in ``models`` and report progress on stdout.

    Parameters
    ----------
    models : dict
        Mapping of display name -> estimator exposing ``fit(X, y)``.
    X_train, y_train
        Training features and target, passed to each ``fit`` call as-is.

    Returns
    -------
    dict
        Mapping of display name -> the same estimator object, now fitted
        (estimators are fitted in place, not copied).
    """
    fitted = {}
    for label in models:
        estimator = models[label]
        print(f"{label} training...!")
        estimator.fit(X_train, y_train)
        fitted[label] = estimator
        print(f"{label} success! \n")
    return fitted

In [15]:
# Fit every candidate model on the transformed training features
trained_models=train_models(models, X_train_transformed, y_train)

Logistic Regression training...!
Logistic Regression success! 

Decision Tree training...!
Decision Tree success! 

Random Forest training...!
Random Forest success! 



## **Evaluation**

In [16]:
def eval_models(models, X_test, y_test):
    """Score every fitted model on the given data and tabulate the metrics.

    Parameters
    ----------
    models : dict
        Mapping of display name -> fitted estimator exposing ``predict(X)``.
    X_test, y_test
        Features and true labels the models are scored against.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1-Score``.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # Column name -> scoring function, in the order the columns should appear
    scorers = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1-Score": f1_score,
    }

    rows = []
    for label, estimator in models.items():
        predictions = estimator.predict(X_test)
        row = {"Model": label}
        for metric_name, scorer in scorers.items():
            row[metric_name] = scorer(y_test, predictions)
        rows.append(row)
    return pd.DataFrame(rows)

In [17]:
# Score every trained model on the transformed test set
models_values_eval = eval_models(trained_models, X_test_transformed, y_test)
# Display the metrics table
models_values_eval

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.775274,0.722778,0.622689,0.669011
1,Decision Tree,0.791817,0.717477,0.708003,0.712709
2,Random Forest,0.808066,0.744547,0.721208,0.732692


In [20]:
from sklearn.metrics import classification_report

def show_report_eval_models(models, X_test, y_test):
    """Print a classification report for every fitted model.

    Parameters
    ----------
    models : dict
        Mapping of display name -> fitted estimator exposing ``predict(X)``.
    X_test
        Features to predict on (already transformed the same way as the
        training data).
    y_test
        True labels matching ``X_test``.

    Returns
    -------
    None
        The reports are written to stdout.
    """
    for model_name, model in models.items():
        # Predict on the X_test argument rather than a notebook-level variable,
        # so the report always corresponds to the y_test that was passed in.
        y_pred = model.predict(X_test)
        print(f'>> {model_name.upper()} <<')
        print(classification_report(y_test, y_pred), '\n')

In [21]:
# Print the per-class report of each trained model on the transformed test set
show_report_eval_models(trained_models, X_test_transformed, y_test)

>> LOGISTIC REGRESSION <<
              precision    recall  f1-score   support

           0       0.80      0.86      0.83     15169
           1       0.72      0.62      0.67      8709

    accuracy                           0.78     23878
   macro avg       0.76      0.74      0.75     23878
weighted avg       0.77      0.78      0.77     23878
 

>> DECISION TREE <<
              precision    recall  f1-score   support

           0       0.83      0.84      0.84     15169
           1       0.72      0.71      0.71      8709

    accuracy                           0.79     23878
   macro avg       0.78      0.77      0.77     23878
weighted avg       0.79      0.79      0.79     23878
 

>> RANDOM FOREST <<
              precision    recall  f1-score   support

           0       0.84      0.86      0.85     15169
           1       0.74      0.72      0.73      8709

    accuracy                           0.81     23878
   macro avg       0.79      0.79      0.79     23878
weig