# **HOTEL BOOKING CANCELLATION PREDICTION**

![hotel booking](../images/image-hotel.png)

## **Read and Pre-Clean the Data**

### Import Libraries

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [22]:
import mlflow

### Load data

Functions for loading and pre-cleaning the data

In [23]:
# Modularize data loading and initial cleaning
def inicial_clean(df):
    """Return a cleaned copy of the raw hotel-bookings frame.

    The input frame is left untouched, so this function can safely be called
    again on the same raw data. (An in-place version fails with ``KeyError``
    on a second call because the columns have already been removed.)

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings data. Must contain every column dropped below.

    Returns
    -------
    pandas.DataFrame
        A new frame without the personal-information and leakage columns,
        with every ``object``-dtype column cast to ``str``.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``df``.
    """
    # Columns with personal information about customers
    personal_columns = ['name', 'email', 'phone-number', 'credit_card']
    # Columns treated as data leakage for the cancellation target
    leakage_columns = ['reservation_status', 'reservation_status_date']

    # `drop` without `inplace=True` returns a new frame; the caller's is kept intact
    df_clean = df.drop(columns=personal_columns + leakage_columns)

    # Convert objects to strings
    obj_columns = df_clean.select_dtypes('object').columns
    df_clean[obj_columns] = df_clean[obj_columns].astype(str)
    return df_clean

def load_data(file_path):
    """Read the bookings CSV and return it after the initial clean.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The result of ``inicial_clean`` applied to the loaded frame
        (unnecessary columns dropped, object columns cast to ``str``).
    """
    return inicial_clean(pd.read_csv(file_path))

In [24]:
# file_path = "../data/dataset.csv"
# df_hotel_booking = load_data(file_path)
# df_hotel_booking.head()

In [25]:
#df_hotel_booking.info()

## **Split the Data**

**Save the size of the datasets**

In [26]:
from sklearn.model_selection import train_test_split

def split_data(df, target:str, train_size=0.8, val_test_proportion=0.5, random_state=None):
    """Split a frame into train / test / validation sets and log their sizes.

    The data is split twice: first into a training part and a remainder, then
    the remainder into test and validation parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the target column.
    train_size : float or int, default 0.8
        Forwarded as ``train_size`` of the first ``train_test_split`` call.
    val_test_proportion : float or int, default 0.5
        Forwarded as ``train_size`` of the second split, i.e. the share of the
        remainder that goes to the TEST set; what is left is the validation set.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to both ``train_test_split`` calls. Leave as ``None``
        for the previous (non-deterministic) behaviour; pass an int to obtain
        a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``.

    Side effects
    ------------
    Logs the dataset sizes to MLflow with ``mlflow.log_params``. When
    ``random_state`` is given it is logged too, as ``split_random_state``.
    """
    original_size = df.shape[0]

    X = df.drop(columns=[target])  # independent variables
    y = df[target].copy()          # dependent variable (target)

    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )
    # Note the unpacking order: the `train_size` share of the remainder is the
    # test set, the rest is the validation set.
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, train_size=val_test_proportion, random_state=random_state
    )

    # --------------------------------------------
    # Save the size of the datasets using "mlflow"
    # --------------------------------------------
    split_params = {
        'dataset_size': original_size,
        'training_set_size': len(X_train),
        'validate_set_size': len(X_val),
        'test_set_size': len(X_test),
    }
    # Only record the seed when one was supplied, so default runs log exactly
    # the same parameters as before.
    if random_state is not None:
        split_params['split_random_state'] = random_state
    mlflow.log_params(split_params)

    return X_train, y_train, X_test, y_test, X_val, y_val

In [27]:
# X_train, y_train, X_test, y_test, X_val, y_val = split_data(df_hotel_booking, 'is_canceled', train_size=0.8, val_test_proportion=0.5)
# print("X_train, X_test, X_val")
# print(X_train.shape, X_test.shape, X_val.shape)
# print(len(X_train), len(X_test), len(X_val))

## **Pipeline for Pre Processing**

### Building Pipeline

**Save the preprocessing using mlflow**

In [28]:
from sklearn.preprocessing import OneHotEncoder, Binarizer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

def build_pipeline():
    """Assemble the (unfitted) feature-engineering pipeline.

    Four column groups are transformed independently, each through its own
    ``ColumnTransformer``, and concatenated by a ``FeatureUnion`` in this
    order:

    1. ``categories``  - one-hot encoding of the categorical columns.
    2. ``binaries``    - ``Binarizer`` (default settings) on the count columns,
       followed by a one-hot encoding of the binarized values.
    3. ``scaled``      - ``RobustScaler`` on ``adr``.
    4. ``passthrough`` - the stay-length columns, unchanged.

    Side effects
    ------------
    Logs to MLflow the list of one-hot encoded columns (``ohe_columns``) and
    the parameters of the categorical encoder (prefixed with ``encoder_``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        A pipeline with a single step named ``"features"`` holding the union.
    """
    # ------------------------------------------------------------------
    # Branch 1: categorical columns -> one-hot encoding
    # ------------------------------------------------------------------
    categorical_columns = [
        "hotel",
        "meal",
        "distribution_channel",
        "reserved_room_type",
        "assigned_room_type",
        "customer_type",
    ]
    categorical_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    # Record which columns are encoded and how the encoder is configured
    mlflow.log_param('ohe_columns', categorical_columns)
    mlflow.log_params({
        f"encoder_{name}": setting
        for name, setting in categorical_encoder.get_params().items()
    })

    categories_branch = ColumnTransformer([
        ('one_hot_encode', categorical_encoder, categorical_columns),
    ])

    # ------------------------------------------------------------------
    # Branch 2: count columns -> binarize first, then one-hot encode
    # ------------------------------------------------------------------
    count_columns = [
        "total_of_special_requests",
        "required_car_parking_spaces",
        "booking_changes",
        "previous_bookings_not_canceled",
        "previous_cancellations",
    ]
    binaries_branch = Pipeline([
        # step 1: turn the counts into binary indicator variables
        ("binarizer", ColumnTransformer([
            ('binarizer', Binarizer(), count_columns),
        ])),
        # step 2: one-hot encode the binarized output
        ("one_hot_encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ])

    # ------------------------------------------------------------------
    # Branch 3: robust scaling of the "adr" column
    # ------------------------------------------------------------------
    scaled_branch = ColumnTransformer([
        ("scaler", RobustScaler(), ["adr"]),
    ])

    # ------------------------------------------------------------------
    # Branch 4: columns forwarded without any transformation
    # ------------------------------------------------------------------
    untouched_columns = [
        "stays_in_week_nights",
        "stays_in_weekend_nights",
    ]
    passthrough_branch = ColumnTransformer([
        ("pass_columns", "passthrough", untouched_columns),
    ])

    # ------------------------------------------------------------------
    # Full preprocessing pipeline: concatenate the four branches
    # ------------------------------------------------------------------
    feature_union = FeatureUnion([
        ('categories', categories_branch),
        ('binaries', binaries_branch),
        ('scaled', scaled_branch),
        ('passthrough', passthrough_branch),
    ])

    return Pipeline([("features", feature_union)])

### Show Pipeline

In [29]:
# from sklearn import set_config

# preprocessing_pipeline = build_pipeline()
# set_config(display="diagram")
# preprocessing_pipeline 

### Applying Pipeline

In [30]:
# # Training set
# X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
# X_train_transformed[:2,:]

In [31]:
# # Validation set
# X_eval_transformed = preprocessing_pipeline.transform(X_val)
# X_eval_transformed[:2,:]

In [32]:
# # Test set
# X_test_transformed = preprocessing_pipeline.transform(X_test)
# X_test_transformed[:2,:]

## **Modeling (For Multiple Algorithms)**

### **Training Pipeline**

In [33]:
from sklearn.ensemble import RandomForestClassifier

def build_training_pipeline(n_estimators=10, random_state=None):
    """Build the full (unfitted) pipeline: feature engineering + classifier.

    Parameters
    ----------
    n_estimators : int, default 10
        Number of trees of the ``RandomForestClassifier``. The default matches
        the value that used to be hard-coded.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to the ``RandomForestClassifier``. ``None`` keeps the
        previous non-deterministic behaviour; pass an int for reproducible
        training.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"feature_engineering"`` and ``"model"``.

    Side effects
    ------------
    Calls ``build_pipeline`` (which logs its own parameters to MLflow) and
    then logs every classifier parameter, prefixed with ``model__``.
    """
    preprocessing_pipeline = build_pipeline()

    # Machine learning model
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # -----------------------
    # Save the parameters -> "mlflow"
    # -----------------------
    mlflow.log_params({
        f"model__{key}": value for key, value in model.get_params().items()
    })

    # Full pipeline
    return Pipeline([
        ("feature_engineering", preprocessing_pipeline),
        ("model", model),
    ])

In [34]:
def model_train(training_pipeline, X_train, y_train):
    """Fit ``training_pipeline`` on the training data.

    Prints a short progress message before and after fitting.

    Parameters
    ----------
    training_pipeline : object with a ``fit(X, y)`` method
        The pipeline (or estimator) to train.
    X_train, y_train
        Training features and target, passed unchanged to ``fit``.

    Returns
    -------
    Whatever ``training_pipeline.fit`` returns.
    """
    print(" training...!")
    fitted = training_pipeline.fit(X_train, y_train)
    print(" success! \n")
    return fitted

###  **Evaluation**

**Save Metrics**

In [35]:
def eval_models(models, X_true, y_true):
    """Score every model on the given data and log the metrics to MLflow.

    Parameters
    ----------
    models : dict
        Maps a model name to a fitted object exposing ``predict``.
    X_true
        Features to predict on.
    y_true
        Ground-truth labels compared against the predictions.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1-Score``.

    Side effects
    ------------
    For each model, logs the four metrics with ``mlflow.log_metrics`` under
    the keys ``<name>_Accuracy``, ``<name>_Precision``, ``<name>_Recall`` and
    ``<name>_F1``.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # (result-table column, MLflow key suffix, scoring function), in the
    # order in which the metrics are computed and reported
    scorers = (
        ("Accuracy", "Accuracy", accuracy_score),
        ("Precision", "Precision", precision_score),
        ("Recall", "Recall", recall_score),
        ("F1-Score", "F1", f1_score),
    )

    rows = []
    for model_name, model in models.items():
        # predict
        y_pred = model.predict(X_true)

        # evaluate
        row = {"Model": model_name}
        logged = {}
        for column, suffix, scorer in scorers:
            score = scorer(y_true, y_pred)
            row[column] = score
            logged[f"{model_name}_{suffix}"] = score
        rows.append(row)

        # Save metrics -> "mlflow"
        mlflow.log_metrics(logged)

    return pd.DataFrame(rows)

## **Full Training**

**Save Experiment**

In [36]:
#from joblib import dump

def full_training_run(file_path):
    """Run load -> split -> train -> evaluate inside a single MLflow run.

    Parameters
    ----------
    file_path : str or path-like
        Location of the bookings CSV, forwarded to ``load_data``.

    Returns
    -------
    tuple
        ``(fitted_pipeline, metrics_table)``: the trained pipeline and the
        DataFrame of metrics computed on the validation split.

    Notes
    -----
    Only parameters and metrics are logged (by the helper functions). The
    fitted pipeline itself is neither written to disk nor logged as an MLflow
    artifact here, and the test split is produced but not used.
    """
    # Select the experiment, then open the run that groups everything logged below
    mlflow.set_experiment("/turism/hotel-booking-cancellation")
    with mlflow.start_run():
        bookings = load_data(file_path)

        # The test split is intentionally left unused in this function
        X_train, y_train, _X_test, _y_test, X_val, y_val = split_data(
            bookings, 'is_canceled', train_size=0.8, val_test_proportion=0.5
        )

        # Build the preprocessing + model pipeline and fit it
        fitted_pipeline = model_train(build_training_pipeline(), X_train, y_train)

        # Evaluate the single trained model on the validation split
        metrics_table = eval_models({'rf_model': fitted_pipeline}, X_val, y_val)

    return fitted_pipeline, metrics_table

**Run the full pipeline**

In [37]:
# Relative path to the bookings dataset (CSV)
file_path = "../data/dataset.csv"
# Execute the whole workflow; returns the fitted pipeline and the metrics table
trained_pipeline, results = full_training_run(file_path)

 training...!
 success! 



In [38]:
#mlflow.end_run()

SHOW RESULTS

In [39]:
# Display the metrics table returned by full_training_run
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,rf_model,0.813468,0.765054,0.708571,0.73573
