In [None]:
# cleaner.py
"""
@brief:
This module is responsible for cleaning the data.
It will remove any inconsistencies in the data and make it ready for analysis.
@author: Alain Mugisha(U2083264)

"""

In [1]:
import pandas as pd
from pydantic import BaseModel, ValidationError
from pandas import DataFrame
from matplotlib import pyplot as pylt

In [None]:
class Booking(BaseModel):
    """
    Schema for one booking record, modelled on the columns of the csv file.

    Every attribute declares one column together with the type pydantic
    validates it against. ``Cleaner.validate_row`` builds an instance from a
    data frame row and treats a ``ValidationError`` as an invalid row.
    """

    hotel: str
    # Integer flag; the cleaning query compares it against 0 to find bookings
    # that were not cancelled.
    is_canceled: int
    lead_time: int
    # Arrival date, split over several columns. The month is declared as text
    # here; FeatureEngineering.create_month_year later maps it to a number.
    arrival_date_year: int
    arrival_date_month: str
    arrival_date_week_number: int
    arrival_date_day_of_month: int
    # The two night counts are summed into "duration" by
    # FeatureEngineering.create_duration.
    stays_in_weekend_nights: int
    stays_in_week_nights: int
    adults: int
    # Declared as int, so a value with a fractional part may be rejected
    # (depends on the installed pydantic version - TODO confirm).
    children: int
    babies: int
    meal: str
    # Missing countries are replaced with "N/A" by Cleaner.validate_data.
    country: str
    market_segment: str
    is_repeated_guest: int
    previous_cancellations: int
    previous_bookings_not_canceled: int
    reserved_room_type: str
    assigned_room_type: str
    booking_changes: int
    deposit_type: str
    days_in_waiting_list: int
    customer_type: str
    adr: float
    total_of_special_requests: int
    reservation_status: str
    # Kept as text; no date parsing happens in this model.
    reservation_status_date: str

In [None]:
class Cleaner:
    """
    Loads the raw bookings csv file and turns it into a validated data frame.

    Attributes:
        data_frame: The bookings data. The cleaning methods modify or replace
            it, so the attribute always holds the latest state.
    """

    def __init__(self, path: str = "hotel_bookings.csv"):
        """
        Reads the bookings csv file into ``self.data_frame``.

        Args:
            path: Location of the csv file. Defaults to ``hotel_bookings.csv``
                resolved against the current working directory.
        """
        self.data_frame: DataFrame = pd.read_csv(path)

    @staticmethod
    def validate_row(row) -> "Booking | None":
        """
        Validates one data frame row against the Booking class.

        Args:
            row: A row of the data frame (anything offering ``to_dict()``).

        Returns:
            The Booking built from the row, or None when validation fails.
            The validation error is printed rather than raised so that one bad
            row does not abort the whole clean-up.
        """
        try:
            return Booking(**row.to_dict())
        except ValidationError as error:
            print(f"Error validating row: {error}")
            return None

    def drop_columns(self):
        """
        Removes columns from the data frame that are not needed.

        Columns that are already absent are skipped, which makes it safe to
        run the clean-up more than once on the same instance.
        """
        self.data_frame.drop(
            columns=[
                "company",
                "required_car_parking_spaces",
                "distribution_channel",
            ],
            inplace=True,
            errors="ignore",
        )

    def drop_invalid_rows(self) -> DataFrame:
        """
        Drops rows that contain inconsistent values.

        Returns:
            The data frame without the inconsistent rows; the same frame is
            also stored in ``self.data_frame``.
        """
        # A booking that was not cancelled but spans zero nights is inconsistent.
        invalid_stay = self.data_frame.query(
            "stays_in_weekend_nights == 0 and stays_in_week_nights == 0 and is_canceled == 0"
        )
        self.data_frame = self.data_frame.drop(invalid_stay.index)
        return self.data_frame

    def validate_data(self) -> DataFrame:
        """
        Cleans the whole frame and returns only the rows that pass validation.

        Steps: fill missing ``children`` and ``country`` values, drop
        inconsistent rows and unneeded columns, discard rows that still hold
        missing values and validate the remainder against Booking.

        Returns:
            A new data frame built from the validated bookings. Rows rejected
            by ``validate_row`` are left out.
        """
        # Fill missing children counts with the column mean. Grouping by the
        # column itself (as done previously) never yields a value for the
        # missing rows, so the overall mean is used. It is rounded because
        # Booking declares ``children`` as an int.
        children = self.data_frame["children"]
        mean_children = children.mean()
        if not pd.isna(mean_children):
            self.data_frame["children"] = children.fillna(
                int(round(float(mean_children)))
            )

        # Text column, so a placeholder string is used instead of a statistic.
        self.data_frame["country"] = self.data_frame["country"].fillna("N/A")

        # Remove rows with inconsistent data
        self.data_frame = self.drop_invalid_rows()

        # Remove unnecessary columns
        self.drop_columns()
        complete_rows = self.data_frame.dropna()

        # validate_row returns None for rejected rows; they must be skipped
        # before the bookings are converted back to dictionaries.
        bookings = (self.validate_row(row) for _, row in complete_rows.iterrows())
        return DataFrame(
            [booking.dict() for booking in bookings if booking is not None]
        )

In [None]:
if __name__ == "__main__":
    # Numeric columns whose distribution is shown in the box plot.
    boxplot_columns = [
        "stays_in_week_nights",
        "stays_in_weekend_nights",
        "previous_cancellations",
        "booking_changes",
        "days_in_waiting_list",
    ]

    cleaner = Cleaner()

    # Plot the raw data first, then print the cleaned and validated frame.
    raw_values = cleaner.data_frame[boxplot_columns]
    pylt.boxplot(raw_values)
    pylt.show()

    validated_frame = cleaner.validate_data()
    print(validated_frame)

In [None]:
# eda.py
"""
@brief:
Provides classes for carrying out exploratory data analysis on data frames.
@author: Alain Mugisha(U2083264)
"""

In [None]:
from matplotlib import pyplot as pylt
from sklearn.cluster import KMeans
import pandas as pd

In [None]:
from plots import Charts, Maps
from cleaner import Cleaner

In [None]:
from feature_engineering import FeatureEngineering

In [None]:
class EDA:
    """
    Provides methods for carrying out exploratory analysis on the bookings.

    Attributes:
        charts: Chart helper used by the calling script.
        maps: Map helper used by the calling script.
        cleaner: The Cleaner that loaded the data.
        data_frame: The Cleaner's data frame, extended in place with the
            ``YearMonth`` and ``duration`` columns.
        fe: The FeatureEngineering instance working on ``data_frame``.
    """

    def __init__(self):
        """
        Loads the data and adds the engineered columns the analysis relies on.
        """
        self.charts = Charts()
        self.maps = Maps()
        self.cleaner = Cleaner()
        self.data_frame = self.cleaner.data_frame

        self.fe = FeatureEngineering(self.data_frame)
        self.fe.create_month_year()
        self.fe.create_duration()

    def guests_each_month(self):
        """
        Counts the bookings per Year/Month combination.

        Returns:
            A data frame with the columns ``YearMonth`` and ``count``, ordered
            chronologically.
        """
        # "YearMonth" is text such as "15/10", which sorts before "15/7".
        # Grouping by the year and month columns as well makes pandas order
        # the result by year first and month second.
        grouped = (
            self.data_frame.groupby(
                ["arrival_date_year", "arrival_date_month", "YearMonth"]
            )
            .size()
            .reset_index(name="count")
        )
        return grouped[["YearMonth", "count"]]

    def duration_of_stays(self):
        """
        Counts the bookings per binned duration of stay.

        Returns:
            A tuple of the data frame (columns ``bin_duration`` and ``count``)
            and the list of bin labels produced by the binning.
        """
        df, bin_labels = self.fe.binning("duration")

        grouped = (
            df.groupby("bin_duration")["bin_duration"]
            .count()
            .reset_index(name="count")
        )

        return grouped, bin_labels

    def get_geographical_origins(self):
        """
        Counts the bookings per country of origin.

        Returns:
            A data frame with the columns ``country`` and ``count`` holding the
            ten countries with the most bookings, largest first.
        """
        grouped = (
            self.data_frame.groupby("country")["country"]
            .count()
            .reset_index(name="count")
        )

        return grouped.sort_values(by="count", ascending=False).head(10)

In [None]:
if __name__ == "__main__":
    eda = EDA()

    # Line chart: bookings per month over the years.
    monthly_bookings = eda.guests_each_month()
    eda.charts.line_chart(
        monthly_bookings["YearMonth"].values.tolist(),
        monthly_bookings["count"].values.tolist(),
        ["Year/Month", "Number of Bookings"],
        "Bookings each month",
    )

    # Bar chart: how long guests stay, grouped into duration ranges.
    duration_counts, duration_labels = eda.duration_of_stays()
    eda.charts.bar_chart(
        values=duration_counts["count"].values.tolist(),
        properties=duration_labels,
        y_axis_lbl="Number of Bookings",
        title="Range of duration of stays",
        x_axis_lbl="Range of duration",
    )

    # Bar chart: the ten countries most bookings come from.
    top_countries = eda.get_geographical_origins()
    country_axis, count_axis = "country", "count"
    eda.charts.bar_chart(
        values=top_countries["count"].values.tolist(),
        properties=top_countries["country"].values.tolist(),
        y_axis_lbl=count_axis,
        title="Guest country of origin",
        x_axis_lbl=country_axis,
    )
    pylt.show()

In [None]:
# feature_engineering.py
"""
This module provides feature engineering capabilities for the project.
New fields are derived from the existing columns to support fast and effective predictions.
"""

In [None]:
import calendar
from typing import Tuple, List

In [None]:
from pandas import DataFrame, get_dummies, qcut

In [None]:
class FeatureEngineering:
    """
    Derives additional columns from a bookings data frame.

    Attributes:
        data_frame: The frame being worked on. Most methods add or rewrite
            columns on this very object, so the caller's frame changes too.
    """

    def __init__(self, data_frame: DataFrame):
        """
        Stores the data frame the feature engineering operates on.

        Args:
            data_frame: The bookings data; it is not copied.
        """
        self.data_frame = data_frame

    def create_month_year(self):
        """
        Creates a new property called YearMonth.

        ``arrival_date_month`` is rewritten as a month number,
        ``arrival_date_year`` as a two-digit text, and both are joined into
        ``YearMonth`` ("YY/M"). This gives more context to date sensitive
        questions. Calling the method again leaves the columns unchanged.

        Raises:
            ValueError: If a month name is not one of ``calendar.month_name``
                (which follows the active locale).
        """
        months = list(calendar.month_name)[1:]

        # Conversion to number format for easy graphical representation.
        # Values that are already numeric are kept, so a repeated call does
        # not fail on the numbers written by the first call.
        self.data_frame["arrival_date_month"] = self.data_frame[
            "arrival_date_month"
        ].map(lambda m: months.index(m) + 1 if isinstance(m, str) else m)

        # Keep the last two digits of the year. Splitting the text on "20"
        # returned an empty string for years such as 2020.
        self.data_frame["arrival_date_year"] = self.data_frame["arrival_date_year"].map(
            lambda y: f"{int(y) % 100:02d}"
        )

        self.data_frame["YearMonth"] = (
            self.data_frame["arrival_date_year"].astype(str)
            + "/"
            + self.data_frame["arrival_date_month"].astype(str)
        )

    def create_duration(self):
        """
        Creates a new duration property.

        It adds up the nights stayed on weekend days and on week days, which
        gives one concise measure of the length of a stay.
        """
        self.data_frame["duration"] = (
            self.data_frame["stays_in_weekend_nights"]
            + self.data_frame["stays_in_week_nights"]
        )

    def one_hot_encoding(self, properties):
        """
        Replaces the passed columns with one dummy column per distinct value.

        This matters for algorithms that rely on numerical values.

        Args:
            properties: Names of the columns to encode.

        Returns:
            The encoded data frame, which also replaces ``self.data_frame``.
        """
        self.data_frame = get_dummies(self.data_frame, columns=properties)
        return self.data_frame

    def binning(self, prop: str) -> Tuple[DataFrame, List[str]]:
        """
        Bins a column into (up to) ten quantile based ranges.

        The bin number of every row is stored in a new ``bin_<prop>`` column.

        Args:
            prop: Name of the column to bin.

        Returns:
            The data frame holding the new column and one text label per bin,
            formatted from the bin edges.
        """
        new_prop = f'bin_{prop}'
        # duplicates='drop' merges identical edges, so fewer than ten bins are
        # possible; the labels are therefore built from the returned edges.
        self.data_frame[new_prop], bin_edges = qcut(
            self.data_frame[prop],
            10,
            retbins=True,
            labels=False,
            duplicates='drop'
        )
        bin_labels = [
            f'({lower:.1f}, {upper:.1f})'
            for lower, upper in zip(bin_edges, bin_edges[1:])
        ]
        return self.data_frame, bin_labels

# feature_importance.py

In [None]:
# insights.py
"""
@brief:
This module uses the pre-processed dataframe from the cleaner
to derive insights from the data.

@author: Alain Mugisha (u2083264)
"""

In [None]:
from cleaner import Cleaner
from typing import List
from plots import Charts, Maps, pylt
from pandas import DataFrame, Series, crosstab
from typing import cast

In [None]:
from feature_engineering import FeatureEngineering

In [None]:
class Insights:
    """
    Produces insights from the cleaned and validated bookings.

    Attributes:
        cleaner: The Cleaner that loaded and validated the data.
        data_frame: The validated frame, extended with the ``YearMonth`` and
            ``duration`` columns.
        fe: The FeatureEngineering instance working on ``data_frame``.
    """

    def __init__(self):
        """
        Loads and validates the data and adds the engineered columns.
        """
        self.cleaner = Cleaner()
        self.data_frame = self.cleaner.validate_data()
        self.fe = FeatureEngineering(self.data_frame)
        self.fe.create_month_year()
        self.fe.create_duration()

    def _cancellation_percentage(self, hotel: str) -> float:
        """
        Returns the share (0-100) of the bookings of ``hotel`` that were cancelled.
        """
        hotel_rows = self.data_frame["hotel"] == hotel
        total = int(hotel_rows.sum())
        if total == 0:
            # No bookings for this hotel: report 0 instead of failing.
            return 0.0
        canceled = int((hotel_rows & (self.data_frame["is_canceled"] == 1)).sum())
        return canceled / total * 100

    def cancellation_percentage_per_hotel(self) -> List:
        """
        Calculates the percentage of cancelled bookings per hotel.

        Rows are counted directly. The former ``size / 32`` relied on the
        frame having exactly 32 columns.

        Returns:
            ``[resort_percentage, city_percentage]`` - the resort hotel comes
            first.
        """
        return [
            self._cancellation_percentage("Resort Hotel"),
            self._cancellation_percentage("City Hotel"),
        ]

    def most_ordered_meals(self) -> Series:
        """
        Returns the number of bookings per meal type.

        The result is a Series indexed by meal type.
        """
        return self.data_frame.groupby("meal")["meal"].count()

    def most_booked_room_types(self) -> Series:
        """
        Returns the number of bookings per reserved room type.

        The result is a Series indexed by room type.
        """
        # The focus is on the reserved room because at the time of
        # booking this is the room type given until it is potentially changed.
        return self.data_frame.groupby("reserved_room_type")[
            "reserved_room_type"
        ].count()

    def most_common_customer_types(self):
        """
        Returns the number of bookings per customer type.

        The result is a Series indexed by customer type.
        """
        return self.data_frame.groupby("customer_type")["customer_type"].count()

    def get_returning_guests(self) -> float:
        """
        Returns the number of bookings made by returning guests.

        The value is a whole number, returned as a float to keep the
        established return type.
        """
        return float((self.data_frame["is_repeated_guest"] == 1).sum())

    def correlation_between_columns(self, index, columns: list):
        """
        Cross tabulates one column against a list of other columns.

        Args:
            index: Name of the column whose values become the rows.
            columns: Names of the columns whose values become the columns.

        Returns:
            The contingency table. It is normalised per row, so every row
            holds the frequency distribution of the column values.
        """
        contingency_tbl = crosstab(
            index=self.data_frame[index],
            columns=[self.data_frame[col] for col in columns],
            normalize="index",
        )
        return contingency_tbl

In [None]:
if __name__ == "__main__":
    insight = Insights()
    plots = Charts()
    maps = Maps()

    # Percentage of cancellations per hotel. The values arrive as
    # [resort, city], so the labels are listed in that same order.
    values = insight.cancellation_percentage_per_hotel()
    properties = ["% Resort Hotel", "% City Hotel"]
    title = "Percentage of cancellation per type of hotel"
    plt_canc = plots.pie_chart(properties, values, title)

    # Most ordered meals. The labels are read from the result's index so
    # they always match the values, even if a category is absent.
    meals = insight.most_ordered_meals()
    plt_meals = plots.bar_chart(
        meals.values.tolist(),
        meals.index.tolist(),
        "Meal orders per meal type",
        "Most ordered meals per type",
    )

    # Most sought after room types
    room_types = insight.most_booked_room_types()
    plt_rooms = plots.bar_chart(
        room_types.values.tolist(),
        room_types.index.tolist(),
        "Number of bookings",
        "Most sought after rooms",
    )

    # Most common type of customer
    customer_types = insight.most_common_customer_types()
    plt_customer_types = plots.bar_chart(
        customer_types.values.tolist(),
        customer_types.index.tolist(),
        "Number of bookings",
        "Most common type of customers",
    )

    # Number of the returning customers
    returning_customers = insight.get_returning_guests()

    # Relation between room types and cancellations, shown as a plotted
    # crosstab because it exposes the frequency distribution between the
    # two columns.
    columns = ["assigned_room_type"]
    cont_tbl = insight.correlation_between_columns("is_canceled", columns)
    cont_tbl.plot(kind="bar", rot=90, stacked=True)
    # The table is indexed by is_canceled, so that is what the x axis shows;
    # the stacked segments are the room types.
    pylt.xlabel('Whether the booking is canceled')
    pylt.ylabel('Share of bookings per assigned room type')
    pylt.title('Correlation between the room types and cancellation status')
    cont_tbl.to_excel("cont_table.xlsx", sheet_name="Crosstab Data")

    # The heat map draws on the current axes; open a new figure so it does
    # not paint over the bar chart created above.
    pylt.figure()
    map_ = maps.heat_map(cont_tbl)
    pylt.show()

In [None]:
# plots.py
"""
@brief:
Provides methods to plot data. The methods are data agnostic for reuse.

@author: Alain Christian (U2083264)
"""

In [None]:
import matplotlib.pyplot as pylt
import geopandas as gpd
from matplotlib.pyplot import xticks

In [None]:
from pandas import DataFrame
import numpy as np
import random

In [None]:
class Charts:
    """
    This class houses methods for creating charts.

    Every method builds the chart on a new figure and returns the pyplot
    module, so the caller decides when to call ``show()``.
    """

    def __init__(self):
        self.pylt = pylt

    @staticmethod
    def _random_colors(count: int) -> list:
        """Returns ``count`` random, fully opaque RGBA colours."""
        return [
            (random.random(), random.random(), random.random(), 1)
            for _ in range(count)
        ]

    @staticmethod
    def bar_chart(
        values: list,
        properties: list,
        y_axis_lbl: str,
        title: str,
        x_axis_lbl: str | None = None,
    ):
        """
        Creates a bar chart using a list of provided values and properties.

        Args:
            values: Height of every bar.
            properties: Label of every bar, in the same order as ``values``.
            y_axis_lbl: Label of the y axis.
            title: Text shown as the title of the legend.
            x_axis_lbl: Optional label of the x axis.

        Returns:
            The pyplot module holding the new figure.
        """
        _, ax = pylt.subplots(figsize=(10, 8))
        ax.bar(
            properties,
            values,
            label=properties,
            color=Charts._random_colors(len(values)),
        )
        ax.set_ylabel(y_axis_lbl)
        if x_axis_lbl is not None:
            ax.set_xlabel(x_axis_lbl)
        # Rotate the labels matplotlib generated for the bars instead of
        # overwriting them, which needs no fixed tick positions.
        ax.tick_params(axis="x", labelrotation=45)
        ax.legend(title=title)

        return pylt

    def line_chart(self, properties: list, values: list, labels: list[str], title: str):
        """
        Creates a line chart using a list of provided values and properties.

        Args:
            properties: The x values.
            values: The y values, in the same order as ``properties``.
            labels: Two texts: the x axis label followed by the y axis label.
            title: Title of the chart.

        Returns:
            The pyplot module holding the new figure.
        """
        self.pylt.figure(figsize=(8, 6))
        self.pylt.plot(properties, values, marker="o", linestyle="-")
        self.pylt.title(title)
        self.pylt.xlabel(labels[0])
        self.pylt.ylabel(labels[1])
        self.pylt.grid(True)
        self.pylt.xticks(rotation=90)
        return self.pylt

    def pie_chart(self, properties: list, values: list, title: str):
        """
        Creates a pie chart using a list of provided values.

        Args:
            properties: Label of every slice, in the same order as ``values``.
            values: Size of every slice.
            title: Title of the chart.

        Returns:
            The pyplot module holding the new figure, like the other chart
            methods do.
        """
        _, ax = pylt.subplots(figsize=(15, 15))
        ax.pie(
            values,
            labels=properties,
            colors=self._random_colors(len(values)),
            autopct="%1.1f%%",
            startangle=90,
            pctdistance=0.55,
            labeldistance=1.0,
        )
        self.pylt.legend(properties, loc="center left", bbox_to_anchor=(1, 0.5), ncol=3)
        self.pylt.title(title)
        return self.pylt

In [None]:
class Maps:
    """
    This class presents methods that create various types of maps
    """

    def __init__(self):
        self.pylt = pylt

    def heat_map(self, contingency_tbl, annot=True, cmap="YlGnBu"):
        """
        Creates a heat map from the provided contingency table.

        The map is drawn on the current axes, so open a new figure first when
        it must not share axes with an earlier plot.

        Args:
            contingency_tbl: The table to draw; its columns and index are used
                as tick labels.
            annot: Accepted for compatibility; it is currently not used.
            cmap: Name of the colour map.

        Returns:
            The pyplot module holding the heat map.
        """
        self.pylt.imshow(contingency_tbl, cmap, aspect="auto")
        self.pylt.colorbar()
        self.pylt.xticks(np.arange(contingency_tbl.shape[1]), contingency_tbl.columns)
        self.pylt.yticks(np.arange(contingency_tbl.shape[0]), contingency_tbl.index)
        return self.pylt

    @staticmethod
    def choropleth_map(df: DataFrame):
        """
        Creates a choropleth map of the bookings per country.

        The country shapes are downloaded from the Natural Earth CDN, so the
        method needs network access.

        Args:
            df: Data frame with a ``country`` column, matched against the
                ``ISO_A3`` column of the shapes, and a ``count`` column that
                drives the colouring.

        Returns:
            The pyplot module holding the map.
        """
        url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
        world = gpd.read_file(url)

        # Left merge: keep every country shape and attach the counts where a
        # matching country exists. Countries without a match are drawn with
        # the "missing" style below.
        merged_frame = world.merge(df, how="left", left_on="ISO_A3", right_on="country")
        _, ax = pylt.subplots(1, 1, figsize=(14, 10))

        merged_frame.plot(
            column="count",
            ax=ax,
            legend=True,
            cmap="YlGnBu",
            missing_kwds={
                "color": "lightgrey",
                "edgecolor": "red",
                # Same pattern as before, with the backslash escaped so the
                # literal no longer contains an invalid escape sequence.
                "hatch": "/\\/",
                "label": "Unknown values",
            },
        )

        ax.set_title("Bookings per country")
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        pylt.legend(loc="lower right")

        ax.set_axis_off()
        return pylt

In [None]:
# training.py
"""
@brief:
Provides training functionality for the cleaned and prepared
data presented as a dataframe
"""
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from feature_engineering import FeatureEngineering

In [None]:
from cleaner import Cleaner
from plots import Charts

In [None]:
class Train:
    """
    Provides methods to train and evaluate a cancellation classifier.

    Attributes:
        data_frame: The validated bookings after feature engineering and
            one-hot encoding.
        f_eng: The FeatureEngineering instance used to prepare the data.
        charts: Chart helper used to plot the feature importance.
    """

    def __init__(self) -> None:
        """
        Loads the validated data and prepares it for training.
        """
        self.data_frame = Cleaner().validate_data()
        self.f_eng = FeatureEngineering(data_frame=self.data_frame)

        # Derive the extra columns first, then encode the text columns.
        self.f_eng.create_month_year()
        self.f_eng.create_duration()
        self.data_frame = self.f_eng.one_hot_encoding(
            properties=[
                "customer_type",
                "assigned_room_type",
                "deposit_type",
                "reservation_status",
                "meal",
                "hotel",
                "arrival_date_month",
                "country",
                "market_segment",
                "reserved_room_type",
                "reservation_status_date",
                "YearMonth",
            ]
        )
        self.charts = Charts()

    def split(self):
        """
        Splits the dataset into features and labels, for training and testing.

        Returns:
            ``(x_train, x_test, y_train, y_test)``, with 20% of the rows in
            the test part. The fixed random state keeps the split repeatable.

        Raises:
            KeyError: If one of the expected columns (for example a dummy
                column of a value that does not occur in the data) is missing.
        """
        x = self.data_frame[
            [
                "stays_in_week_nights",
                "stays_in_weekend_nights",
                "previous_cancellations",
                "previous_bookings_not_canceled",
                "booking_changes",
                "days_in_waiting_list",
                "is_repeated_guest",
                "lead_time",
                "deposit_type_Non Refund",
                "adr",
                "assigned_room_type_A",
                "customer_type_Transient",
                "market_segment_Groups",
                "country_PRT",
                "hotel_City Hotel",
                "hotel_Resort Hotel",
                "meal_FB",
            ]
        ]
        y = self.data_frame["reservation_status_Canceled"]
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=42
        )

        return (x_train, x_test, y_train, y_test)

    def train(self):
        """
        Trains a RandomForestClassifier on the training part of the data.

        Returns:
            ``(rf, y_test, y_pred)``: the fitted model, the true labels of the
            test part and the labels the model predicted for it.
        """
        x_train, x_test, y_train, y_test = self.split()
        rf = RandomForestClassifier(n_estimators=100, random_state=42)

        rf.fit(x_train, y_train)

        y_pred = rf.predict(x_test)
        return rf, y_test, y_pred

    def evaluate(self, rf, y_test, y_pred):
        """
        Evaluates the model and plots the feature importance.

        Prints precision, recall, F1-score, accuracy, AUC-ROC and the
        confusion matrix, then shows a line chart of the importance of every
        feature.

        Args:
            rf: The fitted model; only ``feature_importances_`` is read.
            y_test: The true labels.
            y_pred: The predicted labels.

        Returns:
            A dictionary holding the computed metrics.
        """
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        conf_matx = confusion_matrix(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-Score:", f1)
        print(f"Accuracy: {accuracy: .2f}")

        # NOTE(review): computed from the predicted labels, not from
        # predicted probabilities, because only the labels are passed in.
        roc_auc = roc_auc_score(y_test, y_pred)
        print("AUC-ROC:", roc_auc)

        print("Confusion Matrix: \n", conf_matx)

        importances = rf.feature_importances_
        feature_positions = list(range(len(importances)))
        # line_chart expects [x label, y label]: the x axis carries the
        # feature position and the y axis its importance score.
        chart = self.charts.line_chart(
            feature_positions, importances, ["Feature", "Score"], "Feature importance"
        )
        chart.show()

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "roc_auc": roc_auc,
            "confusion_matrix": conf_matx,
        }

In [None]:
if __name__ == "__main__":
    # Prepare the data, fit the model, then report how well it predicts.
    trainer = Train()
    fitted_model, true_labels, predicted_labels = trainer.train()
    trainer.evaluate(fitted_model, true_labels, predicted_labels)