In [None]:
# plots.py
"""
@brief:
Provides methods to plot data. The methods are data agnostic for reuse.

@author: Alain Christian (U2083264)
"""

In [None]:
import matplotlib.pyplot as plt
import random

In [None]:
class Charts:
    """
    Collection of reusable chart builders that make no assumption about
    what the plotted data represents.
    """

    def __init__(self):
        pass

    def bar_chart(self, values: list, properties: list, y_axis_lbl: str, title: str):
        """
        Draw a bar chart with one randomly coloured bar per value.

        Args:
            values: Bar heights, one per entry in ``properties``.
            properties: Category names; used both as the x positions and as
                the legend labels of the bars.
            y_axis_lbl: Text for the y axis label.
            title: Text shown as the title of the legend.

        Returns:
            The ``matplotlib.pyplot`` module, so the caller can decide when
            to show the figure.
        """
        figure, axes = plt.subplots()

        # One opaque RGBA colour per bar, drawn at random on every call.
        bar_colors = []
        for _ in values:
            red, green, blue = random.random(), random.random(), random.random()
            bar_colors.append((red, green, blue, 1))

        axes.bar(properties, values, label=properties, color=bar_colors)
        axes.set_ylabel(y_axis_lbl)
        axes.legend(title=title)

        return plt

In [None]:
# insights.py
"""
@brief:
This module uses the pre-processed dataframe from the cleaner
to derive insights from the data.

@author: Alain Mugisha (u2083264)
"""

In [None]:
from cleaner import Cleaner
from typing import List
from plots import Charts
from pandas import DataFrame, Series
from typing import cast

In [None]:
class Insights:
    """
    Derives booking insights from the cleaned hotel bookings data frame.

    On construction the data is loaded and validated through ``Cleaner``;
    every method then reads from ``self.data_frame``.
    """

    def __init__(self):
        self.cleaner = Cleaner()
        self.data_frame = self.cleaner.validate_data()

    def _cancellation_percentage(self, hotel: str) -> float:
        """
        Returns the percentage of bookings for ``hotel`` that were canceled.

        Returns 0.0 when the data frame holds no booking for that hotel, so
        a missing hotel does not abort the whole calculation.
        """
        is_hotel = self.data_frame["hotel"] == hotel
        total = int(is_hotel.sum())
        if total == 0:
            return 0.0

        # Rows are counted through the boolean masks rather than through
        # ``DataFrame.size``, which is rows * columns and would tie the
        # result to the number of columns in the frame.
        is_canceled = self.data_frame["is_canceled"] == 1
        canceled = int((is_hotel & is_canceled).sum())
        return canceled / total * 100

    def cancellation_percentage_per_hotel(self) -> List:
        """
        Calculates the percentage of canceled bookings per hotel.

        Returns:
            A two element list ordered as
            ``[resort hotel percentage, city hotel percentage]``.
        """
        resort_perc = self._cancellation_percentage("Resort Hotel")
        city_perc = self._cancellation_percentage("City Hotel")

        return [resort_perc, city_perc]

    def most_ordered_meals(self) -> Series:
        """
        Returns the number of bookings per meal type.

        The result is a Series indexed by meal type, in the sorted key order
        produced by ``groupby`` (it is not sorted by count).
        """
        meals = self.data_frame.groupby("meal")["meal"].count()
        return cast(Series, meals)

    def most_booked_room_types(self) -> Series:
        """
        Returns the number of bookings per reserved room type.

        The result is a Series indexed by room type, in the sorted key order
        produced by ``groupby`` (it is not sorted by count).
        """
        # The focus is on the reserved room because at the time of
        # booking this is the room type given until it is potentially changed.
        room_type = self.data_frame.groupby("reserved_room_type")[
            "reserved_room_type"
        ].count()
        return cast(Series, room_type)

    def most_common_customer_types(self) -> Series:
        """
        Returns the number of bookings per customer type.

        The result is a Series indexed by customer type, in the sorted key
        order produced by ``groupby`` (it is not sorted by count).
        """
        customer_type = self.data_frame.groupby("customer_type")[
            "customer_type"
        ].count()
        return cast(Series, customer_type)

In [None]:
if __name__ == "__main__":
    insight = Insights()
    plots = Charts()

    # Percentage of cancellations per hotel.
    # cancellation_percentage_per_hotel() returns [resort, city], so the
    # labels are listed in that same order to keep each bar correctly named.
    values = insight.cancellation_percentage_per_hotel()
    properties = ["% Resort Hotel", "% City Hotel"]
    y_axis_label = "Percentage of cancellation"
    title = "Percentage of cancellation per type of hotel"
    plt_canc = plots.bar_chart(values, properties, y_axis_label, title)

    # Most ordered meals.
    # Labels are read from the result's index so they always line up with
    # the counts, whichever categories are present in the data.
    meals = insight.most_ordered_meals()
    values = meals.values.tolist()
    properties = meals.index.tolist()
    y_axis_label = "Meal orders per meal type"
    title = "Most ordered meals per type"
    plt_meals = plots.bar_chart(values, properties, y_axis_label, title)

    # Most sought after room types.
    rooms = insight.most_booked_room_types()
    values = rooms.values.tolist()
    properties = rooms.index.tolist()
    y_axis_label = "Number of bookings"
    title = "Most sought after rooms"
    plt_rooms = plots.bar_chart(values, properties, y_axis_label, title)

    # Most common types of customer.
    customers = insight.most_common_customer_types()
    values = customers.values.tolist()
    properties = customers.index.tolist()
    y_axis_label = "Number of bookings"
    title = "Most common type of customers"
    plt_customer_types = plots.bar_chart(values, properties, y_axis_label, title)

    # bar_chart() returns the pyplot module itself, so a single show()
    # call displays every figure created above.
    plt_customer_types.show()

In [None]:
# cleaner.py
"""
@brief:
This module is responsible for cleaning the data.
It will remove any inconsistencies in the data and make it ready for analysis.
@author: Alain Mugisha(U2083264)

"""

In [None]:
import pandas as pd
from pydantic import BaseModel, ValidationError
from pandas import DataFrame

In [None]:
class Booking(BaseModel):
    """
    Schema of a single booking record, modeled on the data in the csv file.

    Cleaner.validate_row builds an instance from a data frame row by
    unpacking the row's dictionary, so each field name below has to match a
    column name of the loaded data. Cleaner.validate_data then rebuilds the
    cleaned data frame from the validated instances.
    """

    # NOTE(review): keep the field names and their order stable; the cleaned
    # frame's columns presumably follow this declaration order - confirm
    # before reordering.
    hotel: str
    is_canceled: int
    lead_time: int
    arrival_date_year: int
    arrival_date_month: str
    arrival_date_week_number: int
    arrival_date_day_of_month: int
    stays_in_weekend_nights: int
    stays_in_week_nights: int
    adults: int
    # Missing values in this column are filled by Cleaner.validate_data
    # before validation.
    children: int
    babies: int
    meal: str
    # Missing values in this column are replaced with "N/A" by
    # Cleaner.validate_data before validation.
    country: str
    market_segment: str
    distribution_channel: str
    is_repeated_guest: int
    previous_cancellations: int
    previous_bookings_not_canceled: int
    reserved_room_type: str
    assigned_room_type: str
    booking_changes: int
    deposit_type: str
    # Gaps in agent and company are back/forward filled by
    # Cleaner.validate_data before validation.
    agent: int
    company: int
    days_in_waiting_list: int
    customer_type: str
    adr: float
    required_car_parking_spaces: int
    total_of_special_requests: int
    reservation_status: str
    reservation_status_date: str

In [None]:
class Cleaner:
    """
    Loads the raw bookings from ``hotel_bookings.csv`` and turns them into a
    cleaned data frame in which every row has passed ``Booking`` validation.
    """

    def __init__(self):
        self.data_frame: DataFrame = pd.read_csv("hotel_bookings.csv")

    def validate_row(self, row) -> Booking | None:
        """
        Validates a single data frame row against the Booking class.

        Args:
            row: A row of the data frame (anything exposing ``to_dict()``).

        Returns:
            The validated ``Booking``, or ``None`` when validation fails; the
            validation error is printed in that case.
        """
        try:
            return Booking(**row.to_dict())
        except ValidationError as error:
            print(f"Error validating row: {error}")
            return None

    def validate_data(self) -> DataFrame:
        """
        Fills the known gaps, validates every row and returns the valid rows.

        ``self.data_frame`` is updated in place by the fill steps. Rows that
        still contain missing values afterwards, and rows that fail
        ``Booking`` validation, are left out of the result.

        Returns:
            A new data frame built from the validated bookings.
        """
        # Fill missing children counts with the column mean. The mean is
        # rounded because the Booking model declares the field as an int.
        # (Grouping by the column itself yields no group, hence no mean,
        # for the rows whose value is missing.)
        children = self.data_frame["children"]
        children_mean = children.mean()
        if pd.notna(children_mean):
            self.data_frame["children"] = children.fillna(round(children_mean))

        # Perform a text filling as the datatype is text
        self.data_frame["country"] = self.data_frame["country"].fillna("N/A")

        # Backward fill first: each gap takes the next known value below it.
        self.data_frame["agent"] = self.data_frame["agent"].bfill()
        self.data_frame["company"] = self.data_frame["company"].bfill()

        # Forward fill afterwards: covers the gaps left at the bottom of the
        # column, which have no later value to copy from.
        self.data_frame["agent"] = self.data_frame["agent"].ffill()
        self.data_frame["company"] = self.data_frame["company"].ffill()

        dropped = self.data_frame.dropna()

        # Rows are iterated explicitly so an empty frame simply produces no
        # bookings, and rows rejected by validate_row (None) are skipped
        # instead of failing on ``None.dict()``.
        bookings = (self.validate_row(row) for _, row in dropped.iterrows())
        valid_df = DataFrame(
            [booking.dict() for booking in bookings if booking is not None]
        )
        return valid_df

In [None]:
if __name__ == "__main__":
    # Run the cleaning pipeline once when this module is executed directly.
    # The resulting data frame is not used here; wrap the call in print()
    # to inspect it.
    cleaner = Cleaner()
    cleaner.validate_data()