### We import the libraries needed to use and clean the dataset

In [2]:
import pandas as pd
import kagglehub
import shutil
import os
import calendar

### We obtain the Kaggle dataset

In [3]:
# Destination folder and file name for the raw dataset (relative to the notebook).
RAW_DATA_DIR = '../data/raw/'
FILE_NAME = 'hotel_bookings.csv'

# Create the destination folder when it is missing. exist_ok=True replaces the
# separate existence check and avoids the race between checking and creating.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# Download the Kaggle dataset; the returned path is used below as the folder
# that contains the CSV file.
tmp_path = kagglehub.dataset_download('jessemostipak/hotel-booking-demand')

# Build the source and destination file paths.
source_file = os.path.join(tmp_path, FILE_NAME)
destination_file = os.path.join(RAW_DATA_DIR, FILE_NAME)

# Copy the downloaded file into the raw-data folder of the project.
shutil.copy(source_file, destination_file)

print(f'Dataset successfully uploaded to {RAW_DATA_DIR}')

Dataset successfully uploaded to ../data/raw/


### Define the DataFrame and verify the integrity of the loaded data

In [4]:
# Load the raw CSV from the path it was copied to in the download cell.
df = pd.read_csv(destination_file)

# Summarize column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

### Check for null values to perform proper cleaning.

In [5]:
# Count the missing values in each column.
df.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

### Data Cleansing: Handling Redundancy, Missing Values, and Business Logic Constraints

In [6]:
# Replacement values for the columns that contain missing entries.
missing_defaults = {
    'country': 'Not provided',
    'agent': 0,
    'company': 0,
    'children': 0,
}

# Columns cast to integers once their gaps have been filled.
integer_columns = ['agent', 'company', 'children']

df = (
    df
    .drop_duplicates()                                   # remove exact duplicate rows
    .fillna(missing_defaults)                            # fill the missing entries
    .loc[lambda frame: frame['adults'].gt(0)]            # keep bookings with at least one adult
    .astype(dict.fromkeys(integer_columns, int))         # cast the filled columns to int
    .rename(columns=str.lower)                           # lower-case every column name
    .reset_index(drop=True)                              # rebuild a contiguous index
)

### Update the column names 'arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month' to 'year', 'month' and 'day'.

In [7]:
# Shorter names for the three arrival-date component columns.
date_part_names = {
    'arrival_date_year': 'year',
    'arrival_date_month': 'month',
    'arrival_date_day_of_month': 'day',
}

df = df.rename(columns=date_part_names)

### Convert the 'month' column to an integer so that the 'arrival_date' column of type datetime can be built correctly, then drop the columns already used to avoid redundant data.

In [8]:
# Convert the month name to its number, assemble a single datetime column from
# the year/month/day parts, then drop the parts that are no longer needed.
df = (
    df
    .assign(month=lambda frame: pd.to_datetime(frame['month'], format='%B').dt.month)
    .assign(arrival_date=lambda frame: pd.to_datetime(frame[['year', 'month', 'day']]))
    .drop(columns=['year', 'month', 'day'])
)

### Verify the columns and their corresponding data type.

In [9]:
# Re-inspect the columns and dtypes now that the date parts were merged into 'arrival_date'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87011 entries, 0 to 87010
Data columns (total 30 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   hotel                           87011 non-null  object        
 1   is_canceled                     87011 non-null  int64         
 2   lead_time                       87011 non-null  int64         
 3   arrival_date_week_number        87011 non-null  int64         
 4   stays_in_weekend_nights         87011 non-null  int64         
 5   stays_in_week_nights            87011 non-null  int64         
 6   adults                          87011 non-null  int64         
 7   children                        87011 non-null  int64         
 8   babies                          87011 non-null  int64         
 9   meal                            87011 non-null  object        
 10  country                         87011 non-null  object        
 11  ma

### Create the 'arrival_day' and 'arrival_month' columns which will be very useful for statistical analysis based on the day or month.

In [10]:
# calendar.day_name holds exactly the seven weekday names (Monday first), so it
# is used in full; slicing it with [1:] would drop Monday and turn every Monday
# arrival into NaN in the categorical below.
ordered_days = list(calendar.day_name)
# calendar.month_name has an empty string at index 0, so that entry is skipped.
ordered_months = list(calendar.month_name)[1:]

# Weekday name of each arrival, as an ordered categorical (Monday < ... < Sunday).
df['arrival_day'] = pd.Categorical(
    df['arrival_date'].dt.day_name(),
    categories=ordered_days,
    ordered=True
)

# Month name of each arrival, as an ordered categorical (January < ... < December).
df['arrival_month'] = pd.Categorical(
    df['arrival_date'].dt.month_name(),
    categories=ordered_months,
    ordered=True
)

# Confirm the two new columns and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87011 entries, 0 to 87010
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   hotel                           87011 non-null  object        
 1   is_canceled                     87011 non-null  int64         
 2   lead_time                       87011 non-null  int64         
 3   arrival_date_week_number        87011 non-null  int64         
 4   stays_in_weekend_nights         87011 non-null  int64         
 5   stays_in_week_nights            87011 non-null  int64         
 6   adults                          87011 non-null  int64         
 7   children                        87011 non-null  int64         
 8   babies                          87011 non-null  int64         
 9   meal                            87011 non-null  object        
 10  country                         87011 non-null  object        
 11  ma

### Finally, we save our processed file in '../data/processed/'

In [12]:
# Folder that receives the cleaned dataset (relative to the notebook).
output_path = '../data/processed'

# Create the folder when it is missing. exist_ok=True replaces the separate
# existence check and avoids the race between checking and creating.
os.makedirs(output_path, exist_ok=True)

# Write the cleaned frame without the index column.
df.to_csv(f'{output_path}/hotel_booking_cleaned.csv', index=False)

print(f'File saved successfully in {output_path}/')
print(f'Final size of the dataset after cleaning: {df.shape}')

File saved successfully in ../data/processed/
Final size of the dataset after cleaning: (87011, 32)
