## Data Preparation

In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [37]:
# Load the raw reservations table: comma-separated, column names on the first row
data = pd.read_csv('../data/raw/hotel_reservations.csv', sep=',', header=0)

# Preview the first five rows
data.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


### Getting rid of the Booking_ID column

In [38]:
# Remove the Booking_ID column from the table, then preview the result
data = data.drop(columns=['Booking_ID'])
data.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


### Encoding the type_of_meal_plan variable

In [39]:
# List the distinct labels found in type_of_meal_plan (in order of first appearance)
print(pd.unique(data['type_of_meal_plan']))

['Meal Plan 1' 'Not Selected' 'Meal Plan 2' 'Meal Plan 3']


In [40]:
# Cast type_of_meal_plan to pandas' categorical dtype
data = data.astype({'type_of_meal_plan': 'category'})

# Confirm the dtype the column now has
print(data.dtypes['type_of_meal_plan'])

category


In [41]:
# One-hot encode type_of_meal_plan into one 0/1 indicator column per label.
# OneHotEncoder comes from the notebook's import cell.
encoder = OneHotEncoder()

# Fit and transform in a single step. The encoder returns a sparse matrix, so it
# is densified and cast to int64 to obtain plain integer 0/1 indicators.
meal_plan_encoded = (
    encoder.fit_transform(data[['type_of_meal_plan']])
    .toarray()
    .astype('int64')
)

# Build the indicator frame on the same index as `data`: the column-wise
# pd.concat that follows aligns rows by index label, so reusing the index keeps
# every indicator row attached to its source row even if `data` no longer has a
# default 0..n-1 index.
meal_plan_encoded_df = pd.DataFrame(
    meal_plan_encoded,
    columns=encoder.get_feature_names_out(['type_of_meal_plan']),
    index=data.index,
)

meal_plan_encoded_df.head()

Unnamed: 0,type_of_meal_plan_Meal Plan 1,type_of_meal_plan_Meal Plan 2,type_of_meal_plan_Meal Plan 3,type_of_meal_plan_Not Selected
0,1,0,0,0
1,0,0,0,1
2,1,0,0,0
3,1,0,0,0
4,0,0,0,1


In [42]:
# Swap the raw type_of_meal_plan column for its one-hot indicator columns,
# which are appended to the right of the remaining columns
data = pd.concat(
    [data.drop(columns='type_of_meal_plan'), meal_plan_encoded_df],
    axis=1,
)

# Summarise the columns and dtypes of the resulting frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   required_car_parking_space            36275 non-null  int64  
 5   room_type_reserved                    36275 non-null  object 
 6   lead_time                             36275 non-null  int64  
 7   arrival_year                          36275 non-null  int64  
 8   arrival_month                         36275 non-null  int64  
 9   arrival_date                          36275 non-null  int64  
 10  market_segment_type                   36275 non-null  object 
 11  repeated_guest 

### Encoding the room_type_reserved column

In [43]:
# List the distinct labels found in room_type_reserved (in order of first appearance)
print(pd.unique(data['room_type_reserved']))

['Room_Type 1' 'Room_Type 4' 'Room_Type 2' 'Room_Type 6' 'Room_Type 5'
 'Room_Type 7' 'Room_Type 3']


In [44]:
# Convert room_type_reserved to a categorical variable
data['room_type_reserved'] = data['room_type_reserved'].astype('category')

# One-hot encode room_type_reserved into one 0/1 indicator column per label.
encoder = OneHotEncoder()

# Fit and transform in a single step. The encoder returns a sparse matrix, so it
# is densified and cast to int64 to obtain plain integer 0/1 indicators.
room_type_encoded = (
    encoder.fit_transform(data[['room_type_reserved']])
    .toarray()
    .astype('int64')
)

# Build the indicator frame on the same index as `data`: the column-wise
# pd.concat that follows aligns rows by index label, so reusing the index keeps
# every indicator row attached to its source row even if `data` no longer has a
# default 0..n-1 index.
room_type_encoded_df = pd.DataFrame(
    room_type_encoded,
    columns=encoder.get_feature_names_out(['room_type_reserved']),
    index=data.index,
)

room_type_encoded_df.head()

Unnamed: 0,room_type_reserved_Room_Type 1,room_type_reserved_Room_Type 2,room_type_reserved_Room_Type 3,room_type_reserved_Room_Type 4,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [45]:
# Swap the raw room_type_reserved column for its one-hot indicator columns,
# which are appended to the right of the remaining columns
data = pd.concat(
    [data.drop(columns='room_type_reserved'), room_type_encoded_df],
    axis=1,
)

# Summarise the columns and dtypes of the resulting frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 27 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   required_car_parking_space            36275 non-null  int64  
 5   lead_time                             36275 non-null  int64  
 6   arrival_year                          36275 non-null  int64  
 7   arrival_month                         36275 non-null  int64  
 8   arrival_date                          36275 non-null  int64  
 9   market_segment_type                   36275 non-null  object 
 10  repeated_guest                        36275 non-null  int64  
 11  no_of_previous_

### Encoding the market_segment_type variable

In [46]:
# List the distinct labels found in market_segment_type (in order of first appearance)
print(pd.unique(data['market_segment_type']))

['Offline' 'Online' 'Corporate' 'Aviation' 'Complementary']


In [47]:
# Convert market_segment_type to a categorical variable
data['market_segment_type'] = data['market_segment_type'].astype('category')

# One-hot encode market_segment_type into one 0/1 indicator column per label.
encoder = OneHotEncoder()

# Fit and transform in a single step. The encoder returns a sparse matrix, so it
# is densified and cast to int64 to obtain plain integer 0/1 indicators.
market_segment_encoded = (
    encoder.fit_transform(data[['market_segment_type']])
    .toarray()
    .astype('int64')
)

# Build the indicator frame on the same index as `data`: the column-wise
# pd.concat that follows aligns rows by index label, so reusing the index keeps
# every indicator row attached to its source row even if `data` no longer has a
# default 0..n-1 index.
market_segment_encoded_df = pd.DataFrame(
    market_segment_encoded,
    columns=encoder.get_feature_names_out(['market_segment_type']),
    index=data.index,
)

market_segment_encoded_df.head()

Unnamed: 0,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online
0,0,0,0,1,0
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [48]:
# Swap the raw market_segment_type column for its one-hot indicator columns,
# which are appended to the right of the remaining columns
data = pd.concat(
    [data.drop(columns='market_segment_type'), market_segment_encoded_df],
    axis=1,
)

# Summarise the columns and dtypes of the resulting frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 31 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   required_car_parking_space            36275 non-null  int64  
 5   lead_time                             36275 non-null  int64  
 6   arrival_year                          36275 non-null  int64  
 7   arrival_month                         36275 non-null  int64  
 8   arrival_date                          36275 non-null  int64  
 9   repeated_guest                        36275 non-null  int64  
 10  no_of_previous_cancellations          36275 non-null  int64  
 11  no_of_previous_

### Encoding the booking_status variable

In [50]:
# List the distinct labels found in booking_status (in order of first appearance)
print(pd.unique(data['booking_status']))

['Not_Canceled' 'Canceled']


In [53]:
# Encode the target: 1 when booking_status is 'Canceled', 0 for any other label.
# The dtype guard makes the cell safe to re-run. Without it, a second execution
# would compare the already-encoded 0/1 values with 'Canceled' and silently
# overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(data['booking_status']):
    # Vectorized comparison instead of a per-row Python lambda; the result is
    # cast to int64 so the column holds integer 0/1 values.
    data['booking_status'] = (data['booking_status'] == 'Canceled').astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 31 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   required_car_parking_space            36275 non-null  int64  
 5   lead_time                             36275 non-null  int64  
 6   arrival_year                          36275 non-null  int64  
 7   arrival_month                         36275 non-null  int64  
 8   arrival_date                          36275 non-null  int64  
 9   repeated_guest                        36275 non-null  int64  
 10  no_of_previous_cancellations          36275 non-null  int64  
 11  no_of_previous_

In [54]:
# Write the encoded table to the processed-data folder; index=False keeps the
# row index out of the file
data.to_csv(
    path_or_buf='../data/processed/hotel_reservations_encoded.csv',
    index=False,
)