## Data Preparation

In [97]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [98]:
# Load the raw hotel reservations file; the first row of the CSV supplies the column names.
data = pd.read_csv('../data/raw/hotel_reservations.csv', sep=',', header=0)

# Preview the first five rows of the freshly loaded frame
data.head(n=5)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


### Getting rid of the Booking_ID column

In [99]:
# Remove Booking_ID from the frame. In the preview it looks like a per-row
# identifier (INN00001, INN00002, ...) — presumably uninformative; confirm before modelling.
data = data.drop(columns=['Booking_ID'])
data.head(n=5)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


### Encoding the type_of_meal_plan variable

In [100]:
# List the distinct labels found in type_of_meal_plan before encoding them
meal_plan_labels = data['type_of_meal_plan'].unique()
print(meal_plan_labels)

['Meal Plan 1' 'Not Selected' 'Meal Plan 2' 'Meal Plan 3']


In [101]:
# Recast type_of_meal_plan as a pandas categorical so its integer codes
# can be read through the .cat accessor in the next step.
meal_plan_categorical = data['type_of_meal_plan'].astype('category')
data['type_of_meal_plan'] = meal_plan_categorical

# Confirm the column now reports the category dtype
print(data.dtypes['type_of_meal_plan'])

category


In [102]:
# Label encoding: replace each meal-plan category with its integer code, stored as int64.
# In the recorded run the mapping is 'Meal Plan 1' -> 0, 'Meal Plan 2' -> 1,
# 'Meal Plan 3' -> 2, 'Not Selected' -> 3.
# NOTE(review): the codes impose an order on a nominal feature — confirm the downstream
# model tolerates that.
meal_plan_codes = data['type_of_meal_plan'].cat.codes
data['type_of_meal_plan'] = meal_plan_codes.astype('int64')

# Confirm the dtype of the encoded column
print(data.dtypes['type_of_meal_plan'])

# Show the distinct codes now present in type_of_meal_plan
print(data['type_of_meal_plan'].unique())

int64
[0 3 1 2]


In [103]:
# Preview the frame with type_of_meal_plan now integer-coded
data.head(n=5)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,0,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,2,0,2,3,3,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,1,0,2,1,0,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,2,0,0,2,0,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,2,0,1,1,3,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


### Encoding the room_type_reserved column

In [104]:
# List the distinct labels found in room_type_reserved before encoding them
room_type_labels = data['room_type_reserved'].unique()
print(room_type_labels)

['Room_Type 1' 'Room_Type 4' 'Room_Type 2' 'Room_Type 6' 'Room_Type 5'
 'Room_Type 7' 'Room_Type 3']


In [105]:
# Recast room_type_reserved as a pandas categorical so its integer codes
# can be read through the .cat accessor in the next step.
room_type_categorical = data['room_type_reserved'].astype('category')
data['room_type_reserved'] = room_type_categorical

# Confirm the column now reports the category dtype
print(data.dtypes['room_type_reserved'])

category


In [106]:
# Label encoding: replace each room-type category with its integer code, stored as int64.
# In the recorded run 'Room_Type k' maps to code k - 1 (e.g. 'Room_Type 1' -> 0,
# 'Room_Type 7' -> 6).
room_type_codes = data['room_type_reserved'].cat.codes
data['room_type_reserved'] = room_type_codes.astype('int64')

# Confirm the dtype of the encoded column
print(data.dtypes['room_type_reserved'])

# Show the distinct codes now present in room_type_reserved
print(data['room_type_reserved'].unique())

int64
[0 3 1 5 4 6 2]


In [107]:
# Preview the frame with room_type_reserved now integer-coded
data.head(n=5)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,0,0,0,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,2,0,2,3,3,0,0,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,1,0,2,1,0,0,0,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,2,0,0,2,0,0,0,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,2,0,1,1,3,0,0,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


### Encoding the market_segment_type variable

In [108]:
# List the distinct labels found in market_segment_type before encoding them
market_segment_labels = data['market_segment_type'].unique()
print(market_segment_labels)

['Offline' 'Online' 'Corporate' 'Aviation' 'Complementary']


In [109]:
# Recast market_segment_type as a pandas categorical so its integer codes
# can be read through the .cat accessor in the next step.
market_segment_categorical = data['market_segment_type'].astype('category')
data['market_segment_type'] = market_segment_categorical

# Confirm the column now reports the category dtype
print(data.dtypes['market_segment_type'])

category


In [110]:
# Label encoding: replace each market-segment category with its integer code, stored as int64.
# In the recorded run the mapping is 'Aviation' -> 0, 'Complementary' -> 1,
# 'Corporate' -> 2, 'Offline' -> 3, 'Online' -> 4.
# NOTE(review): the codes impose an order on a nominal feature — confirm the downstream
# model tolerates that.
market_segment_codes = data['market_segment_type'].cat.codes
data['market_segment_type'] = market_segment_codes.astype('int64')

# Confirm the dtype of the encoded column
print(data.dtypes['market_segment_type'])

# Show the distinct codes now present in market_segment_type
print(data['market_segment_type'].unique())

int64
[3 4 2 0 1]


In [111]:
# Preview the frame with market_segment_type now integer-coded
data.head(n=5)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,0,0,0,224,2017,10,2,3,0,0,0,65.0,0,Not_Canceled
1,2,0,2,3,3,0,0,5,2018,11,6,4,0,0,0,106.68,1,Not_Canceled
2,1,0,2,1,0,0,0,1,2018,2,28,4,0,0,0,60.0,0,Canceled
3,2,0,0,2,0,0,0,211,2018,5,20,4,0,0,0,100.0,0,Canceled
4,2,0,1,1,3,0,0,48,2018,4,11,4,0,0,0,94.5,0,Canceled


### Encoding the booking_status variable

In [112]:
# List the distinct labels found in the booking_status target before encoding it
booking_status_labels = data['booking_status'].unique()
print(booking_status_labels)

['Not_Canceled' 'Canceled']


In [113]:
# Encode the booking_status target: 1 if the booking was 'Canceled', 0 if 'Not_Canceled'.
# An explicit mapping replaces the earlier "anything else -> 0" rule, which silently
# turned every unexpected value into "not canceled" — including the already-encoded
# 0/1 values if this cell was run a second time.
status_codes = {'Not_Canceled': 0, 'Canceled': 1}
booking_status_encoded = data['booking_status'].map(status_codes)

# Series.map leaves values missing from the mapping as NaN; fail loudly rather than
# let a corrupted target flow into the saved dataset.
unmapped = booking_status_encoded.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected booking_status values (was this cell already run?): "
        f"{data.loc[unmapped, 'booking_status'].unique().tolist()}"
    )

data['booking_status'] = booking_status_encoded.astype('int64')

# Print unique values for booking_status column
print(data['booking_status'].unique())

[0 1]


In [114]:
# Preview the frame with the booking_status target now encoded as 0/1
data.head(n=5)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,0,0,0,224,2017,10,2,3,0,0,0,65.0,0,0
1,2,0,2,3,3,0,0,5,2018,11,6,4,0,0,0,106.68,1,0
2,1,0,2,1,0,0,0,1,2018,2,28,4,0,0,0,60.0,0,1
3,2,0,0,2,0,0,0,211,2018,5,20,4,0,0,0,100.0,0,1
4,2,0,1,1,3,0,0,48,2018,4,11,4,0,0,0,94.5,0,1


In [115]:
# Summarise the encoded frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   type_of_meal_plan                     36275 non-null  int64  
 5   required_car_parking_space            36275 non-null  int64  
 6   room_type_reserved                    36275 non-null  int64  
 7   lead_time                             36275 non-null  int64  
 8   arrival_year                          36275 non-null  int64  
 9   arrival_month                         36275 non-null  int64  
 10  arrival_date                          36275 non-null  int64  
 11  market_segment_

In [116]:
# Write the encoded frame to hotel_reservations_cleaned.csv in the processed-data folder,
# leaving the row index out of the file.
data.to_csv(path_or_buf='../data/processed/hotel_reservations_cleaned.csv', index=False)