In [None]:
import os

def scale_input_data(scale_factor):
  """Write a row-scaled copy of every input CSV next to the original file.

  For each base path in ``file_bases`` the file ``<base>.csv`` is read and
  ``<base>.scaled.csv`` is written:

  * ``scale_factor == 1.0``: the file is copied unchanged.
  * ``scale_factor < 1.0``: only the first ``int(scale_factor * n_rows)`` rows
    are kept.
  * ``scale_factor > 1.0``: the rows are repeated from the top until the
    frame holds ``int(scale_factor * n_rows)`` rows.

  Args:
    scale_factor: Non-negative multiplier applied to the row count.

  Raises:
    ValueError: If ``scale_factor`` is negative or NaN. A negative value would
      otherwise be interpreted by ``iloc`` as "drop rows from the end".
  """
  # Function-scope imports: the enclosing cell only imports `os`, and the
  # notebook binds `pd` in a later cell.
  import shutil
  import pandas as pd

  # `not (x >= 0)` also rejects NaN, which compares False with everything.
  if not (scale_factor >= 0):
    raise ValueError('scale_factor must be non-negative, got %r' % (scale_factor,))

  file_bases = ['./input/hotel_bookings']
  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'

    if scale_factor == 1.0:
      # Nothing to rescale; avoid a parse/serialise round trip.
      shutil.copyfile(source_path, scaled_path)
      continue

    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Each pass appends at most the current frame, so the frame doubles
      # until the last pass tops it up with exactly the rows still missing.
      while len(df_to_scale) < new_num_rows:
        rows_missing = new_num_rows - len(df_to_scale)
        # A positional slice clamps at the frame length, so no min() is needed.
        df_to_scale = pd.concat([df_to_scale, df_to_scale.iloc[:rows_missing]])
    df_to_scale.to_csv(scaled_path, index=False)

# Scaling is opt-in: it only runs when INPUT_SCALE_FACTOR is present in the environment.
if os.environ.get('INPUT_SCALE_FACTOR') is not None:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

# EDA - Hotel Booking Demand

This data set contains booking information for a city hotel and a resort hotel, and includes information such as when the booking was made, length of stay, the number of adults, children, and/or babies, and the number of available parking spaces, among other things.

In [1]:
import numpy as np
# import pandas as pd
# The import statement(s) are taken from the IREWR_IMPORTS environment variable and
# executed verbatim. Presumably this is what binds `pd`, which later cells use
# (TODO confirm). A KeyError is raised here if the variable is not set.
# NOTE(review): exec() runs whatever code the environment supplies -- only run this
# notebook in an environment you trust.
exec(os.environ['IREWR_IMPORTS'])
import datetime

# FIRST-AUTHOR: remove plotting
# import matplotlib.pyplot as plt
# import seaborn as sns
# import folium
# %matplotlib inline

In [2]:
# Load the '.scaled.csv' copy of the bookings data. That file is written by
# scale_input_data, which the first cell only calls when INPUT_SCALE_FACTOR is set.
hotel_bookings = pd.read_csv("./input/hotel_bookings.scaled.csv")

In [3]:
# (number of rows, number of columns) of the loaded frame
hotel_bookings.shape

(119390, 32)

In [4]:
# Per-column non-null counts and dtypes
hotel_bookings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [5]:
# Summary statistics; include='all' also reports the non-numeric columns
hotel_bookings.describe(include='all')

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
count,119390,119390.0,119390.0,119390.0,119390,119390.0,119390.0,119390.0,119390.0,119390.0,...,119390,103050.0,6797.0,119390.0,119390,119390.0,119390.0,119390.0,119390,119390
unique,2,,,,12,,,,,,...,3,,,,4,,,,3,926
top,City Hotel,,,,August,,,,,,...,No Deposit,,,,Transient,,,,Check-Out,2015-10-21
freq,79330,,,,13877,,,,,,...,104641,,,,89613,,,,75166,1461
mean,,0.370416,104.011416,2016.156554,,27.165173,15.798241,0.927599,2.500302,1.856403,...,,86.693382,189.266735,2.321149,,101.831122,0.062518,0.571363,,
std,,0.482918,106.863097,0.707476,,13.605138,8.780829,0.998613,1.908286,0.579261,...,,110.774548,131.655015,17.594721,,50.53579,0.245291,0.792798,,
min,,0.0,0.0,2015.0,,1.0,1.0,0.0,0.0,0.0,...,,1.0,6.0,0.0,,-6.38,0.0,0.0,,
25%,,0.0,18.0,2016.0,,16.0,8.0,0.0,1.0,2.0,...,,9.0,62.0,0.0,,69.29,0.0,0.0,,
50%,,0.0,69.0,2016.0,,28.0,16.0,1.0,2.0,2.0,...,,14.0,179.0,0.0,,94.575,0.0,0.0,,
75%,,1.0,160.0,2017.0,,38.0,23.0,2.0,3.0,2.0,...,,229.0,270.0,0.0,,126.0,0.0,1.0,,


In [6]:
#Dropping feature "company" as it has 94% NULL.
# `columns=` names the axis by itself, so no `axis` argument is passed.
hotel_bookings = hotel_bookings.drop(columns='company')

In [7]:
#Converting certain features to categorical form
categorical_features = ['hotel','is_canceled','arrival_date_week_number','meal','country','market_segment',
                        'distribution_channel','is_repeated_guest','reserved_room_type','assigned_room_type',
                        'deposit_type','agent','customer_type','reservation_status','arrival_date_month']
hotel_bookings[categorical_features] = hotel_bookings[categorical_features].astype('category')

# Converting reservation_status_date to datetime object
hotel_bookings['reservation_status_date'] = hotel_bookings['reservation_status_date'].astype('datetime64[ns]')

# Converting arrival date to datetime object
# Month name -> month number, used to turn 'arrival_date_month' into a numeric month.
MonthtoNum = {'January':1, 'February':2,'March':3,'April':4,'May':5,'June':6,'July':7,
             'August':8,'September':9,'October':10,'November':11,'December':12}

# Assemble the date column-wise from year / month / day instead of building one
# datetime.date per row with DataFrame.apply(axis=1).
hotel_bookings['arrival_date'] = pd.to_datetime(pd.DataFrame({
    'year': hotel_bookings['arrival_date_year'],
    # 'arrival_date_month' is categorical at this point, so go through str before
    # the dictionary lookup. The int64 cast raises if any month name is missing
    # from MonthtoNum (the lookup would leave NaN), so bad input still fails loudly.
    'month': hotel_bookings['arrival_date_month'].astype(str).map(MonthtoNum).astype('int64'),
    'day': hotel_bookings['arrival_date_day_of_month'],
})).astype('datetime64[ns]')

hotel_bookings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   hotel                           119390 non-null  category      
 1   is_canceled                     119390 non-null  category      
 2   lead_time                       119390 non-null  int64         
 3   arrival_date_year               119390 non-null  int64         
 4   arrival_date_month              119390 non-null  category      
 5   arrival_date_week_number        119390 non-null  category      
 6   arrival_date_day_of_month       119390 non-null  int64         
 7   stays_in_weekend_nights         119390 non-null  int64         
 8   stays_in_week_nights            119390 non-null  int64         
 9   adults                          119390 non-null  int64         
 10  children                        119386 non-null  float64

In [8]:
# Plot to show outlier in Average Daily Rate
# FIRST-AUTHOR: remove plotting
# ax = sns.boxplot(x=hotel_bookings['adr'])
# Placeholder for the removed box plot: only the 'adr' column access remains.
_ = hotel_bookings['adr']

The box plot (removed from this version of the notebook) shows an outlier with an ADR above 5000; that record is excluded from the analysis.

In [9]:
# Cast ADR to an integer dtype. NOTE(review): this discards the fractional part of
# every rate -- confirm that whole-number ADR is acceptable for the analysis below.
hotel_bookings['adr'] = hotel_bookings['adr'].astype('int')

In [10]:
# Keep only the bookings whose ADR is below 5000 (removes the outlier record)
hotel_bookings = hotel_bookings.loc[hotel_bookings['adr'].lt(5000)]

After removing the outlier:

In [11]:
# FIRST-AUTHOR: remove plotting
# ax = sns.boxplot(x=hotel_bookings['adr'])
# Placeholder for the removed box plot: only the 'adr' column access remains.
_ = hotel_bookings['adr']

## Categorical Data Analysis

In [12]:
# The function generating the EDA for categorical data

def categorical_eda(df):
    """Print the number of distinct values in each categorical column of ``df``.

    Only columns with the pandas ``category`` dtype are considered. The
    per-column count plots that used to follow were removed, so the loop over
    columns with fewer than 20 distinct values is a no-op placeholder.

    Args:
        df: DataFrame to summarise.

    Returns:
        None. The summary is written to stdout.
    """
    print("To check: Unique count of non-numeric data")
    unique_counts = df.select_dtypes(include=['category']).nunique()
    print(unique_counts)

    # Plot count distribution of categorical data
    categorical_columns = df.select_dtypes(include='category').columns
    for column_name in categorical_columns:
        if not (df[column_name].nunique() < 20):
            continue
        # FIRST-AUTHOR: remove plotting
        # fig = sns.catplot(x=column_name, kind="count", data=df)
        # fig.set_xticklabels(rotation=90)
        # plt.show()
        
        
# Print the unique-value summary for the categorical columns of the bookings frame.
categorical_eda(hotel_bookings)

To check: Unique count of non-numeric data
hotel                         2
is_canceled                   2
arrival_date_month           12
arrival_date_week_number     53
meal                          5
country                     177
market_segment                8
distribution_channel          5
is_repeated_guest             2
reserved_room_type           10
assigned_room_type           12
deposit_type                  3
agent                       333
customer_type                 4
reservation_status            3
dtype: int64


In [13]:
# Bookings per country, as a two-column frame restricted to countries with
# more than 10 bookings.
country_count = hotel_bookings['country'].value_counts()
country_count_df = country_count.to_frame().reset_index()
country_count_df.columns = ['country', 'booking_count']
has_enough_bookings = country_count_df['booking_count'].gt(10)
country_count_df = country_count_df.loc[has_enough_bookings]
del has_enough_bookings  # temporary mask only; keep the notebook namespace unchanged

# FIRST-AUTHOR: remove plotting
# import plotly.express as px

# fig = px.choropleth(country_count_df, locations="country",
#                     color="booking_count",
#                     hover_name="country",
#                     color_continuous_scale=px.colors.sequential.RdBu)
# fig.show()

In [14]:
reservation_df = hotel_bookings[['hotel','reservation_status']]
# 'hotel' was converted to a categorical column earlier in the notebook. Grouping on
# a categorical key without `observed` relies on an implicit default, so the value
# is passed explicitly; observed=False keeps one row per category.
reservation_df.groupby(['hotel'], observed=False).count()

  reservation_df.groupby(['hotel']).count()


Unnamed: 0_level_0,reservation_status
hotel,Unnamed: 1_level_1
City Hotel,79329
Resort Hotel,40060


In [15]:
# City Hotel bookings only, then the number of bookings per reservation status.
hotel_bookings_1 = hotel_bookings.loc[hotel_bookings['hotel'].eq('City Hotel')]
hotel_bookings_1['reservation_status'].value_counts()

reservation_status
Check-Out    46228
Canceled     32185
No-Show        916
Name: count, dtype: int64

In [16]:
# Resort Hotel bookings only, then the number of bookings per reservation status.
hotel_bookings_2 = hotel_bookings.loc[hotel_bookings['hotel'].eq('Resort Hotel')]
hotel_bookings_2['reservation_status'].value_counts()

reservation_status
Check-Out    28938
Canceled     10831
No-Show        291
Name: count, dtype: int64

In [17]:
# Percentage of cancellation per hotel.
# The counts are read from the per-hotel frames instead of being typed in from a
# previous run's output, so the figures stay correct when the input data changes
# (for example when INPUT_SCALE_FACTOR rescales the CSV).
def _cancellation_percentage(bookings):
    """Return the percentage (0-100) of rows in `bookings` whose reservation_status is 'Canceled'.

    Returns NaN for an empty frame instead of dividing by zero.
    """
    total = len(bookings)
    if total == 0:
        return float('nan')
    canceled = int((bookings['reservation_status'] == 'Canceled').sum())
    return (canceled / total) * 100

print('Percentage of cancellation in City Hotel: ', _cancellation_percentage(hotel_bookings_1))
print('Percentage of cancellation in Resort Hotel: ', _cancellation_percentage(hotel_bookings_2))

Percentage of calculation in City Hotel:  40.57154382382231
Percentage of calculation in Resort Hotel:  27.03694458312531


In [18]:
# import pandas as pd
# Re-read the scaled CSV. This rebinds `hotel_bookings` to the raw file contents, so
# the dtype conversions, the dropped 'company' column, the added 'arrival_date'
# column and the ADR filter from the cells above are not present on the new frame.
hotel_bookings = pd.read_csv("./input/hotel_bookings.scaled.csv")