# Predicting hotel booking cancellations

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)

In [2]:
# Load the bookings table from disk and preview the first rows.
csv_path = 'tb_hotel_traintest.csv'
df_hoteis = pd.read_csv(csv_path)
df_hoteis.head()

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
0,Resort Hotel,0,342,0,0,2,0.0,0,BB,PRT,...,,,0,Transient,0.0,0,0,2015-07-01,2015-07-01,0
1,Resort Hotel,0,737,0,0,2,0.0,0,BB,PRT,...,,,0,Transient,0.0,0,0,2015-07-01,2015-07-01,1
2,Resort Hotel,0,7,0,1,1,0.0,0,BB,GBR,...,,,0,Transient,75.0,0,0,2015-07-02,2015-07-01,2
3,Resort Hotel,0,13,0,1,1,0.0,0,BB,GBR,...,304.0,,0,Transient,75.0,0,0,2015-07-02,2015-07-01,3
4,Resort Hotel,0,14,0,2,2,0.0,0,BB,GBR,...,240.0,,0,Transient,98.0,0,1,2015-07-03,2015-07-01,4


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_hoteis.describe()

Unnamed: 0,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,id_booking
count,113409.0,113409.0,113409.0,113409.0,113409.0,113406.0,113409.0,113409.0,113409.0,113409.0,113409.0,97918.0,6437.0,113409.0,113409.0,113409.0,113409.0,113409.0
mean,0.370658,104.109074,0.927907,2.500498,1.857304,0.104227,0.00798,0.031673,0.087101,0.13633,0.220917,86.459476,189.195122,2.3262,101.882431,0.062367,0.571612,59714.795969
std,0.482983,106.894825,0.998723,1.90667,0.583753,0.398976,0.098027,0.175129,0.844538,1.497662,0.649771,110.559811,131.579937,17.613897,50.626711,0.24519,0.792979,34464.577528
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0,0.0
25%,0.0,18.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.4,0.0,0.0,29879.0
50%,0.0,69.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.9,0.0,0.0,59708.0
75%,1.0,161.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0,89584.0
max,1.0,737.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0,119389.0


In [4]:
# Per-column non-null counts and dtypes; useful for spotting columns with missing values.
df_hoteis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113409 entries, 0 to 113408
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           113409 non-null  object 
 1   is_cancelled                    113409 non-null  int64  
 2   lead_time                       113409 non-null  int64  
 3   stays_in_weekend_nights         113409 non-null  int64  
 4   stays_in_week_nights            113409 non-null  int64  
 5   adults                          113409 non-null  int64  
 6   children                        113406 non-null  float64
 7   babies                          113409 non-null  int64  
 8   meal                            113409 non-null  object 
 9   country                         112951 non-null  object 
 10  market_segment                  113409 non-null  object 
 11  distribution_channel            113409 non-null  object 
 12  is_repeated_gues

In [5]:
# Class balance of the target: number of rows per value of `is_cancelled`.
df_hoteis['is_cancelled'].value_counts()

0    71373
1    42036
Name: is_cancelled, dtype: int64

## Convert data types

In [6]:
# Parse both date columns from text into pandas datetimes.
for date_col in ('reservation_status_date', 'arrival_date'):
    df_hoteis[date_col] = pd.to_datetime(df_hoteis[date_col])

## Handle missing values

In [7]:
# Show the bookings whose `children` value is missing.
children_missing = df_hoteis['children'].isna()
df_hoteis[children_missing]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
38498,City Hotel,1,2,1,0,2,,0,BB,PRT,...,,,0,Transient-Party,12.0,0,1,2015-08-01,2015-08-03,40600
38561,City Hotel,1,1,0,2,2,,0,BB,PRT,...,14.0,,0,Transient-Party,12.0,0,1,2015-08-04,2015-08-05,40667
38573,City Hotel,1,1,0,2,3,,0,BB,PRT,...,,,0,Transient-Party,18.0,0,2,2015-08-04,2015-08-05,40679


In [8]:
# Treat a missing `children` count as zero, then store the column as integers.
df_hoteis['children'] = df_hoteis['children'].fillna(0).astype('int64')

In [9]:
# Show the bookings whose `country` value is missing.
country_missing = df_hoteis['country'].isna()
df_hoteis[country_missing]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
30,Resort Hotel,0,118,4,10,1,0,0,BB,,...,,,0,Transient,62.0,0,2,2015-07-15,2015-07-01,30
3923,Resort Hotel,1,0,0,0,0,0,0,SC,,...,,383.0,0,Transient,0.0,0,0,2016-02-15,2016-02-15,4127
6734,Resort Hotel,1,8,0,1,1,0,0,BB,,...,,204.0,0,Transient,73.0,0,2,2016-07-20,2016-07-21,7092
7459,Resort Hotel,1,39,0,5,2,0,0,HB,,...,,,0,Transient,159.0,0,5,2016-07-22,2016-08-30,7860
8328,Resort Hotel,1,0,0,1,1,0,0,BB,,...,,457.0,0,Transient,50.0,0,0,2016-10-13,2016-10-13,8779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62602,City Hotel,1,0,0,0,0,0,0,SC,,...,,279.0,0,Transient,0.0,0,0,2017-04-10,2017-04-10,65908
62603,City Hotel,1,0,0,0,0,0,0,SC,,...,,279.0,0,Transient,0.0,0,0,2017-04-10,2017-04-10,65909
62604,City Hotel,1,0,0,0,0,0,0,SC,,...,,279.0,0,Transient,0.0,0,0,2017-04-10,2017-04-10,65910
76749,City Hotel,0,4,1,2,1,0,0,BB,,...,37.0,,0,Transient-Party,70.0,0,0,2015-11-26,2015-11-23,80830


In [10]:
# Label missing countries explicitly as 'Unknown' and keep the column as strings.
df_hoteis['country'] = df_hoteis['country'].fillna('Unknown').astype(str)

In [11]:
# Show the bookings that have no `agent` recorded.
agent_missing = df_hoteis['agent'].isna()
df_hoteis[agent_missing]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
0,Resort Hotel,0,342,0,0,2,0,0,BB,PRT,...,,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,0
1,Resort Hotel,0,737,0,0,2,0,0,BB,PRT,...,,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,1
2,Resort Hotel,0,7,0,1,1,0,0,BB,GBR,...,,,0,Transient,75.00,0,0,2015-07-02,2015-07-01,2
6,Resort Hotel,0,0,0,2,2,0,0,BB,PRT,...,,,0,Transient,107.00,0,0,2015-07-03,2015-07-01,6
18,Resort Hotel,0,0,0,1,2,0,0,BB,FRA,...,,110.0,0,Transient,107.42,0,0,2015-07-02,2015-07-01,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113154,City Hotel,0,0,0,1,1,0,0,BB,PRT,...,,72.0,0,Transient,0.00,0,2,2017-08-30,2017-08-29,119124
113181,City Hotel,0,0,0,1,2,2,0,BB,NLD,...,,,0,Transient,270.00,0,0,2017-08-30,2017-08-29,119151
113195,City Hotel,0,0,0,1,1,0,0,BB,BRA,...,,,0,Transient,140.00,0,0,2017-08-31,2017-08-30,119166
113241,City Hotel,0,2,0,1,1,0,0,SC,LBN,...,,,0,Transient,140.00,0,2,2017-09-01,2017-08-31,119215


In [12]:
# Binary flag: 1 when the booking has an `agent` value, 0 when it is missing.
has_agent = df_hoteis['agent'].notna()
df_hoteis['agent_bin'] = np.where(has_agent, 1, 0)

In [13]:
# Show the bookings that have no `company` recorded.
company_missing = df_hoteis['company'].isna()
df_hoteis[company_missing]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking,agent_bin
0,Resort Hotel,0,342,0,0,2,0,0,BB,PRT,...,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,0,0
1,Resort Hotel,0,737,0,0,2,0,0,BB,PRT,...,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,1,0
2,Resort Hotel,0,7,0,1,1,0,0,BB,GBR,...,,0,Transient,75.00,0,0,2015-07-02,2015-07-01,2,0
3,Resort Hotel,0,13,0,1,1,0,0,BB,GBR,...,,0,Transient,75.00,0,0,2015-07-02,2015-07-01,3,1
4,Resort Hotel,0,14,0,2,2,0,0,BB,GBR,...,,0,Transient,98.00,0,1,2015-07-03,2015-07-01,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113404,City Hotel,0,23,2,5,2,0,0,BB,BEL,...,,0,Transient,96.14,0,0,2017-09-06,2017-08-30,119385,1
113405,City Hotel,0,102,2,5,3,0,0,BB,FRA,...,,0,Transient,225.43,0,2,2017-09-07,2017-08-31,119386,1
113406,City Hotel,0,34,2,5,2,0,0,BB,DEU,...,,0,Transient,157.71,0,4,2017-09-07,2017-08-31,119387,1
113407,City Hotel,0,109,2,5,2,0,0,BB,GBR,...,,0,Transient,104.40,0,0,2017-09-07,2017-08-31,119388,1


In [14]:
# Binary flag: 1 when the booking has a `company` value, 0 when it is missing.
has_company = df_hoteis['company'].notna()
df_hoteis['company_bin'] = np.where(has_company, 1, 0)

In [15]:
# Binary flag: 1 when the booking has zero weekend nights AND zero week nights.
no_weekend_nights = df_hoteis['stays_in_weekend_nights'].eq(0)
no_week_nights = df_hoteis['stays_in_week_nights'].eq(0)
df_hoteis['stay_nigths_0'] = np.where(no_weekend_nights & no_week_nights, 1, 0)

In [16]:
# Binary flag: 1 when the booking includes at least one child or baby.
has_children = df_hoteis['children'].ne(0)
has_babies = df_hoteis['babies'].ne(0)
df_hoteis['Family_bin'] = np.where(has_children | has_babies, 1, 0)

In [17]:
# Binary flag: 1 when the assigned room type differs from the reserved one.
room_changed = df_hoteis['reserved_room_type'].ne(df_hoteis['assigned_room_type'])
df_hoteis['different_room_type'] = np.where(room_changed, 1, 0)

In [18]:
# One 0/1 indicator column per hotel type.
for flag_col, hotel_name in (('City', 'City Hotel'), ('Resort', 'Resort Hotel')):
    df_hoteis[flag_col] = np.where(df_hoteis['hotel'] == hotel_name, 1, 0)

In [20]:
# Distribution of `lead_time` at selected quantiles, with emphasis on the upper tail.
lead_time_probs = [0.05, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
df_hoteis['lead_time'].quantile(lead_time_probs)

0.05      0.0
0.25     18.0
0.50     69.0
0.75    161.0
0.90    265.0
0.95    320.0
0.99    444.0
Name: lead_time, dtype: float64

In [21]:
# Distribution of `adr` at selected quantiles, with emphasis on the upper tail.
adr_probs = [0.05, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
df_hoteis['adr'].quantile(adr_probs)

0.05     38.500
0.25     69.400
0.50     94.900
0.75    126.000
0.90    164.022
0.95    193.500
0.99    252.000
Name: adr, dtype: float64

In [29]:
# Compare the two continuous predictors across the target classes: one box per
# value of `is_cancelled`, for `lead_time` (left) and `adr` (right).
# `adr` belongs on the value axis: used as the grouping variable it produces one
# box per distinct price, which is unreadable and very slow to draw.
fig, ax = plt.subplots(1, 2, figsize=(12, 8))
sns.boxplot(data=df_hoteis, x='is_cancelled', y='lead_time', ax=ax[0])
sns.boxplot(data=df_hoteis, x='is_cancelled', y='adr', ax=ax[1])
ax[0].set_title('lead_time by is_cancelled')
ax[1].set_title('adr by is_cancelled')
fig.tight_layout()

<AxesSubplot:xlabel='adr', ylabel='lead_time'>

In [30]:
# Joint view of `adr` against `lead_time`, with points coloured by `is_cancelled`.
sns.scatterplot(x='adr', y='lead_time', hue='is_cancelled', data=df_hoteis)

<AxesSubplot:xlabel='adr', ylabel='lead_time'>

In [19]:
# List every column currently in the frame, including the engineered flag columns.
df_hoteis.columns

Index(['hotel', 'is_cancelled', 'lead_time', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status_date', 'arrival_date', 'id_booking', 'agent_bin',
       'company_bin', 'stay_nigths_0', 'Family_bin', 'different_room_type',
       'City', 'Resort'],
      dtype='object')

In [36]:
# Columns kept for modelling: the target first, followed by the numeric and
# engineered 0/1 predictors.
model_columns = [
    'is_cancelled',
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'agent_bin',
    'company_bin',
    'stay_nigths_0',
    'Family_bin',
    'different_room_type',
    'City',
    'Resort',
]
dfmodel_hoteis = df_hoteis[model_columns]

In [37]:
# Standardize every predictor and split the frame into features (X) and target (y).
# The scaled array is wrapped back into a DataFrame that keeps the original
# column names and row index, so features stay identifiable downstream and X
# remains aligned with y even if rows are filtered before this cell.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features; consider fitting on the
# training rows only.
feature_frame = dfmodel_hoteis.drop(columns='is_cancelled')
scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)
y = dfmodel_hoteis["is_cancelled"]

In [40]:
# Fit a PCA that keeps all components on the scaled features, then project X
# onto the principal axes.
pca = PCA().fit(X)
pca_X_norm = pca.transform(X)

In [46]:
# Cumulative share of total variance captured by the first k principal
# components. The ratio (0-1 scale) is plotted rather than the raw variance so
# the curve can be read directly as "fraction explained", and the x axis is
# 1-based so position k means "k components".
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(cumulative_ratio) + 1), cumulative_ratio)
ax.set_xlabel('Number of components')
ax.set_ylabel('Cumulative explained variance ratio')
ax.set_title('PCA cumulative explained variance');

[<matplotlib.lines.Line2D at 0x20e8ccf3400>]

In [None]:
# Hold out 25% of the rows for evaluation. The split is stratified on the
# target so both partitions keep the same cancelled / not-cancelled proportions
# as the full data set; the fixed seed makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)