In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [2]:
# import data
dataset = pd.read_csv('hotel_bookings.csv')

# Data prepatation
# do not require
data = dataset.drop(['arrival_date_year', 'reservation_status', 
                  'reservation_status_date','company'], axis=1)
data.shape

(119390, 28)

We have 28 features, which gives us information about a customer's hotel booking and 119,390 observations, which are the number of bookings that either had or had not churned.

In [3]:
# idfentify which columns must be encoded with dummy variables
variables_to_dummy = []
for col in data.columns:
    if data[col].dtypes != 'int64':
        if data[col].dtypes != 'float64':
            variables_to_dummy.append(col)

# identify if na values
data.isna().sum()

hotel                                 0
is_canceled                           0
lead_time                             0
arrival_date_month                    0
arrival_date_week_number              0
arrival_date_day_of_month             0
stays_in_weekend_nights               0
stays_in_week_nights                  0
adults                                0
children                              4
babies                                0
meal                                  0
country                             488
market_segment                        0
distribution_channel                  0
is_repeated_guest                     0
previous_cancellations                0
previous_bookings_not_canceled        0
reserved_room_type                    0
assigned_room_type                    0
booking_changes                       0
deposit_type                          0
agent                             16340
days_in_waiting_list                  0
customer_type                         0


In [4]:
data.groupby('children').size()

children
0.0     110796
1.0       4861
2.0       3652
3.0         76
10.0         1
dtype: int64

In [5]:
# The number children is not even or normally distribution 
# Therefore we will use the median to replce the na values

child_med = data['children'].median() 
data['children'] = data['children'].fillna(value=child_med)

In [6]:
# The values of the agent variable refers to a travel agency's ID
#  that made the booking. Therefore we cannot take the median or mean
#  We will input 0 for no agency.

data['agent'] = data['agent'].fillna(value=0)

data.isna().sum() # dummy variables will take care of the na values in country

hotel                               0
is_canceled                         0
lead_time                           0
arrival_date_month                  0
arrival_date_week_number            0
arrival_date_day_of_month           0
stays_in_weekend_nights             0
stays_in_week_nights                0
adults                              0
children                            0
babies                              0
meal                                0
country                           488
market_segment                      0
distribution_channel                0
is_repeated_guest                   0
previous_cancellations              0
previous_bookings_not_canceled      0
reserved_room_type                  0
assigned_room_type                  0
booking_changes                     0
deposit_type                        0
agent                               0
days_in_waiting_list                0
customer_type                       0
adr                                 0
required_car

In [7]:
variables_to_dummy = []
for col in data.columns:
    if data[col].dtypes != 'int64':
        if data[col].dtypes != 'float64':
            variables_to_dummy.append(col)


for var in variables_to_dummy:
    data[var] = pd.get_dummies(data[var])

In [8]:
data.isna().sum() # after creating dummy variables, there are no more na values

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
agent                             0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests   

In [9]:
for col in data.columns:
    for entry in data[col]:
        if entry < 0:
            print(entry, " in column", col) 
            # we see there is indeed a negative value in adr

-6.38  in column adr


adr stands for Average Daily Rate, which is defined by dividing the sum of all lodging transactions by the total number of staying nights. Therefore this must have been an error either when inputing the data or when the adr was calculated. Therefore we will assume the numerical value is correct and make it positive.

In [10]:
data['adr'] = data['adr'].abs()
any(data['adr'] < 0)

False

In [11]:
X = data.drop(['is_canceled'], axis=1)
Y = data['is_canceled']
print(X.shape)
print(Y.shape)

x_train, x_test, y_train, y_test = train_test_split(data, Y, test_size=0.20, random_state=66)

(119390, 27)
(119390,)


## Logistic Model

In [12]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression().fit(x_train, y_train)

log_predict = log_model.predict(x_test)



In [13]:
from sklearn import metrics

log_accuracy = metrics.accuracy_score(y_test, log_predict)
log_precision = metrics.precision_score(y_test, log_predict)

print("Model's Accuracy: {}  Model's Precision: {}".format(
    round(log_accuracy,4), round(log_precision,4)))

Model's Accuracy: 1.0  Model's Precision: 1.0


## Support Vector Machine

In [14]:
# Support Vector Machine Model

from sklearn.svm import LinearSVC

svm_model = LinearSVC().fit(x_train, y_train)
svm_predict = svm_model.predict(x_test)




In [15]:
svm_accuracy = metrics.accuracy_score(y_test, svm_predict)
svm_precision = metrics.precision_score(y_test, svm_predict)

print("Model's Accuracy: {}  Model's Precision: {}".format(
    round(svm_accuracy,4), round(svm_precision,4)))


Model's Accuracy: 1.0  Model's Precision: 1.0


## Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB().fit(x_train, y_train)
nb_predict = nb_model.predict(x_test)

nb_accuracy = metrics.accuracy_score(y_test, nb_predict)
nb_precision = metrics.precision_score(y_test, nb_predict)

print("Model's Accuracy: {}  Model's Precision: {}".format(
    round(nb_accuracy,4), round(nb_precision,4)))

Model's Accuracy: 0.677  Model's Precision: 0.5517


In [17]:
print(x_train.shape)
list(x_train.columns)

(95512, 28)


['hotel',
 'is_canceled',
 'lead_time',
 'arrival_date_month',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'reserved_room_type',
 'assigned_room_type',
 'booking_changes',
 'deposit_type',
 'agent',
 'days_in_waiting_list',
 'customer_type',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests']

The naive bayes model has the least accuracy and precision. This is due to the underlying assumption that each feature is independent, hence is naive. 

## Conclusion

We will prefer to perform the the logistic regression. We were able to predict the cancellation of a booking with 77.74% accuracy and 85.38% precision.