In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import warnings
warnings.filterwarnings("ignore")
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [2]:
# import data
dataset = pd.read_csv('hotel_bookings.csv')

# Data prepatation
# do not require
data = dataset.drop(['arrival_date_year', 'reservation_status', 
                  'reservation_status_date','company'], axis=1)
data.head()
# 1 = yes is canceled
# 0 = no is not canceled

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,Resort Hotel,0,342,July,27,1,0,0,2,0.0,...,C,C,3,No Deposit,,0,Transient,0.0,0,0
1,Resort Hotel,0,737,July,27,1,0,0,2,0.0,...,C,C,4,No Deposit,,0,Transient,0.0,0,0
2,Resort Hotel,0,7,July,27,1,0,1,1,0.0,...,A,C,0,No Deposit,,0,Transient,75.0,0,0
3,Resort Hotel,0,13,July,27,1,0,1,1,0.0,...,A,A,0,No Deposit,304.0,0,Transient,75.0,0,0
4,Resort Hotel,0,14,July,27,1,0,2,2,0.0,...,A,A,0,No Deposit,240.0,0,Transient,98.0,0,1


We have 28 features, which gives us information about a customer's hotel booking and 119,390 observations, which are the number of bookings that either had or had not churned.

In [3]:
# idfentify which columns must be encoded with dummy variables
variables_to_dummy = []
for col in data.columns:
    if data[col].dtypes != 'int64':
        if data[col].dtypes != 'float64':
            variables_to_dummy.append(col)
variables_to_dummy

['hotel',
 'arrival_date_month',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'customer_type']

In [4]:
# identify if na values
data.isna().sum()

hotel                                 0
is_canceled                           0
lead_time                             0
arrival_date_month                    0
arrival_date_week_number              0
arrival_date_day_of_month             0
stays_in_weekend_nights               0
stays_in_week_nights                  0
adults                                0
children                              4
babies                                0
meal                                  0
country                             488
market_segment                        0
distribution_channel                  0
is_repeated_guest                     0
previous_cancellations                0
previous_bookings_not_canceled        0
reserved_room_type                    0
assigned_room_type                    0
booking_changes                       0
deposit_type                          0
agent                             16340
days_in_waiting_list                  0
customer_type                         0


In [5]:
data.groupby('children').size()

children
0.0     110796
1.0       4861
2.0       3652
3.0         76
10.0         1
dtype: int64

In [6]:
# The number children is not even or normally distribution 
# Therefore we will use the median to replce the na values

child_med = data['children'].median() 
data['children'] = data['children'].fillna(value=child_med)

In [7]:
# The values of the agent variable refers to a travel agency's ID
#  that made the booking. Therefore we cannot take the median or mean
#  We will input 0 for no agency.

data['agent'] = data['agent'].fillna(value=0)

data.isna().sum() # dummy variables will take care of the na values in country

hotel                               0
is_canceled                         0
lead_time                           0
arrival_date_month                  0
arrival_date_week_number            0
arrival_date_day_of_month           0
stays_in_weekend_nights             0
stays_in_week_nights                0
adults                              0
children                            0
babies                              0
meal                                0
country                           488
market_segment                      0
distribution_channel                0
is_repeated_guest                   0
previous_cancellations              0
previous_bookings_not_canceled      0
reserved_room_type                  0
assigned_room_type                  0
booking_changes                     0
deposit_type                        0
agent                               0
days_in_waiting_list                0
customer_type                       0
adr                                 0
required_car

In [8]:
variables_to_dummy = []
for col in data.columns:
    if data[col].dtypes == 'object':
        variables_to_dummy.append(col)


for var in variables_to_dummy:
    data[var] = pd.get_dummies(data[var])

In [9]:
for col in data.columns:
    for entry in data[col]:
        if entry < 0:
            print(entry, " in column", col) 
            # we see there is indeed a negative value in adr

-6.38  in column adr


adr stands for Average Daily Rate, which is defined by dividing the sum of all lodging transactions by the total number of staying nights. Therefore this must have been an error either when inputing the data or when the adr was calculated. Therefore we will assume the numerical value is correct and make it positive.

In [10]:
data['adr'] = data['adr'].abs()
any(data['adr'] < 0)

False

In [11]:
from sklearn.model_selection import train_test_split

X = data.drop(['is_canceled'], axis=1)
Y = data['is_canceled']
print(X.shape)
print(Y.shape)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.50, random_state=66)

(119390, 27)
(119390,)


## Logistic Model

In [12]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression().fit(x_train, y_train)

log_predict = log_model.predict(x_test)

In [13]:
from sklearn import metrics

log_accuracy = metrics.accuracy_score(y_test, log_predict)
log_precision = metrics.precision_score(y_test, log_predict)

print("Model's Accuracy: {}  Model's Precision: {}".format(
    round(log_accuracy,2), round(log_precision,2)))

Model's Accuracy: 0.78  Model's Precision: 0.86


In [14]:
print(metrics.classification_report(y_test, log_predict))

              precision    recall  f1-score   support

           0       0.76      0.95      0.84     37681
           1       0.86      0.48      0.61     22014

    accuracy                           0.78     59695
   macro avg       0.81      0.72      0.73     59695
weighted avg       0.80      0.78      0.76     59695



## Support Vector Machine

In [15]:
# Support Vector Machine Model

from sklearn.svm import LinearSVC

svm_model = LinearSVC(random_state = 100).fit(x_train, y_train)
svm_predict = svm_model.predict(x_test)

svm_accuracy = metrics.accuracy_score(y_test, svm_predict)
svm_precision = metrics.precision_score(y_test, svm_predict)

print("Model's Accuracy: {}  Model's Precision: {}".format(
    round(svm_accuracy,2), round(svm_precision,2)))


Model's Accuracy: 0.73  Model's Precision: 0.85


In [16]:
print(metrics.classification_report(y_test, svm_predict))

              precision    recall  f1-score   support

           0       0.71      0.97      0.82     37681
           1       0.85      0.31      0.45     22014

    accuracy                           0.73     59695
   macro avg       0.78      0.64      0.63     59695
weighted avg       0.76      0.73      0.68     59695



## Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB().fit(x_train, y_train)
nb_predict = nb_model.predict(x_test)

nb_accuracy = metrics.accuracy_score(y_test, nb_predict)
nb_precision = metrics.precision_score(y_test, nb_predict)

print("Model's Accuracy: {}  Model's Precision: {}".format(
    round(nb_accuracy,2), round(nb_precision,2)))

Model's Accuracy: 0.64  Model's Precision: 0.51


In [18]:
print(metrics.classification_report(y_test, nb_predict))

              precision    recall  f1-score   support

           0       0.73      0.69      0.71     37681
           1       0.51      0.56      0.53     22014

    accuracy                           0.64     59695
   macro avg       0.62      0.62      0.62     59695
weighted avg       0.65      0.64      0.64     59695



The naive bayes model has the least accuracy and precision. This is due to the underlying assumption that each feature is independent, hence is naive. 

## Conclusion

We will prefer to perform the the logistic regression. We were able to predict the cancellation of a booking with 78% accuracy and 86% precision.



In [19]:
# random test sample
c = np.empty(10)
random_sample = pd.DataFrame(np.random.randn(10,27))
for i in range(10000):
    c += log_model.predict(random_sample)
c = pd.DataFrame(np.round(c/10000))
c

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,0.0
6,1.0
7,1.0
8,1.0
9,1.0


We found 10,000 and found the average outcome for each observation.

Therefore the random sample has the above outcomes 

In [20]:
random_sample['Outcomes'] = c
random_sample

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,Outcomes
0,-1.048237,-1.17785,-0.32969,-0.748837,-0.955762,-0.999233,1.580258,0.249086,-0.41916,0.233989,...,-0.239598,1.669945,-1.04636,0.944464,-1.15108,-0.433002,0.551311,-0.716123,-0.509917,1.0
1,-0.257313,0.655195,-1.925731,-0.301612,1.137402,0.331732,0.125353,-0.129178,-2.17671,-2.027929,...,0.769063,0.679052,-0.163749,-1.330961,-1.149445,0.571356,-0.263973,1.009266,-2.43203,1.0
2,0.716879,0.952129,-1.299806,-0.777211,-0.825972,-0.112118,0.360696,-0.449561,-0.290327,-0.640328,...,1.531359,-1.358935,1.389547,0.577456,-1.635252,-1.918095,0.239007,-0.77484,-0.595975,1.0
3,-0.135839,-0.886511,-0.282536,1.03287,-0.547754,0.626721,-0.211959,-0.842537,-2.243985,-0.352186,...,1.654057,-0.674402,-0.977155,1.55309,-1.198917,1.005157,1.365722,0.831526,0.641112,1.0
4,-1.968738,-0.222509,1.226401,-1.460575,-0.689895,0.046385,2.067285,-1.166776,-0.886518,0.546649,...,0.853525,1.787272,-1.221759,-1.775975,0.239565,1.358041,0.691665,-1.551293,-1.373329,1.0
5,0.412734,-0.059597,0.812127,0.430077,-0.804504,-0.678329,-0.77702,-0.889431,0.937193,1.16642,...,-0.42763,0.390631,0.19881,-0.911257,0.051544,-0.571777,-2.796424,1.304923,0.144336,0.0
6,0.279964,0.337231,0.639461,-0.022194,-2.773315,-0.636106,-0.966522,0.809911,-1.542868,1.093192,...,-0.174682,0.567916,-1.262827,-0.873806,0.282866,-0.217693,-0.066034,-0.516735,1.263125,1.0
7,-1.685336,-0.888158,1.270165,-2.118234,0.476207,1.798417,0.588514,0.93521,0.718441,-0.874753,...,0.191026,-0.097678,0.94853,-0.564917,1.987756,-1.515568,-1.009646,0.143584,0.145343,1.0
8,-0.421291,0.828956,-0.048193,-0.017411,-0.638772,-0.099717,1.429507,0.22974,-0.579539,-0.499456,...,-0.578696,0.160309,-0.080859,-0.924045,-2.416119,-0.471987,-0.219674,-3.224725,0.401374,1.0
9,-0.087126,-0.625807,-0.092286,0.120264,0.49573,-0.059712,0.48474,0.006047,-0.065243,-0.775925,...,1.107085,0.587883,-0.51401,-0.227971,0.580632,0.524439,0.261398,0.515177,1.380158,1.0
