In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, cohen_kappa_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the hotel booking demand dataset into `df`.
# Source: https://www.kaggle.com/jessemostipak/hotel-booking-demand
# The path is relative to the current working directory of the kernel.

df = pd.read_csv('./Data/hotel_bookings.csv')

In [3]:
# Show the raw frame; for a frame this large the rendered output is a truncated
# preview (first and last rows, with elided columns).
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [4]:
# Inspect the dtype pandas inferred for each raw column (object = text, int64/float64 = numeric).
df.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [5]:
# Replace missing values before the columns are encoded.
# - children: a missing count is treated as 0.
# - country: this is a text (object) column, so use a string placeholder.
#   Filling it with the integer 0 would mix int and str values in one column
#   and later produce a misleading `country_0` indicator column.
df['children'] = df['children'].fillna(0)
df['country'] = df['country'].fillna('Unknown')

In [6]:
# Drop redundant and unnecessary columns before encoding.

df = df.drop(
    columns=[
        'arrival_date_year',
        'arrival_date_month',
        'company',
        'reservation_status',
        'reservation_status_date',
    ]
)

In [7]:
# Mark label-like columns as categorical so their values are not mistaken for
# numeric information.

new_categories = [
    'hotel',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'agent',
    'customer_type',
]

# Convert all listed columns in a single astype call.
df = df.astype({name: 'category' for name in new_categories})

In [8]:
# Re-check the dtypes to confirm the listed columns are now `category`.
df.dtypes

hotel                             category
is_canceled                          int64
lead_time                            int64
arrival_date_week_number          category
arrival_date_day_of_month         category
stays_in_weekend_nights              int64
stays_in_week_nights                 int64
adults                               int64
children                           float64
babies                               int64
meal                              category
country                           category
market_segment                    category
distribution_channel              category
is_repeated_guest                 category
previous_cancellations               int64
previous_bookings_not_canceled       int64
reserved_room_type                category
assigned_room_type                category
booking_changes                      int64
deposit_type                      category
agent                             category
days_in_waiting_list                 int64
customer_ty

In [9]:
# One-hot encode the categorical columns; numeric columns are kept as they are.
# This rebinds `df`, so the cell is not meant to be re-run without re-running the cells above.
# NOTE(review): `agent` still contains missing values at this point; with the default
# `dummy_na=False` those rows presumably get no agent indicator at all — confirm this is intended.
df = pd.get_dummies(df)

In [10]:
# Summarise the encoded frame: number of rows and columns, dtype counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Columns: 660 entries, is_canceled to customer_type_Transient-Party
dtypes: float64(2), int64(12), uint8(646)
memory usage: 86.3 MB


In [11]:
# Creating df_compact with only relevant columns for prediction of cancellation

In [12]:
# As seen in "Hotel_Cancellation_Feature_Correlations.ipynb", only "country_PRT"
# plays a role in the cancellation rate, so it is the only country indicator kept here.
# TODO: decide how all other countries should be grouped (still an open question).

# Prefixes of the one-hot encoded column groups that are kept in full.
selected_cols = ('market_segment', 'distribution_channel', 'deposit_type', 'hotel')
filter_cols = [name for name in df.columns if name.startswith(selected_cols)]

# Target + hand-picked numeric features first, followed by the selected indicator groups.
df_compact = pd.concat(
    [
        df[[
            'is_canceled',
            'lead_time',
            'previous_cancellations',
            'required_car_parking_spaces',
            'total_of_special_requests',
            'country_PRT',
        ]],
        df[filter_cols],
    ],
    axis=1,
)

In [13]:
# fig, ax = plt.subplots(figsize=(20, 15)) 
# sns.heatmap(df_compact.corr(), annot=True, ax = ax)

In [14]:
# Cancellation Rate

In [15]:
# Cancellation prediction 

In [21]:
# Sweep over minimum |correlation with is_canceled| thresholds. For each threshold,
# keep the features whose absolute correlation exceeds it, then train and score a
# Random Forest and an Extra Trees classifier on an 80/20 split.
vals = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
y = df_compact['is_canceled']

# Single seed for the split and both models so the reported scores are reproducible.
RANDOM_STATE = 6

# Absolute correlation of every candidate feature with the target. It does not depend
# on the threshold, so it is computed once instead of twice per loop iteration.
target_corr = df_compact.corr()['is_canceled'].drop('is_canceled').abs()

# One record per threshold; converted into a DataFrame after the loop.
records = []

for val in vals:
    features = target_corr[target_corr > val].index.tolist()
    X = df_compact[features]

    # Split first and fit the scaler on the training rows only, so that no
    # statistics from the test rows leak into the preprocessing step.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Random Forest (n_jobs=-1 only parallelises training; it does not change the result)
    model_rf = RandomForestClassifier(
        n_estimators=800, random_state=RANDOM_STATE, n_jobs=-1
    ).fit(X_train, y_train)
    model_rf_prediction = model_rf.predict(X_test)

    # Extra Trees Classifier
    model_etc = ExtraTreesClassifier(
        random_state=RANDOM_STATE, n_jobs=-1
    ).fit(X_train, y_train)
    model_etc_prediction = model_etc.predict(X_test)

    # Store the threshold, the selected features and both models' scores
    records.append({
        'Val': val,
        'Features': features,
        'Accuracy_RF': accuracy_score(y_test, model_rf_prediction),
        'Kappa_RF': cohen_kappa_score(y_test, model_rf_prediction),
        'Accuracy_ETC': accuracy_score(y_test, model_etc_prediction),
        'Kappa_ETC': cohen_kappa_score(y_test, model_etc_prediction),
    })

df_loop = pd.DataFrame(
    records,
    columns=['Val', 'Features', 'Accuracy_RF', 'Kappa_RF', 'Accuracy_ETC', 'Kappa_ETC'],
)
df_loop

Unnamed: 0,Val,Features,Accuracy_RF,Kappa_RF,Accuracy_ETC,Kappa_ETC
0,0.0001,"[lead_time, previous_cancellations, required_c...",0.829885,0.628785,0.830011,0.627353
1,0.0002,"[lead_time, previous_cancellations, required_c...",0.830723,0.630561,0.830011,0.627371
2,0.0003,"[lead_time, previous_cancellations, required_c...",0.8311,0.631756,0.829927,0.627134
3,0.0004,"[lead_time, previous_cancellations, required_c...",0.830346,0.62966,0.82976,0.626802
4,0.0005,"[lead_time, previous_cancellations, required_c...",0.830848,0.63074,0.829969,0.627182
5,0.0006,"[lead_time, previous_cancellations, required_c...",0.829885,0.629099,0.829718,0.626719
6,0.0007,"[lead_time, previous_cancellations, required_c...",0.83022,0.629795,0.829843,0.626968
7,0.0008,"[lead_time, previous_cancellations, required_c...",0.829843,0.628807,0.829927,0.627187
8,0.0009,"[lead_time, previous_cancellations, required_c...",0.830472,0.630082,0.829801,0.626832
9,0.001,"[lead_time, previous_cancellations, required_c...",0.830472,0.630047,0.829843,0.626915


In [22]:
# Persist the sweep results. The row index is written as the first (unnamed) column,
# and the list-valued `Features` cells are stored as their text representation.
df_loop.to_csv('./Data/Hotel_Cancellation_Prediction_Dummy_Standardization3_Loop_Result2.csv')

In [20]:
# (rows, columns) of the one-hot encoded frame.
# NOTE(review): this cell has execution count 20 but sits after cells 21 and 22, i.e. it was
# run out of order — restart the kernel and run all cells before sharing the notebook.
df.shape

(119390, 660)