# Predicting hotel booking cancellations

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (accuracy_score, roc_auc_score, f1_score, precision_score, recall_score,)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df_hoteis = pd.read_csv('tb_hotel_traintest.csv')
df_hoteis.head()

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
0,Resort Hotel,0,342,0,0,2,0.0,0,BB,PRT,...,,,0,Transient,0.0,0,0,2015-07-01,2015-07-01,0
1,Resort Hotel,0,737,0,0,2,0.0,0,BB,PRT,...,,,0,Transient,0.0,0,0,2015-07-01,2015-07-01,1
2,Resort Hotel,0,7,0,1,1,0.0,0,BB,GBR,...,,,0,Transient,75.0,0,0,2015-07-02,2015-07-01,2
3,Resort Hotel,0,13,0,1,1,0.0,0,BB,GBR,...,304.0,,0,Transient,75.0,0,0,2015-07-02,2015-07-01,3
4,Resort Hotel,0,14,0,2,2,0.0,0,BB,GBR,...,240.0,,0,Transient,98.0,0,1,2015-07-03,2015-07-01,4


In [3]:
df_hoteis.describe()

Unnamed: 0,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,id_booking
count,113409.0,113409.0,113409.0,113409.0,113409.0,113406.0,113409.0,113409.0,113409.0,113409.0,113409.0,97918.0,6437.0,113409.0,113409.0,113409.0,113409.0,113409.0
mean,0.370658,104.109074,0.927907,2.500498,1.857304,0.104227,0.00798,0.031673,0.087101,0.13633,0.220917,86.459476,189.195122,2.3262,101.882431,0.062367,0.571612,59714.795969
std,0.482983,106.894825,0.998723,1.90667,0.583753,0.398976,0.098027,0.175129,0.844538,1.497662,0.649771,110.559811,131.579937,17.613897,50.626711,0.24519,0.792979,34464.577528
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0,0.0
25%,0.0,18.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.4,0.0,0.0,29879.0
50%,0.0,69.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.9,0.0,0.0,59708.0
75%,1.0,161.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0,89584.0
max,1.0,737.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0,119389.0


In [4]:
df_hoteis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113409 entries, 0 to 113408
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           113409 non-null  object 
 1   is_cancelled                    113409 non-null  int64  
 2   lead_time                       113409 non-null  int64  
 3   stays_in_weekend_nights         113409 non-null  int64  
 4   stays_in_week_nights            113409 non-null  int64  
 5   adults                          113409 non-null  int64  
 6   children                        113406 non-null  float64
 7   babies                          113409 non-null  int64  
 8   meal                            113409 non-null  object 
 9   country                         112951 non-null  object 
 10  market_segment                  113409 non-null  object 
 11  distribution_channel            113409 non-null  object 
 12  is_repeated_gues

In [5]:
df_hoteis['is_cancelled'].value_counts()

0    71373
1    42036
Name: is_cancelled, dtype: int64

## Treat DataType 

In [6]:
df_hoteis['reservation_status_date'] = pd.to_datetime(df_hoteis['reservation_status_date'])
df_hoteis['arrival_date'] = pd.to_datetime(df_hoteis['arrival_date'])

## Treat Missing Values  

In [7]:
df_hoteis[df_hoteis['children'].isna()]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
38498,City Hotel,1,2,1,0,2,,0,BB,PRT,...,,,0,Transient-Party,12.0,0,1,2015-08-01,2015-08-03,40600
38561,City Hotel,1,1,0,2,2,,0,BB,PRT,...,14.0,,0,Transient-Party,12.0,0,1,2015-08-04,2015-08-05,40667
38573,City Hotel,1,1,0,2,3,,0,BB,PRT,...,,,0,Transient-Party,18.0,0,2,2015-08-04,2015-08-05,40679


In [8]:
df_hoteis['children'] = np.where(df_hoteis['children'].isna(),0,df_hoteis['children'])
df_hoteis['children'] = df_hoteis['children'].astype('int64')

In [9]:
df_hoteis[df_hoteis['country'].isna()]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
30,Resort Hotel,0,118,4,10,1,0,0,BB,,...,,,0,Transient,62.0,0,2,2015-07-15,2015-07-01,30
3923,Resort Hotel,1,0,0,0,0,0,0,SC,,...,,383.0,0,Transient,0.0,0,0,2016-02-15,2016-02-15,4127
6734,Resort Hotel,1,8,0,1,1,0,0,BB,,...,,204.0,0,Transient,73.0,0,2,2016-07-20,2016-07-21,7092
7459,Resort Hotel,1,39,0,5,2,0,0,HB,,...,,,0,Transient,159.0,0,5,2016-07-22,2016-08-30,7860
8328,Resort Hotel,1,0,0,1,1,0,0,BB,,...,,457.0,0,Transient,50.0,0,0,2016-10-13,2016-10-13,8779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62602,City Hotel,1,0,0,0,0,0,0,SC,,...,,279.0,0,Transient,0.0,0,0,2017-04-10,2017-04-10,65908
62603,City Hotel,1,0,0,0,0,0,0,SC,,...,,279.0,0,Transient,0.0,0,0,2017-04-10,2017-04-10,65909
62604,City Hotel,1,0,0,0,0,0,0,SC,,...,,279.0,0,Transient,0.0,0,0,2017-04-10,2017-04-10,65910
76749,City Hotel,0,4,1,2,1,0,0,BB,,...,37.0,,0,Transient-Party,70.0,0,0,2015-11-26,2015-11-23,80830


In [10]:
df_hoteis['country'] = np.where(df_hoteis['country'].isna(),'Unknown',df_hoteis['country'])
df_hoteis['country'] = df_hoteis['country'].astype(str)

In [11]:
df_hoteis[df_hoteis['agent'].isna()]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking
0,Resort Hotel,0,342,0,0,2,0,0,BB,PRT,...,,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,0
1,Resort Hotel,0,737,0,0,2,0,0,BB,PRT,...,,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,1
2,Resort Hotel,0,7,0,1,1,0,0,BB,GBR,...,,,0,Transient,75.00,0,0,2015-07-02,2015-07-01,2
6,Resort Hotel,0,0,0,2,2,0,0,BB,PRT,...,,,0,Transient,107.00,0,0,2015-07-03,2015-07-01,6
18,Resort Hotel,0,0,0,1,2,0,0,BB,FRA,...,,110.0,0,Transient,107.42,0,0,2015-07-02,2015-07-01,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113154,City Hotel,0,0,0,1,1,0,0,BB,PRT,...,,72.0,0,Transient,0.00,0,2,2017-08-30,2017-08-29,119124
113181,City Hotel,0,0,0,1,2,2,0,BB,NLD,...,,,0,Transient,270.00,0,0,2017-08-30,2017-08-29,119151
113195,City Hotel,0,0,0,1,1,0,0,BB,BRA,...,,,0,Transient,140.00,0,0,2017-08-31,2017-08-30,119166
113241,City Hotel,0,2,0,1,1,0,0,SC,LBN,...,,,0,Transient,140.00,0,2,2017-09-01,2017-08-31,119215


In [12]:
df_hoteis['agent_bin'] = np.where(df_hoteis['agent'].isna(),0,1)

In [13]:
df_hoteis[df_hoteis['company'].isna()]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking,agent_bin
0,Resort Hotel,0,342,0,0,2,0,0,BB,PRT,...,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,0,0
1,Resort Hotel,0,737,0,0,2,0,0,BB,PRT,...,,0,Transient,0.00,0,0,2015-07-01,2015-07-01,1,0
2,Resort Hotel,0,7,0,1,1,0,0,BB,GBR,...,,0,Transient,75.00,0,0,2015-07-02,2015-07-01,2,0
3,Resort Hotel,0,13,0,1,1,0,0,BB,GBR,...,,0,Transient,75.00,0,0,2015-07-02,2015-07-01,3,1
4,Resort Hotel,0,14,0,2,2,0,0,BB,GBR,...,,0,Transient,98.00,0,1,2015-07-03,2015-07-01,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113404,City Hotel,0,23,2,5,2,0,0,BB,BEL,...,,0,Transient,96.14,0,0,2017-09-06,2017-08-30,119385,1
113405,City Hotel,0,102,2,5,3,0,0,BB,FRA,...,,0,Transient,225.43,0,2,2017-09-07,2017-08-31,119386,1
113406,City Hotel,0,34,2,5,2,0,0,BB,DEU,...,,0,Transient,157.71,0,4,2017-09-07,2017-08-31,119387,1
113407,City Hotel,0,109,2,5,2,0,0,BB,GBR,...,,0,Transient,104.40,0,0,2017-09-07,2017-08-31,119388,1


In [14]:
df_hoteis['company_bin'] = np.where(df_hoteis['company'].isna(),0,1)

In [15]:
df_hoteis[(df_hoteis['stays_in_weekend_nights'] == 0) & (df_hoteis['stays_in_week_nights']== 0)]

Unnamed: 0,hotel,is_cancelled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,arrival_date,id_booking,agent_bin,company_bin
0,Resort Hotel,0,342,0,0,2,0,0,BB,PRT,...,0,Transient,0.0,0,0,2015-07-01,2015-07-01,0,0,0
1,Resort Hotel,0,737,0,0,2,0,0,BB,PRT,...,0,Transient,0.0,0,0,2015-07-01,2015-07-01,1,0,0
164,Resort Hotel,0,111,0,0,2,0,0,BB,PRT,...,0,Transient,0.0,0,2,2015-07-06,2015-07-06,167,1,0
165,Resort Hotel,0,0,0,0,1,0,0,BB,PRT,...,0,Transient,0.0,0,0,2015-07-06,2015-07-06,168,1,0
191,Resort Hotel,0,8,0,0,2,0,0,BB,PRT,...,0,Transient,0.0,0,1,2015-07-07,2015-07-07,196,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109696,City Hotel,0,15,0,0,1,0,0,BB,FRA,...,0,Transient-Party,0.0,1,0,2017-07-06,2017-07-06,115482,1,0
111794,City Hotel,0,0,0,0,2,0,0,BB,PRT,...,0,Transient,0.0,0,0,2017-08-08,2017-08-08,117701,1,0
112112,City Hotel,0,0,0,0,2,0,0,BB,PRT,...,0,Transient,0.0,0,0,2017-08-14,2017-08-14,118029,0,0
112683,City Hotel,0,78,0,0,1,0,0,BB,PRT,...,0,Transient-Party,0.0,0,0,2017-08-23,2017-08-23,118631,1,0


In [16]:
df_hoteis['stay_nigths_0'] = np.where((df_hoteis['stays_in_weekend_nights'] == 0) & (df_hoteis['stays_in_week_nights']== 0),1,0)

In [17]:
df_hoteis['Family_bin'] = np.where((df_hoteis['children'] != 0) | (df_hoteis['babies'] != 0), 1, 0)

In [18]:
df_hoteis['different_room_type'] = np.where(df_hoteis['reserved_room_type'] != df_hoteis['assigned_room_type'],1,0)

In [19]:
df_hoteis['Resort'] = np.where(df_hoteis['hotel'] == 'Resort Hotel', 1, 0)

In [20]:
df_hoteis['lead_time'].quantile(q=[0.05,0.25,0.50,0.75,0.90,0.95,0.99])

0.05      0.0
0.25     18.0
0.50     69.0
0.75    161.0
0.90    265.0
0.95    320.0
0.99    444.0
Name: lead_time, dtype: float64

In [21]:
df_hoteis['adr'].quantile(q=[0.05,0.25,0.50,0.75,0.90,0.95,0.99])

0.05     38.500
0.25     69.400
0.50     94.900
0.75    126.000
0.90    164.022
0.95    193.500
0.99    252.000
Name: adr, dtype: float64

In [22]:
sns.scatterplot(data=df_hoteis, x='adr', y='lead_time', hue = 'is_cancelled')

<AxesSubplot:xlabel='adr', ylabel='lead_time'>

In [23]:
fig, ax = plt.subplots(1, 2, figsize=(12, 8))
sns.boxplot(data=df_hoteis, x='is_cancelled', y='lead_time', ax=ax[0])
sns.boxplot(data=df_hoteis, x='adr', y='lead_time', hue = 'is_cancelled', ax=ax[1])

<AxesSubplot:xlabel='adr', ylabel='lead_time'>

In [24]:
df_hoteis.columns

Index(['hotel', 'is_cancelled', 'lead_time', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status_date', 'arrival_date', 'id_booking', 'agent_bin',
       'company_bin', 'stay_nigths_0', 'Family_bin', 'different_room_type',
       'Resort'],
      dtype='object')

In [25]:
dfmodel_hoteis = df_hoteis[['is_cancelled','lead_time','stays_in_weekend_nights', 'stays_in_week_nights','adults','children',
                            'babies','is_repeated_guest','previous_cancellations','previous_bookings_not_canceled',
                            'booking_changes','days_in_waiting_list','adr','required_car_parking_spaces',
                            'total_of_special_requests','agent_bin','company_bin','stay_nigths_0','Family_bin',
                            'different_room_type','Resort']]

## Normalizar

In [26]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(dfmodel_hoteis.drop('is_cancelled', axis = 1)))
y = dfmodel_hoteis["is_cancelled"]

## PCA

In [27]:
pca = PCA()
pca.fit(X)
pca_X_norm = pca.transform(X)

In [28]:
np.cumsum(pca.explained_variance_)

array([ 2.61575979,  4.74285308,  6.38965091,  7.7241204 ,  8.94189937,
       10.08419869, 11.13643903, 12.16985245, 13.13745308, 14.05211405,
       14.93443543, 15.79446049, 16.5589131 , 17.29907098, 17.9476355 ,
       18.52902172, 19.06902821, 19.54756327, 19.93212418, 20.00017635])

In [29]:
plt.plot(np.cumsum(pca.explained_variance_))

[<matplotlib.lines.Line2D at 0x13eb64f0f40>]

## Train Test 

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

## Logistic Regretion

In [31]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

LogisticRegression()

In [32]:
pred_testlog = logistic.predict(X_test)

In [33]:
print(f"Acurácia: {accuracy_score(y_test, pred_testlog)}")
print(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_testlog)}")
print(f"Precision:  {precision_score(y_test, pred_testlog)}")
print(f"Recall:  {recall_score(y_test, pred_testlog)}")
print(f"F1-Score: {f1_score(y_test, pred_testlog)}")

Acurácia: 0.7480689874087398
ROC-AUC Score:  0.7072107082945396
Precision:  0.7081545064377682
Recall:  0.5480167014613778
F1-Score: 0.617878350184561


## KNN (n_neighbors=1)

In [61]:
knn_fit = KNeighborsClassifier(n_neighbors=1)
knn_fit.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [62]:
pred_test_1nn = knn_fit.predict(X_test)

In [63]:
print(f"Acurácia: {accuracy_score(y_test, pred_test_1nn )}")
print(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_test_1nn)}")
print(f"Precision:  {precision_score(y_test, pred_test_1nn)}")
print(f"Recall:  {recall_score(y_test, pred_test_1nn)}")
print(f"F1-Score: {f1_score(y_test, pred_test_1nn)}")

Acurácia: 0.7954361090537156
ROC-AUC Score:  0.7810881720336175
Precision:  0.7246349326758961
Recall:  0.7251850446004935
F1-Score: 0.7249098842724341


## SVM

In [37]:
svm_fit = SVC()
svm_fit.fit(X_train, y_train)

SVC()

In [38]:
pred_test_svm = svm_fit.predict(X_test)

In [39]:
print(f"Acurácia: {accuracy_score(y_test, pred_test_svm)}")
print(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_test_svm)}")
print(f"Precision:  {precision_score(y_test, pred_test_svm)}")
print(f"Recall:  {recall_score(y_test, pred_test_svm)}")
print(f"F1-Score: {f1_score(y_test, pred_test_svm)}")

Acurácia: 0.782703770324128
ROC-AUC Score:  0.746981843558085
Precision:  0.7595161864105301
Recall:  0.6078003416208009
F1-Score: 0.6752411575562701


## DecisionTree

In [67]:
tree_fit = DecisionTreeClassifier()
tree_fit.fit(X_train, y_train)

DecisionTreeClassifier()

In [41]:
pred_test_tree= tree_fit.predict(X_test)

In [42]:
print(f"Acurácia: {accuracy_score(y_test, pred_test_tree)}")
print(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_test_tree)}")
print(f"Precision:  {precision_score(y_test, pred_test_tree)}")
print(f"Recall:  {recall_score(y_test, pred_test_tree)}")
print(f"F1-Score: {f1_score(y_test, pred_test_tree)}")

Acurácia: 0.8021373399640249
ROC-AUC Score:  0.7895411135738685
Precision:  0.7307548230005619
Recall:  0.7404630859745682
F1-Score: 0.7355769230769232


## Decision Tree Opt

In [43]:
max_depth = [int(x) for x in np.linspace(1, 21, 7)]
min_samples_leaf = [int(x) for x in np.linspace(1, 6, 3)]
min_samples_split = [int(x) for x in np.linspace(2, 40, 10)]
parameter_grid = {
    "max_depth": max_depth,
    "min_samples_leaf": min_samples_leaf,
    "min_samples_split": min_samples_split,
    "class_weight": ["balanced", None] 
}
tree_fit = DecisionTreeClassifier()
tree_opt = GridSearchCV(estimator=tree_fit, param_grid=parameter_grid, scoring="f1",cv = 5)
tree_opt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'class_weight': ['balanced', None],
                         'max_depth': [1, 4, 7, 11, 14, 17, 21],
                         'min_samples_leaf': [1, 3, 6],
                         'min_samples_split': [2, 6, 10, 14, 18, 23, 27, 31, 35,
                                               40]},
             scoring='f1')

In [44]:
pred_test_treeopt = tree_opt.predict(X_test)

In [45]:
print(f"Acurácia: {accuracy_score(y_test, pred_test_treeopt)}")
print(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_test_treeopt)}")
print(f"Precision:  {precision_score(y_test, pred_test_treeopt)}")
print(f"Recall:  {recall_score(y_test, pred_test_treeopt)}")
print(f"F1-Score: {f1_score(y_test, pred_test_treeopt)}")

Acurácia: 0.7918033365076006
ROC-AUC Score:  0.784864447109793
Precision:  0.7044191585075417
Recall:  0.7578288100208769
F1-Score: 0.7301485714285715


## KNN OPT

In [46]:
"""parameter_grid = {
    "n_neighbors": range(1, 10),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan", "chebyshev"],
}
knn_fit = KNeighborsClassifier()
knn_optgrid = GridSearchCV(estimator=knn_fit, param_grid=parameter_grid, scoring="f1", cv = 3)
knn_optgrid.fit(X_train, y_train)"""

'parameter_grid = {\n    "n_neighbors": range(1, 10),\n    "weights": ["uniform", "distance"],\n    "metric": ["euclidean", "manhattan", "chebyshev"],\n}\nknn_fit = KNeighborsClassifier()\nknn_optgrid = GridSearchCV(estimator=knn_fit, param_grid=parameter_grid, scoring="f1", cv = 3)\nknn_optgrid.fit(X_train, y_train)'

In [47]:
#pred_test_knnoptgrid = knn_optgrid.predict(X_test)

In [48]:
"""print(f"Acurácia: {accuracy_score(y_test, pred_test_knnoptgrid)}")
print(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_test_knnoptgrid)}")
print(f"Precision:  {precision_score(y_test, pred_test_knnoptgrid)}")
print(f"Recall:  {recall_score(y_test, pred_test_knnoptgrid)}")
print(f"F1-Score: {f1_score(y_test, pred_test_knnoptgrid)}")"""

'print(f"Acurácia: {accuracy_score(y_test, pred_test_knnoptgrid)}")\nprint(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_test_knnoptgrid)}")\nprint(f"Precision:  {precision_score(y_test, pred_test_knnoptgrid)}")\nprint(f"Recall:  {recall_score(y_test, pred_test_knnoptgrid)}")\nprint(f"F1-Score: {f1_score(y_test, pred_test_knnoptgrid)}")'

In [49]:
parameter_grid = {"n_neighbors": range(1, 20),
                  "weights": ["uniform", "distance"],
                  "metric": ["euclidean", "manhattan", "chebyshev"],}
knn_fit = KNeighborsClassifier()
knn_opt = RandomizedSearchCV(estimator= knn_fit, param_distributions = parameter_grid, n_iter= 5)
knn_opt.fit(X_train, y_train)

RandomizedSearchCV(estimator=KNeighborsClassifier(), n_iter=5,
                   param_distributions={'metric': ['euclidean', 'manhattan',
                                                   'chebyshev'],
                                        'n_neighbors': range(1, 20),
                                        'weights': ['uniform', 'distance']})

In [50]:
pred_test_knnopt = knn_opt.predict(X_test)

In [51]:
print(f"Acurácia: {accuracy_score(y_test, pred_test_knnopt)}")
print(f"ROC-AUC Score:  {roc_auc_score(y_test, pred_test_knnopt)}")
print(f"Precision:  {precision_score(y_test, pred_test_knnopt)}")
print(f"Recall:  {recall_score(y_test, pred_test_knnopt)}")
print(f"F1-Score: {f1_score(y_test, pred_test_knnopt)}")

Acurácia: 0.8070045497830918
ROC-AUC Score:  0.7637999377525077
Precision:  0.8384553714591128
Recall:  0.5954640349212375
F1-Score: 0.696371101986461


## Validation 

In [52]:
hotel = pd.read_csv('tb_hotel_feat_valid.csv')
hotel.describe()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,id_booking
count,5981.0,5981.0,5981.0,5981.0,5980.0,5981.0,5981.0,5981.0,5981.0,5981.0,5132.0,360.0,5981.0,5981.0,5981.0,5981.0,5981.0
mean,102.159672,0.921752,2.496572,1.839325,0.097492,0.007357,0.036449,0.087444,0.151647,0.225046,91.156274,190.547222,2.22538,100.858208,0.065374,0.566628,59309.657081
std,106.251587,0.996598,1.93883,0.486024,0.390591,0.085462,0.18742,0.840575,1.49321,0.698671,114.714577,133.16866,17.228244,48.773608,0.247205,0.789426,34475.000586
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0,0.0,47.0
25%,17.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,51.0,0.0,67.76,0.0,0.0,29298.0
50%,67.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,178.5,0.0,93.0,0.0,0.0,59458.0
75%,158.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,240.0,281.0,0.0,126.0,0.0,1.0,88868.0
max,629.0,9.0,22.0,3.0,3.0,1.0,1.0,25.0,44.0,20.0,531.0,525.0,391.0,382.0,1.0,4.0,119373.0


In [53]:
hotel['reservation_status_date'] = pd.to_datetime(hotel['reservation_status_date'])
hotel['arrival_date'] = pd.to_datetime(hotel['arrival_date'])
hotel['children'] = np.where(hotel['children'].isna(),0,hotel['children'])
hotel['country'] = np.where(hotel['country'].isna(),'Unknown',hotel['country'])
hotel['agent_bin'] = np.where(hotel['agent'].isna(),0,1)
hotel['company_bin'] = np.where(hotel['company'].isna(),0,1)
hotel['Family_bin'] = np.where((hotel['children'] != 0) | (hotel['babies'] != 0), 1, 0)
hotel['different_room_type'] = np.where(hotel['reserved_room_type'] != hotel['assigned_room_type'],1,0)
hotel['stay_nigths_0'] = np.where((hotel['stays_in_weekend_nights'] == 0) & (hotel['stays_in_week_nights']== 0),1,0)
hotel['Resort'] = np.where(hotel['hotel'] == 'Resort Hotel', 1, 0)

In [56]:
hotel_val = hotel[['lead_time','stays_in_weekend_nights', 'stays_in_week_nights','adults','children',
                    'babies','is_repeated_guest','previous_cancellations','previous_bookings_not_canceled',
                    'booking_changes','days_in_waiting_list','adr','required_car_parking_spaces',
                    'total_of_special_requests','agent_bin','company_bin','stay_nigths_0','Family_bin',
                    'different_room_type','Resort']]

In [57]:
hotel_norm = pd.DataFrame(scaler.transform(hotel_val))

In [58]:
hotel['is_cancelled'] = logistic.predict(hotel_norm)
validation = hotel[['id_booking','is_cancelled']]
validation.to_csv('hotel_valid_lregration.csv',index=False)

In [64]:
hotel['is_cancelled'] = knn_fit.predict(hotel_norm)
validation = hotel[['id_booking','is_cancelled']]
validation.to_csv('hotel_valid_Knn.csv',index=False)

In [65]:
hotel['is_cancelled'] = svm_fit.predict(hotel_norm)
validation = hotel[['id_booking','is_cancelled']]
validation.reset_index()
validation.to_csv('hotel_valid_SVM.csv',index=False)

In [68]:
hotel['is_cancelled'] = tree_fit.predict(hotel_norm)
validation = hotel[['id_booking','is_cancelled']]
validation.to_csv('hotel_valid_tree.csv',index=False)

In [69]:
hotel['is_cancelled'] = knn_opt.predict(hotel_norm)
validation = hotel[['id_booking','is_cancelled']]
validation.to_csv('hotel_valid_Knn_opt.csv',index=False)

In [70]:
hotel['is_cancelled'] = tree_opt.predict(hotel_norm)
validation = hotel[['id_booking','is_cancelled']]
validation.to_csv('hotel_valid_tree_opt.csv',index=False)