In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import tree
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, cohen_kappa_score
from sklearn.model_selection import cross_val_score

In [None]:
# Dataset from https://www.kaggle.com/jessemostipak/hotel-booking-demand

# Read the bookings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./Data/hotel_bookings.csv')

In [None]:
# Preview the frame as loaded from the CSV.
df

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Summary statistics from describe() with its default arguments.
df.describe()

In [None]:
# Discard four columns before any further preprocessing.
df = df.drop(
    columns=['arrival_date_week_number', 'agent', 'company', 'reservation_status_date'],
)

In [None]:
# Replace missing values in `children` and `country` with 0.
df['children'] = df['children'].fillna(0)
df['country'] = df['country'].fillna(0)

In [None]:
# Inspect dtypes and non-null counts after the column drop and the fillna step.
df.info()

In [None]:
# One-hot encode the frame. With no `columns` argument, pandas converts every
# object / string / category column into indicator columns.
df = pd.get_dummies(data=df)

In [None]:
# Inspect the frame again after one-hot encoding.
df.info()

In [None]:
# Creating df_compact with only relevant columns for prediction of cancellation

In [None]:
# As seen in "Hotel_Cancellation_Feature_Correlations.ipynb", "country_PRT" is the only country column that plays a role in the cancellation rate, so it is the only one kept here.

In [None]:
# Prefixes of the one-hot column groups to carry over into the compact frame.
selected_cols = ('market_segment', 'distribution_channel', 'deposit_type', 'hotel')
# str.startswith accepts a tuple, so one call tests all four prefixes.
filter_cols = list(filter(lambda label: label.startswith(selected_cols), df.columns))
df2 = df.loc[:, filter_cols]
# Target plus hand-picked columns first, followed by the prefix-matched columns.
df_compact = pd.concat(
    [
        df.loc[:, ['is_canceled', 'lead_time', 'previous_cancellations', 'required_car_parking_spaces', 'total_of_special_requests', 'country_PRT']],
        df2,
    ],
    axis=1,
)

In [None]:
# Annotated correlation heatmap of the compact frame on a 20x15 inch canvas.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(data=df_compact.corr(), ax=ax, annot=True)

In [None]:
# Cancellation Rate: df

In [None]:
# Cancellation prediction - all data (df) but without reservation_status

In [None]:
# Features: every column except the target and the three one-hot
# reservation_status columns (presumably excluded because they encode the
# booking outcome — confirm against the dataset description).
X = df.drop(
    columns=['is_canceled', 'reservation_status_Canceled', 'reservation_status_Check-Out', 'reservation_status_No-Show'],
)
# Target as a plain NumPy array.
y = np.array(df['is_canceled'])

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Normalization

# Fit the scaler on the training split only, then apply that same scaling to
# both splits so the test data does not influence the fitted min/max.
X_norm = MinMaxScaler()
X_norm.fit(X_train)
X_train_norm, X_test_norm = (X_norm.transform(split) for split in (X_train, X_test))

In [None]:
# Logistic Regression

In [None]:
# Fit logistic regression on the min-max scaled training features.
# max_iter is raised above scikit-learn's default of 100: with this many
# one-hot columns the solver can hit the default limit before converging,
# which would leave the reported scores based on an unfinished fit.
model_lr = LogisticRegression(max_iter=1000).fit(X_train_norm, y_train)
model_lr_prediction = model_lr.predict(X_test_norm)
# Accuracy on the held-out test split.
version_lr = accuracy_score(y_test, model_lr_prediction)
version_lr

In [None]:
# Cohen's kappa between the test labels and the logistic-regression predictions.
kappa_lr = cohen_kappa_score(y1=y_test, y2=model_lr_prediction)
kappa_lr

Feature importance

In [None]:
# Get importance: coef_ is 2-D, so take its first row to get one
# coefficient per input column.
importance = model_lr.coef_[0]

# Summarize feature importance. Each score is printed with its column name
# from X so that positions do not have to be looked up by hand afterwards.
for i, (feature_name, v) in enumerate(zip(X.columns, importance)):
    print('Feature: %0d (%s), Score: %.5f' % (i, feature_name, v))

# Plot feature importance
plt.bar(range(len(importance)), importance)
plt.xlabel('Feature index (column position in X)')
plt.ylabel('Coefficient')
plt.title('Logistic regression coefficients')
plt.show()

In [None]:
# Identify the feature at position 14. The indices printed by the importance
# listing are positions in X (the model input), not in df: df still holds the
# target and reservation_status columns that X drops, so its column positions
# do not line up with the feature indices.
X.iloc[:, 14].head(1)

In [None]:
# Decision Tree
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits can resolve
# differently from run to run and the reported scores are not reproducible.
model_dt = tree.DecisionTreeClassifier(random_state=3).fit(X_train_norm, y_train)
model_dt_prediction = model_dt.predict(X_test_norm)
# Accuracy on the held-out test split.
version_dt = accuracy_score(y_test, model_dt_prediction)
version_dt

In [None]:
# Kappa
# Agreement between the test labels and the decision-tree predictions.
kappa_dt = cohen_kappa_score(y1=y_test, y2=model_dt_prediction)
kappa_dt

Feature importance

In [None]:
# get importance

importance = model_dt.feature_importances_

# summarize feature importance
# Each score is printed with its column name from X so that positions do not
# have to be looked up by hand afterwards.

for i, (feature_name, v) in enumerate(zip(X.columns, importance)):
    print('Feature: %0d (%s), Score: %.5f' % (i, feature_name, v))

# plot feature importance

plt.bar(range(len(importance)), importance)
plt.xlabel('Feature index (column position in X)')
plt.ylabel('Importance')
plt.title('Decision tree feature importances')
plt.show()

In [None]:
# First row of the column at position 0 of X; the Series name in the output
# shows which feature that index refers to.
X.iloc[:1, 0]

In [None]:
# First row of the column at position 249 of X; the Series name in the output
# shows which feature that index refers to.
X.iloc[:1, 249]

In [None]:
# K-Fold Decision Tree
# Author's note: best result was obtained with max_depth=8 and cv=200.
# random_state is fixed so that comparisons between such settings are
# reproducible; an unseeded tree can break ties between equally good splits
# differently on every run.
model_kf_dt = cross_val_score(tree.DecisionTreeClassifier(max_depth=8, random_state=3), X, y, cv=200)
print('Maximum: ', model_kf_dt.max())
print('Mean: ', model_kf_dt.mean())
# Wrap the per-fold scores in a Series to plot them against the fold index.
model_kf_dt = pd.Series(model_kf_dt)
model_kf_dt.plot()

In [None]:
# KNN
# Fit and predict on the min-max scaled features. KNN ranks neighbours by
# Euclidean distance (minkowski with p=2), so on the raw columns the features
# with the largest numeric range would dominate the distance.
# Author's note: 4 neighbours gave the best result on the unscaled features;
# TODO re-check n_neighbors now that the inputs are scaled.
model_knn = KNeighborsClassifier(n_neighbors = 4, metric = 'minkowski', p = 2).fit(X_train_norm, y_train)
model_knn_prediction = model_knn.predict(X_test_norm)
# Accuracy on the held-out test split.
version_knn = accuracy_score(y_test, model_knn_prediction)
version_knn

In [None]:
# Cohen's kappa between the test labels and the KNN predictions.
kappa_knn = cohen_kappa_score(y1=y_test, y2=model_knn_prediction)
kappa_knn

In [None]:
# Random Forest
# random_state makes the bootstrap samples and feature draws repeatable, so
# the score below can be reproduced. n_jobs=-1 builds the 1000 trees on all
# available cores and does not change the fitted model.
# Author's note: a run with 800 trees recorded 0.8902755674679621
# (presumably the accuracy below).
model_rf = RandomForestClassifier(n_estimators=1000, random_state=3, n_jobs=-1).fit(X_train, y_train)
model_rf_prediction = model_rf.predict(X_test)
# Accuracy on the held-out test split.
version_rf = accuracy_score(y_test, model_rf_prediction)
version_rf

In [None]:
# Cohen's kappa between the test labels and the random-forest predictions.
kappa_rf = cohen_kappa_score(y1=y_test, y2=model_rf_prediction) # 800: 0.759638961971176
kappa_rf

In [None]:
# get importance
importance = model_rf.feature_importances_
# summarize feature importance
# Each score is printed with its column name from X so that positions do not
# have to be looked up by hand afterwards.
for i, (feature_name, v) in enumerate(zip(X.columns, importance)):
    print('Feature: %0d (%s), Score: %.5f' % (i, feature_name, v))
# plot feature importance
plt.bar(range(len(importance)), importance)
plt.xlabel('Feature index (column position in X)')
plt.ylabel('Importance')
plt.title('Random forest feature importances')
plt.show()