In [None]:
# Competition: https://www.kaggle.com/competitions/ih-hotel-booking
# I'd much appreciate it if you added your comments!

from string import ascii_uppercase

import numpy as np
import pandas as pd
from fast_ml.feature_engineering import FeatureEngineering_DateTime
from joblib import dump, load
from lazypredict.Supervised import LazyClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# show every row when a Series/DataFrame is displayed (no truncation)
pd.options.display.max_rows = None

In [None]:
# loading data; the booking id becomes the index of the frame
TRAINTEST_CSV = "../input/ih-hotel-booking/tb_hotel_traintest.csv"
df = pd.read_csv(TRAINTEST_CSV, index_col='id_booking')

In [None]:
# check empty values (result is the percentage of rows that are null, per column)
null_percent = df.isnull().sum() * 100 / len(df)
null_percent.round(2).sort_values(ascending=False)
# result (only 3 columns have empty values):
# company                          94.32
# agent                            13.66
# country                           0.40
# arrival_date                      0.00
# is_repeated_guest                 0.00
# ......................................

In [None]:
# Now look at the "company" and "agent" columns. Both of them are categorical!
# "company" is filled in for only about 6% of the rows, but (I think) we should
# not drop it until we have shown, using "mutual information", that there is no
# strong connection between it and the target (column "is_cancelled").
# See the Kaggle course: https://www.kaggle.com/code/ryanholbrook/mutual-information/tutorial
# Let's clean all the data first.

In [None]:
# fill missing values: 'No' for the country code, zero for the numeric ids/counts
fill_values = {'country': 'No', 'agent': 0, 'children': 0, 'company': 0}
for column, filler in fill_values.items():
    df[column] = df[column].fillna(value=filler)

In [None]:
# change type: these columns hold whole numbers once the gaps are filled
df = df.astype({'children': 'int64', 'agent': 'int64', 'company': 'int64'})

In [None]:
# convert dates to the right format.
# `infer_datetime_format` is intentionally not passed: it only applies when no
# explicit `format` is given, and it is deprecated since pandas 2.0.
for date_col in ('reservation_status_date', 'arrival_date'):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")

# make new features from dates
# NOTE(review): `reservation_status_date` presumably records when the booking's
# final status (e.g. the cancellation) was set. If so, features derived from it
# leak the target "is_cancelled" - confirm against the competition's data
# description before relying on the validation score.
dtf = FeatureEngineering_DateTime()
dtf.fit(df, datetime_variables=['reservation_status_date', 'arrival_date'])
df = dtf.transform(df)
# ML can only operate with int, float or bool.
# There is no sense in converting the raw datetime columns to that format,
# so they are dropped once the derived features exist.
df = df.drop(['reservation_status_date', 'arrival_date'], axis=1)

In [None]:
# let's transform amount of food to integer values ('Undefined' is 'SC').
# `map` is used instead of `replace`: replacing strings with ints relies on
# pandas silently downcasting the object column, which is deprecated
# (pandas 2.2). Without the downcast the column stays `object` and would later
# be one-hot encoded by `get_dummies` instead of kept as an ordinal number.
# Meal codes missing from the mapping become NaN.
meal_rank = {'Undefined': 0, 'SC': 0, 'BB': 1, 'HB': 2, 'FB': 3}
df['meal'] = df['meal'].map(meal_rank)

In [None]:
# check if we missed some "Undefined" in other columns:
# turn them into NaN and look at the null percentages again
df = df.replace(to_replace='Undefined', value=np.nan)
null_percent = df.isnull().sum() * 100 / len(df)
null_percent.round(2).sort_values(ascending=False)

In [None]:
# "country" column contain 174 different values. It is too much.
# Let's convert it to regions
# Get table from here: https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
country = pd.read_csv('res/ISO-3166-Countries-with-Regional-Codes.csv')
country = country[['alpha-3', 'region']]
country.columns = ['country', 'region']
# We must not lose the booking index: it is needed later for prediction with
# the validation set. The previous `merge(on='country', left_index=True,
# right_index=True)` mixed two contradictory join keys: pandas rejects that
# combination, and an index join would pair booking ids with row numbers of
# the country table instead of matching country codes. Looking the code up
# with `Series.map` matches on the country code only and leaves df's index
# and row order untouched. Codes that are absent from the table (e.g. the
# 'No' placeholder used for missing countries) get NaN as their region.
# NOTE(review): assumes df['country'] holds ISO alpha-3 codes - confirm.
region_by_country = (
    country.drop_duplicates(subset='country')  # map() needs a unique lookup index
           .set_index('country')['region']
)
df['region'] = df['country'].map(region_by_country)
# now we have "region" and "country" column

In [None]:
# let's take a look at "reserved_room_type" and "assigned_room_type". Values in
# these columns are good candidates for transforming into integer values:
# A    70324
# D    24074
# E     7383
# F     3569
# G     2416
# C     2264
# B     2078
# H      684
# I      347
# K      258
# P       11
# L        1
# Machine learning algorithms usually treat a bigger value as "better".
# That logic fits our columns (author's assumption), so convert the letters
# to their position in the alphabet: A -> 1, B -> 2, ..., Z -> 26.
replece_dict = {letter: rank for rank, letter in enumerate(ascii_uppercase, start=1)}
# `map` is used instead of `replace`: replacing strings with ints relies on
# pandas silently downcasting the object column, which is deprecated
# (pandas 2.2). Without the downcast the columns stay `object`, the difference
# below becomes an object column too, and `get_dummies` would one-hot encode
# all of them. Values that are not a single uppercase letter become NaN.
for room_col in ('reserved_room_type', 'assigned_room_type'):
    df[room_col] = df[room_col].map(replece_dict)
# using these two columns we can make one more that shows the
# difference between expectation and reality:
df['room_type_diff'] = df['reserved_room_type'] - df['assigned_room_type']
# we need to understand whether it was a good experience or a bad one:
def positive_zero_negative(x):
    """Reduce a number to a three-way flag.

    Returns 1 when ``x`` is greater than zero, 0 when it equals zero and -1 in
    every other case (negative numbers, and values such as NaN that are
    neither equal to nor greater than zero).
    """
    if x > 0:
        return 1
    if x == 0:
        return 0
    return -1

# keep only the direction of the room type difference (-1, 0 or 1)
room_diff_sign = df['room_type_diff'].map(positive_zero_negative)
df['room_type_diff_pos_neg'] = room_diff_sign

In [None]:
# now let's convert the object columns using one-hot encoding. Logically the
# "agent" and "company" columns contain categories rather than numbers, so
# they are cast to object first to be encoded together with the other
# object columns:
df = pd.get_dummies(
    df.astype({'agent': 'object', 'company': 'object'}),
    prefix_sep='__',
)
# now we have 935 columns...

In [None]:
# we've made a lot of new columns. Now we will delete some. To determine
# which columns should be deleted we'll use mutual information:
def make_mi_scores(X, y, discrete_features='auto', random_state=0):
    """Score features by their mutual information with a categorical target.

    The target of this notebook ("is_cancelled") is a class label, so the
    classification estimator ``mutual_info_classif`` is used; the regression
    variant is meant for continuous targets.

    Parameters
    ----------
    X : pd.DataFrame or pd.Series
        Feature(s) to score. A single Series is treated as a one-column frame.
    y : array-like
        Discrete target, one value per row of ``X``.
    discrete_features : 'auto', bool or array-like, default 'auto'
        Passed through to ``mutual_info_classif``.
    random_state : int or None, default 0
        Passed through to ``mutual_info_classif`` so that repeated runs give
        the same scores.

    Returns
    -------
    pd.Series
        Scores named "MI Scores", indexed by column name and sorted from the
        highest to the lowest.
    """
    if isinstance(X, pd.Series):
        X = X.to_frame()
    mi_scores = mutual_info_classif(
        X, y,
        discrete_features=discrete_features,
        random_state=random_state,
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

In [None]:
# we have to find features that don't affect the target:
# score every column except the target itself, one column at a time
feature_columns = [col for col in df.columns if 'is_cancelled' not in col]
df_scores = pd.concat(
    [make_mi_scores(df[col], df['is_cancelled']) for col in feature_columns]
)
df_scores.sort_values(ascending=False)
# results:
# adr                                        0.14
# arrival_date:year                          0.10
# country__PRT                               0.10
# ...............................................
# At the end of that list there are only columns that can't make any
# significant impact on the target value "is_cancelled". Most of them are
# "agent" and "company" features. Let's delete all with a score below 0.005:
low_score_columns = df_scores[df_scores < 0.005].index.to_list()
df_valueble = df.drop(columns=low_score_columns)

In [None]:
# it's time to make the ML model: split off the target and hold out 20 % of
# the rows for validation.
y = df_valueble['is_cancelled'].copy()
X = df_valueble.drop(columns='is_cancelled')

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)

# checking accuracy of our model: percentage of validation rows whose
# predicted label equals the true one
pred_compare = y_pred == y_valid.to_numpy()
pred_prcnt = round(100 * int(pred_compare.sum()) / len(pred_compare), 2)
print(f'Probability of successful prediction = {pred_prcnt} %')
# Probability of successful prediction > 99 %