# modeling 

In [1]:
# imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pycaret
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder
from pycaret.regression import setup, compare_models
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from pycaret.classification import *
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import scipy.stats as stats
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor

In [2]:
df = pd.read_parquet('reservation_cancelation_prediction/hotel_bookings.parquet')
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
# null values
df.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [4]:
df['children'] = df.children.fillna(0)
df['country'] = df.country.fillna("Unknown")
df['agent'] = df.agent.fillna(0)
df['company'] = df.agent.fillna(0)

In [5]:
df.shape

(119390, 32)

In [6]:
categorical_columns = [f for f in df.columns if df[f].dtype == 'O']
categorical_columns

['hotel',
 'arrival_date_month',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'customer_type',
 'reservation_status',
 'reservation_status_date']

In [12]:
base_features = [
    "lead_time",
    "arrival_date_year",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "days_in_waiting_list",
    "adr",
    "required_car_parking_spaces",
    "total_of_special_requests"
]
ordinal_features = [
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "assigned_room_type",
    "customer_type"
]
target_features = [
    "country",
    "booking_changes",
    "agent",
    "company"
]

considered_columns = base_features + ordinal_features + target_features

In [13]:
trainx = df[considered_columns]
trainy = df['is_canceled']
target_col = 'is_canceled'


In [23]:
train_x, test_x, train_y, test_y = train_test_split(trainx, trainy, test_size = 0.2, random_state= 42, stratify=trainy)

In [24]:
train_x.shape, train_y.shape

((95512, 27), (95512,))

In [43]:
## pipeline
# column_transfomer = ColumnTransformer(
#     [('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value= -1),
#     ordinal_features),
#     ('target', TargetEncoder(), target_features)
#     ], remainder = 'passthrough'
# ).set_output(transform = 'pandas')

# column_transfomer



In [44]:
# pipeline = Pipeline(
#     [('column_transformation', column_transfomer)]
# )
pipeline

In [25]:
train_x

Unnamed: 0,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,...,meal,market_segment,distribution_channel,reserved_room_type,assigned_room_type,customer_type,country,booking_changes,agent,company
105447,20,2017,6,7,0,2,1,0.0,0,0,...,BB,Online TA,GDS,A,A,Transient,0.246780,0.408739,0.202052,0.202052
85242,8,2016,11,9,0,3,2,0.0,0,0,...,BB,Groups,TA/TO,A,A,Transient-Party,0.186055,0.408498,0.244617,0.244617
65604,93,2017,14,6,0,3,2,0.0,0,0,...,BB,Offline TA/TO,TA/TO,A,A,Transient,0.567415,0.408739,1.000000,1.000000
17345,90,2015,39,23,4,10,2,0.0,0,0,...,BB,Offline TA/TO,TA/TO,D,D,Contract,0.203447,0.408888,0.228727,0.228727
117786,8,2017,32,9,0,1,1,0.0,0,0,...,BB,Corporate,Corporate,A,A,Transient,0.257152,0.408888,0.244544,0.244544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104906,22,2017,3,19,2,4,1,0.0,0,0,...,BB,Online TA,TA/TO,A,A,Transient,0.243545,0.408522,0.136088,0.136088
50818,18,2016,20,9,1,2,1,0.0,0,0,...,BB,Online TA,TA/TO,D,D,Transient,0.369264,0.408522,0.415607,0.415607
36939,135,2017,22,30,0,3,2,0.0,0,0,...,BB,Online TA,TA/TO,E,E,Transient,0.195383,0.408522,0.137942,0.137942
14587,172,2017,24,14,2,5,2,0.0,0,0,...,BB,Direct,TA/TO,E,E,Transient,0.203447,0.408888,0.244544,0.244544


In [56]:
ordinal_encoder =  OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value= -1).set_output(transform = 'pandas')
target_encoder = TargetEncoder()


train_x[ordinal_features] =  ordinal_encoder.fit_transform(trainx.loc[train_x.index,ordinal_features])
train_x[target_features] =  target_encoder.fit_transform(train_x.loc[train_x.index,target_features], trainy.loc[train_x.index])

test_x[ordinal_features] = ordinal_encoder.transform(trainx.loc[test_x.index, ordinal_features])
test_x[target_features] = target_encoder.transform(trainx.loc[test_x.index, target_features])


## MODELING

### is_canceled is target 

In [69]:
t = pd.concat([train_x, train_y], axis = 1)
t.shape

(95512, 28)

In [70]:
# using pycaret to see the best models that fit our data
models_summary = setup(data = t.select_dtypes(include = 'number'), target = 'is_canceled')

In [71]:
compare_models()

In [67]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=2681, verbose=0, warm_start=False)
rf_model.fit(train_x, train_y)
pred = rf_model.predict(test_x)
pred_prob = rf_model.predict_proba(test_x)[:,1]



In [68]:
from sklearn import metrics

precision_score = metrics.precision_score(test_y, pred)
recall_score = metrics.recall_score(test_y, pred)
accuracy_score = metrics.accuracy_score(test_y, pred)
roc_auc_score = metrics.roc_auc_score(test_y, pred_prob)
print(f"accuracy_score : {accuracy_score}, precision_score : {precision_score}, recall_score : {recall_score}, roc_auc_score : {roc_auc_score}")

accuracy_score : 0.7799648211743027, precision_score : 0.658767353435317, recall_score : 0.8422837761447145, roc_auc_score : 0.8803275379429998


In [61]:
pred

array([0.06, 0.85, 0.96, ..., 0.87, 0.95, 1.  ])

In [72]:
gboost = GradientBoostingClassifier(random_state = 42)
gboost.fit(train_x, train_y)
pred = gboost.predict(test_x)
pred_prob = gboost.predict_proba(test_x)[:,1]



precision_score = metrics.precision_score(test_y, pred)
recall_score = metrics.recall_score(test_y, pred)
accuracy_score = metrics.accuracy_score(test_y, pred)
roc_auc_score = metrics.roc_auc_score(test_y, pred_prob)
print(f"accuracy_score : {accuracy_score}, precision_score : {precision_score}, recall_score : {recall_score}, roc_auc_score : {roc_auc_score}")


accuracy_score : 0.7496021442331854, precision_score : 0.627400426742532, recall_score : 0.7978518937252685, roc_auc_score : 0.8494881789552339


In [76]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=8264, verbose=0, warm_start=False)
et.fit(train_x, train_y)

pred = et.predict(test_x)
pred_prob = et.predict_proba(test_x)[:,1]



precision_score = metrics.precision_score(test_y, pred)
recall_score = metrics.recall_score(test_y, pred)
accuracy_score = metrics.accuracy_score(test_y, pred)
roc_auc_score = metrics.roc_auc_score(test_y, pred_prob)
print(f"accuracy_score : {accuracy_score}, precision_score : {precision_score}, recall_score : {recall_score}, roc_auc_score : {roc_auc_score}")


accuracy_score : 0.7723008627188207, precision_score : 0.646165723108595, recall_score : 0.8516676088185415, roc_auc_score : 0.8830422476994929


In [78]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(train_x, train_y)

pred = xgb.predict(test_x)
pred_prob = xgb.predict_proba(test_x)[:,1]



precision_score = metrics.precision_score(test_y, pred)
recall_score = metrics.recall_score(test_y, pred)
accuracy_score = metrics.accuracy_score(test_y, pred)
roc_auc_score = metrics.roc_auc_score(test_y, pred_prob)
print(f"accuracy_score : {accuracy_score}, precision_score : {precision_score}, recall_score : {recall_score}, roc_auc_score : {roc_auc_score}")


accuracy_score : 0.749267107797973, precision_score : 0.6193817878028405, recall_score : 0.8382136800452233, roc_auc_score : 0.8556416960508626
