In [507]:
import numpy as np

In [508]:
import pandas as pd

In [509]:
df = pd.read_csv("hotel_bookings.csv")

In [510]:
df.head(2)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01


In [511]:
# Here we can see nulls
# df.info()

In [512]:
df['children'].fillna(df['children'].median(), inplace=True) # 0.003% misses
df['country'].fillna(df['country'].mode()[0], inplace=True) # 0.4% misses

In [513]:
df.drop(['company'], axis=1, inplace=True) # 94% misses
df.drop(['agent'], axis=1, inplace=True) # 14% misses
df.drop(['reservation_status'], axis=1, inplace=True) # affects is_canceled

In [514]:
# Here we can see that nulls have gone
# df.info()

In [515]:
df['arrival_date_month'].value_counts()

August       13877
July         12661
May          11791
October      11160
April        11089
June         10939
September    10508
March         9794
February      8068
November      6794
December      6780
January       5929
Name: arrival_date_month, dtype: int64

In [516]:
arrivalMonthMap = {'January': 0, 'February': 1, 'March': 2, 'April': 3, 'May': 4, 'June': 5, 'July':6, 'August':7, 'September': 8, 'October': 9, 'November': 10, 'December': 11}
df['arrival_date_month'] = df['arrival_date_month'].map(arrivalMonthMap)

In [517]:
df['meal'].value_counts()

BB           92310
HB           14463
SC           10650
Undefined     1169
FB             798
Name: meal, dtype: int64

In [518]:
mealMap = {"SC": 0, "Undefined": 0, "BB": 1, "HB": 2, "FB": 3}
df['meal'] = df['meal'].map(mealMap)

In [519]:
df['deposit_type'].value_counts()

No Deposit    104641
Non Refund     14587
Refundable       162
Name: deposit_type, dtype: int64

In [520]:
depositTypeMap = {"No Deposit": 0, "Non Refund": 1, "Refundable": 2}
df['deposit_type'] = df['deposit_type'].map(depositTypeMap)

In [521]:
df['reservation_status_date'].head(2)

0    2015-07-01
1    2015-07-01
Name: reservation_status_date, dtype: object

In [522]:
# dividing reservation_status_date
reservStatusDate = pd.to_datetime(df['reservation_status_date'], format='%Y-%m-%d')

df['reservation_status_date_day_of_month'] = reservStatusDate.dt.day
df['reservation_status_date_month'] = reservStatusDate.dt.month
df['reservation_status_date_year'] = reservStatusDate.dt.year

df.drop(['reservation_status_date'], axis=1, inplace=True) # unecessary column

In [523]:
df['hotel'] = pd.factorize(df['hotel'])[0]

In [524]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 31 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   hotel                                 119390 non-null  int64  
 1   is_canceled                           119390 non-null  int64  
 2   lead_time                             119390 non-null  int64  
 3   arrival_date_year                     119390 non-null  int64  
 4   arrival_date_month                    119390 non-null  int64  
 5   arrival_date_week_number              119390 non-null  int64  
 6   arrival_date_day_of_month             119390 non-null  int64  
 7   stays_in_weekend_nights               119390 non-null  int64  
 8   stays_in_week_nights                  119390 non-null  int64  
 9   adults                                119390 non-null  int64  
 10  children                              119390 non-null  float64
 11  

In [525]:
df = pd.concat([df,  pd.get_dummies(df['country'], prefix="country")],  axis=1)
df = pd.concat([df,  pd.get_dummies(df['market_segment'], prefix="market_segment")],  axis=1)
df = pd.concat([df,  pd.get_dummies(df['distribution_channel'], prefix="distribution_channel")],  axis=1)
df = pd.concat([df,  pd.get_dummies(df['reserved_room_type'], prefix="reserved_room_type")],  axis=1)
df = pd.concat([df,  pd.get_dummies(df['assigned_room_type'], prefix="assigned_room_type")],  axis=1)
df = pd.concat([df,  pd.get_dummies(df['customer_type'], prefix="customer_type")],  axis=1)

df.drop(['country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type'], axis=1, inplace=True)

In [526]:
# Here we can see that objects have gone
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Columns: 241 entries, hotel to customer_type_Transient-Party
dtypes: float64(2), int64(23), uint8(216)
memory usage: 47.4 MB


In [527]:
#
#  Model Learning
#

In [528]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Columns: 241 entries, hotel to customer_type_Transient-Party
dtypes: float64(2), int64(23), uint8(216)
memory usage: 47.4 MB


In [529]:
target=df['is_canceled']
df.drop(['is_canceled'], axis=1, inplace=True)

In [530]:
randomState = 30

In [531]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.4, random_state=randomState)

In [532]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71634 entries, 116867 to 38693
Columns: 240 entries, hotel to customer_type_Transient-Party
dtypes: float64(2), int64(22), uint8(216)
memory usage: 28.4 MB


In [533]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [539]:
dtc = DecisionTreeClassifier(criterion='gini', min_samples_split=5)

In [540]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [541]:
dtc.score(X_train, y_train)

0.9909679761007343

In [542]:
dtc.score(X_test, y_test)

0.9368246921852752

In [544]:
# Найдём оптимальные параметры
from sklearn.model_selection import GridSearchCV

In [556]:
param_grid = {'criterion': ['gini','entropy'],
            'splitter': ['best','random'],
            'max_depth': [4,5,6,None],
            'min_samples_split': [4,5,6],
            'min_samples_leaf': [2,3,4]}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)

In [557]:
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                   

In [560]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'splitter': 'best'}

In [561]:
grid.score(X_test, y_test) # лучше чем до подборка на 0.005 (5 п.п. или 50 б.п.)

0.9419758773766647