In [14]:
import pandas as pd

In [15]:
hotel_bookings = pd.read_csv("hotel_bookings_training.csv")

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
hotel_bookings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119190 entries, 0 to 119189
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119190 non-null  object 
 1   is_canceled                     119190 non-null  int64  
 2   lead_time                       119190 non-null  int64  
 3   arrival_date_year               119190 non-null  int64  
 4   arrival_date_month              119190 non-null  object 
 5   arrival_date_week_number        119190 non-null  int64  
 6   arrival_date_day_of_month       119190 non-null  int64  
 7   stays_in_weekend_nights         119190 non-null  int64  
 8   stays_in_week_nights            119190 non-null  int64  
 9   adults                          119190 non-null  int64  
 10  children                        119186 non-null  float64
 11  babies                          119190 non-null  int64  
 12  meal            

In [18]:
# Remove personal information of customers
hotel_bookings = hotel_bookings.drop(['name', 'email', 'phone-number', 'credit_card'], axis=1)

In [19]:
hotel_bookings.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
73987,City Hotel,0,346,2016,September,38,13,0,2,2,...,No Deposit,6.0,,0,Transient-Party,115.0,0,1,Check-Out,2016-09-15
58050,City Hotel,1,61,2017,January,4,25,0,2,2,...,Non Refund,326.0,,0,Transient,80.0,0,0,Canceled,2016-11-25
99594,City Hotel,0,39,2015,August,33,14,0,2,2,...,No Deposit,6.0,,0,Transient-Party,109.0,0,0,Check-Out,2015-08-16
24953,Resort Hotel,0,70,2017,January,5,30,2,5,2,...,No Deposit,40.0,,0,Transient,83.76,0,1,Check-Out,2017-02-06
74931,Resort Hotel,0,71,2017,June,24,13,0,4,2,...,No Deposit,240.0,,0,Transient,170.0,0,1,Check-Out,2017-06-17
5059,City Hotel,1,46,2017,March,10,11,2,1,2,...,No Deposit,9.0,,0,Transient,106.2,0,0,Canceled,2017-02-18
1415,City Hotel,0,46,2016,September,39,19,1,0,2,...,No Deposit,9.0,,0,Transient,125.1,0,1,Check-Out,2016-09-20
77946,City Hotel,1,11,2017,July,29,21,0,1,2,...,No Deposit,9.0,,0,Transient,98.0,0,1,Canceled,2017-07-12
60373,Resort Hotel,1,411,2017,May,22,31,2,4,2,...,Non Refund,,,0,Transient,71.1,0,0,Canceled,2016-04-15
44015,City Hotel,1,34,2017,June,23,5,1,5,1,...,No Deposit,9.0,,0,Transient,130.0,0,1,Canceled,2017-05-03


### EDA

In [20]:
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
profile = ProfileReport(hotel_bookings, title="Pandas Profiling Report")

In [22]:
#profile.to_file("bookings_profile.html")


In [23]:
# Avoid data leakage
hotel_bookings = hotel_bookings.drop(['reservation_status', 'reservation_status_date'], axis=1)

## What are we going to predict?
### We can create a failure prediction model for users.

In [24]:
is_canceled = hotel_bookings['is_canceled'].copy()
hotel_data = hotel_bookings.drop(['is_canceled'], axis=1)

### Split dataset

In [25]:
# Calculate test and validation set size:
original_count = len(hotel_bookings)
training_size = 0.60 # 60% of records
test_size = (1 - training_size) / 2


training_count = int(original_count * training_size)
test_count = int(original_count * test_size)
validation_count = original_count - training_count - test_count

print(training_count, test_count, validation_count, original_count)

71514 23838 23838 119190


In [26]:
from sklearn.model_selection import train_test_split

train_x, rest_x, train_y, rest_y = train_test_split(hotel_data, is_canceled, train_size=training_count)
test_x, validate_x, test_y, validate_y = train_test_split(rest_x, rest_y, train_size=test_count)

print(len(train_x), len(test_x), len(validate_x))

71514 23838 23838


### One-hot encoding

In [27]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

In [28]:
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [29]:
one_hot_encoding = ColumnTransformer([
    (
        'one_hot_encode',
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        [
            "hotel",
            "meal", 
            "distribution_channel", 
            "reserved_room_type", 
            "assigned_room_type", 
            "customer_type"
        ]
    )
])

### Variables transformed to binary

In [30]:
from sklearn.preprocessing import Binarizer

In [31]:
binarizer = Binarizer()

In [32]:
binarizer = ColumnTransformer([
    (
        'binarizer',
        Binarizer(),
        [
            "total_of_special_requests", 
            "required_car_parking_spaces", 
            "booking_changes", 
            "previous_bookings_not_canceled", 
            "previous_cancellations",
        ]
    )
])
    
one_hot_binarized = Pipeline([
    ("binarizer", binarizer),
    ("one_hot_encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

### We scale variables.

In [33]:
from sklearn.preprocessing import RobustScaler

In [34]:
scaler = RobustScaler()

In [35]:
scaler = ColumnTransformer([
    ("scaler", RobustScaler(), ["adr"])
])

### Variables without modification

In [36]:
passthrough = ColumnTransformer([
    (
        "passthrough",
        "passthrough",
        [
            "stays_in_week_nights",
            "stays_in_weekend_nights",
        ]
    )
])

### Complete pipeline

In [37]:
feature_engineering_pipeline = pipe = Pipeline(
    [
        (
            "features",
            FeatureUnion(
                [
                    ("categorical", one_hot_encoding),
                    ("categorical_binarized", one_hot_binarized),
                    ("scaled", scaler),
                    ("pass", passthrough),
                ]
            ),
        )
    ]
)

In [38]:
transformed = feature_engineering_pipeline.fit_transform(train_x)
transformed.shape

(71514, 50)

In [39]:
# Our variables can now be consumed by the model.
transformed

array([[ 0.        ,  1.        ,  0.        , ..., -0.13241525,
         2.        ,  0.        ],
       [ 1.        ,  0.        ,  1.        , ...,  0.37252825,
         3.        ,  0.        ],
       [ 1.        ,  0.        ,  1.        , ...,  0.87394068,
         2.        ,  1.        ],
       ...,
       [ 1.        ,  0.        ,  1.        , ..., -1.6684322 ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  1.        , ..., -0.84216102,
         4.        ,  2.        ],
       [ 1.        ,  0.        ,  0.        , ..., -0.59180791,
         0.        ,  2.        ]], shape=(71514, 50))

### Model training

In [40]:
# Get a fresh copy of the pipeline
from sklearn.base import clone

feature_transformer = clone(feature_engineering_pipeline)

features_train_x = feature_transformer.fit_transform(train_x)
features_validate_x = feature_transformer.transform(validate_x)

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

model = RandomForestClassifier(n_estimators=100)

model.fit(features_train_x, train_y)

### Model validation

In [42]:
from sklearn.metrics import accuracy_score, recall_score

pred_y = model.predict(features_validate_x)

print(accuracy_score(validate_y, pred_y))
print(recall_score(validate_y, pred_y))

0.8052688983975166
0.7128423888639426


### Final pipeline

In [43]:
final_inference_pipeline = Pipeline([
    ("feature_engineering", clone(feature_engineering_pipeline)),
    ("model", RandomForestClassifier(n_estimators=100))
])

In [44]:
final_training_dataset = pd.concat([train_x, validate_x])
final_training_response = pd.concat([train_y, validate_y])

In [45]:
final_inference_pipeline.fit(final_training_dataset, final_training_response)

### Model testing

In [46]:
test_pred_y = final_inference_pipeline.predict(test_x)

print(accuracy_score(test_pred_y, test_y))
print(recall_score(test_pred_y, test_y))

0.8166792516150684
0.7624594691965894


### Model persistence

In [47]:
from joblib import dump

dump(final_inference_pipeline, "inference_pipeline.joblib")

['inference_pipeline.joblib']

## Which clients are we targeting to improve the service?

In [48]:
from joblib import load

ultimate_inference_pipeline = load("inference_pipeline.joblib")

In [49]:
new_customers = pd.read_csv("new_customers.csv")
new_customers.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,name,email,phone-number,credit_card
0,City Hotel,0,0,2016,March,13,22,0,1,2,...,,0,Transient,99.0,0,1,Elizabeth Morton,ElizabethMorton@xfinity.com,218-662-6872,************5891
1,City Hotel,0,21,2016,March,13,23,0,3,2,...,,0,Transient-Party,62.0,0,1,Virginia Ward,Virginia_Ward51@gmail.com,845-529-3632,************1071
2,City Hotel,0,418,2016,September,40,26,1,2,2,...,,223,Transient-Party,107.0,0,0,Joseph Taylor,Joseph_T@protonmail.com,451-454-5767,************5326
3,City Hotel,0,58,2016,March,12,17,0,3,2,...,,0,Transient,63.0,0,0,Sara Allen,Sara_Allen18@yandex.com,402-581-2687,************8597
4,Resort Hotel,0,130,2017,July,28,9,2,0,1,...,,0,Transient-Party,178.0,0,1,John Black,Black.John47@yandex.com,541-901-5663,************9017


In [50]:
new_customers['will_cancel'] = ultimate_inference_pipeline.predict(new_customers)
new_customers[['proba_check_in', 'proba_cancel']] = ultimate_inference_pipeline.predict_proba(new_customers)

### These are the clients we should target...

In [51]:
new_customers[['name', 'phone-number', 'will_cancel', 'proba_cancel']].sort_values(by='proba_cancel', ascending=False).head(20)

Unnamed: 0,name,phone-number,will_cancel,proba_cancel
32,Renee Reed,970-325-8809,1,1.0
8,Carrie Tanner,392-436-1692,1,1.0
46,Katelyn Jones,323-339-3265,1,1.0
43,Joseph Lawson,166-493-3428,1,1.0
91,Donald Alvarez,105-155-9939,1,1.0
63,Deanna Jenkins,229-507-3138,1,1.0
90,Regina Pacheco,350-100-9605,1,1.0
58,Cory Alexander,864-688-3246,1,0.996681
64,Daniel Ortiz,960-672-0720,1,0.996681
40,Bryan Fitzpatrick,601-411-5278,1,0.995
