# **Import libraries**

In [54]:
import numpy as np
import pandas as pd
import keras
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [18]:
import os
# Select the TensorFlow backend for Keras via the KERAS_BACKEND environment variable.
# NOTE(review): `keras` is already imported in the import cell above, so this
# assignment runs after the import; Keras is documented to read this variable
# at import time, so it presumably has no effect here — confirm and, if the
# setting matters, move it before `import keras`.
os.environ["KERAS_BACKEND"] = "tensorflow"

# **Data set**

<h2 dir=rtl align=right style="line-height:200%;font-family:vazir;color:#0099cc">

</h2>



<p dir=rtl style="direction: rtl; text-align: justify; line-height:200%; font-family:vazir; font-size:medium">




<center>
<div dir=rtl style="direction: rtl;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=3>

| نام ستون | توضیحات ستون |
|:----------:|:------------------:|
| <code>user</code> | شناسه کاربر |
| <code>user_location_country</code> | شناسه‌ی کشور کاربر |
| <code>user_location_region</code> | شناسه‌ی منطقه‌ی کاربر |
| <code>user_location_city</code> | شناسه‌ی شهر کاربر |
| <code>destination_distance</code> | فاصله‌ی فیزیکی بین کاربر و هتل در زمان جست‌وجو‌ (مقدار <code>null</code> به معنی عدم توانایی محاسبه فاصله است) |
| <code>search_date</code> | زمان انجام جست‌وجو |
| <code>is_mobile</code> | آیا کاربر با دستگاه موبایل جست‌وجو انجام داده است؟ |
| <code>is_package</code> | آیا کاربر در حال جست‌وجوی هتل به‌همراه بلیط حمل‌و‌نقل (اتوبوس، هواپیما یا قطار) است؟|
| <code>channel</code> | کاربر از چه کانالی وارد سایت شده‌است؟ (تبلیغات پیامکی، تبلیغات شبکه‌ی اجتماعی، ورود مستقیم و ...)  |
| <code>search_count</code> |تعداد جست‌وجو‌های مشابه در لحظه‌ی کاربر (در همان نشست یا ‌<i>session</i>)|
| <code>checkIn_date</code> | تاریخ ورود به هتل |
| <code>checkOut_date</code> | تاریخ خروج از هتل |
| <code>n_adults</code> | تعداد افراد بالغ اعلام‌شده جهت رزرو هتل |
| <code>n_children</code> | تعداد کودکان اعلام‌شده جهت رزرو هتل |
| <code>n_rooms</code> | تعداد اتاق مورد نظر برای رزرو |
| <code>destination</code> | شناسه‌ی محل هتل مورد نظر کاربر (با هم‌پوشانی همچون نیویورک، نیویورک سیتی، سنترال پارک و غیره) |
| <code>destination_type</code> | کد نوع هتل مورد نظر کار‌بر |
| <code>hotel_continent</code> | شناسه‌ی قاره‌ی هتل مورد نظر کاربر |
| <code>hotel_country</code> | شناسه‌ی کشور هتل مورد نظر کاربر |
| <code>hotel_market</code> | ناحیه‌ای که هتل مورد نظر کاربر در آن قرار گرفته‌است (بدون هم‌پوشانی همچون نیویورک، بوستون و غیره)|
| <code>hotel_category</code> | گرو‌ه‌بندی هتلی که جزییاتش را مشاهده می‌کنند. این گروه‌بندی می‌تواند بر اساس مواردی مانند چندستاره بودن و یا نوع هتل باشد. |
| <code>is_booking</code> | آیا کاربر در نهایت، هتل مشاهده‌شده را رزرو کرده یا خیر؟ |


</font>
</div>
</center>

In [19]:
# Upgrade the gdown downloader, then fetch the dataset archive from Google
# Drive by its file id (the recorded run saved it as will_not_travel_again_data.zip).
# NOTE(review): the gdown version is not pinned, so re-runs may install a different release.
!pip3 install --upgrade --no-cache-dir gdown
!gdown 1jIl8uXsrdRz3ZBxljjQNo0ssDT8LbJBx

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 5.1.0
    Uninstalling gdown-5.1.0:
      Successfully uninstalled gdown-5.1.0
Successfully installed gdown-5.2.0
Downloading...
From (original): https://drive.google.com/uc?id=1jIl8uXsrdRz3ZBxljjQNo0ssDT8LbJBx
From (redirected): https://drive.google.com/uc?id=1jIl8uXsrdRz3ZBxljjQNo0ssDT8LbJBx&confirm=t&uuid=4b9d6028-6d8d-4f77-8947-afdbd5718452
To: /content/will_not_travel_again_data.zip
100% 188M/188M [00:05<00:00, 32.8MB/s]


In [20]:
# Extract the downloaded archive; the recorded run produced data/train.csv and data/test.csv.
!unzip will_not_travel_again_data.zip

Archive:  will_not_travel_again_data.zip
   creating: data/
  inflating: data/test.csv           
  inflating: data/train.csv          


# **Data set reading**

In [21]:
# Load the training split using the pyarrow CSV parser.
train = pd.read_csv('./data/train.csv', engine = 'pyarrow')

# Preview the first rows as the cell output.
train.head()

Unnamed: 0,user,user_location_country,user_location_region,user_location_city,destination_distance,search_date,is_mobile,is_package,channel,search_count,...,n_adults,n_children,n_rooms,destination,destination_type,hotel_continent,hotel_country,hotel_market,hotel_category,is_booking
0,461899,3,50,5703,,2013-01-07 00:00:02,0,0,9,1,...,2,1,1,669,3,2,50,212,41,0
1,13796,66,174,21177,5713.6206,2013-01-07 00:00:06,0,0,9,3,...,1,0,1,8821,1,6,17,30,58,0
2,1128575,205,155,14703,795.7298,2013-01-07 00:00:06,0,0,9,1,...,1,0,1,25064,6,2,50,1230,91,0
3,1080476,69,761,41949,,2013-01-07 00:00:17,0,1,9,1,...,2,0,1,7635,3,2,50,675,10,0
4,1080476,69,761,41949,,2013-01-07 00:00:23,0,1,9,1,...,2,0,1,7635,3,2,50,675,10,0


In [8]:
# Print column dtypes and non-null counts of the freshly loaded frame.
train.info()

# **Data mining and feature engineering**

In [22]:
# Remove the `user` identifier column from the training frame.
train = train.drop(columns='user')

In [9]:
# Re-check dtypes and non-null counts after removing the `user` column.
train.info()

In [23]:
# Class balance of `is_booking` among rows whose check-in date is missing.
train.loc[train['checkIn_date'].isna(), 'is_booking'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
is_booking,Unnamed: 1_level_1
0,1.0


In [24]:
# Class balance of `is_booking` among rows whose check-out date is missing.
train.loc[train['checkOut_date'].isna(), 'is_booking'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
is_booking,Unnamed: 1_level_1
0,1.0


In [25]:
# Discard every row that lacks a check-in or a check-out date.
train = train.dropna(subset=['checkIn_date', 'checkOut_date'])

In [10]:
# Re-check dtypes and non-null counts after dropping rows with missing dates.
train.info()

In [26]:
# Fraction of rows whose destination_distance is missing.
train['destination_distance'].isna().mean()

0.35762694533684297

In [27]:
# Mean distance of each (user city, destination) pair, aligned row-by-row with `train`.
mean = train.groupby(['user_location_city','destination'])['destination_distance'].transform('mean')

# Fill missing distances with the mean of their pair. The result is assigned
# back to the column instead of calling fillna(inplace=True) on
# `train[...]`: that chained in-place form acts on an intermediate Series,
# is deprecated, and no longer updates `train` under pandas Copy-on-Write.
train['destination_distance'] = train['destination_distance'].fillna(mean)

In [11]:
# Re-check non-null counts after the group-mean imputation of destination_distance.
train.info()

In [29]:
# Show rows whose distance is exactly 0 (the recorded output is empty, so 0
# is free to be used as a placeholder for the remaining missing distances).
train.loc[train['destination_distance'].eq(0)]

Unnamed: 0,user_location_country,user_location_region,user_location_city,destination_distance,search_date,is_mobile,is_package,channel,search_count,checkIn_date,...,n_adults,n_children,n_rooms,destination,destination_type,hotel_continent,hotel_country,hotel_market,hotel_category,is_booking


In [30]:
# Replace the distances that are still missing with 0. Assigned back to the
# column rather than using the chained fillna(inplace=True) form, which is
# deprecated and does not modify `train` under pandas Copy-on-Write.
train['destination_distance'] = train['destination_distance'].fillna(0)

In [13]:
# Count the missing values left in each column.
train.isna().sum()

In [14]:
# Re-check dtypes and non-null counts once all imputation steps are done.
train.info()

# **Time characteristics**

<h3 align=right style="line-height:200%;font-family:vazir;color:#0099cc">

</font>
    
```python
df['time_type'] = pd.to_datetime(df['time_string'])
```
</p>

In [31]:
# Convert the three timestamp columns to pandas datetime values.
time_columns = ['search_date', 'checkIn_date', 'checkOut_date']
train = train.assign(**{name: pd.to_datetime(train[name]) for name in time_columns})

In [15]:
# Re-check dtypes after converting the time columns to datetime.
train.info()

In [32]:
# Length of stay: whole-day component of (check-out - check-in).
duration = train['checkOut_date'].sub(train['checkIn_date'])
train['duration'] = duration.dt.days.astype(int)

# Lead time: whole-day component of (check-in - search timestamp).
# NOTE(review): search_date carries a time of day, so `.dt.days` presumably
# yields -1 for a search made on the check-in day itself — confirm this is intended.
days_between = train['checkIn_date'].sub(train['search_date'])
train['days_between'] = days_between.dt.days.astype(int)

In [16]:
# Re-check the frame after adding the `duration` and `days_between` columns.
train.info()

In [33]:
# Drop the intermediate Series that are no longer needed, to release memory.
del days_between, duration, mean

In [34]:
# Derive calendar features from the search and check-in timestamps.
search_parts = train['search_date'].dt
checkin_parts = train['checkIn_date'].dt
train = train.assign(
    search_date_hour=search_parts.hour,
    search_date_dayofweek=search_parts.dayofweek,
    checkIn_date_dayofweek=checkin_parts.dayofweek,
    search_date_month=search_parts.month,
    checkIn_date_month=checkin_parts.month,
)

In [35]:
# Split the frame by outcome for the plots below. If RAM is tight, select
# only the columns the plots need before filtering.
booking_flag = train['is_booking']
is_booked = train.loc[booking_flag == True]
not_booked = train.loc[booking_flag == False]

# **illustration 1**

In [36]:
# Share of searches per hour of day, normalised within each group so the two
# groups are comparable regardless of their sizes.
# NOTE(review): no `x` is given, so bars are placed by position (0..n-1);
# this matches the hour only if every hour 0-23 occurs in both groups.
hour_share_not_booked = not_booked['search_date_hour'].value_counts().sort_index() / len(not_booked)
hour_share_is_booked = is_booked['search_date_hour'].value_counts().sort_index() / len(is_booked)

trace_not_booked = go.Bar(y=hour_share_not_booked, name='Not Booked')
trace_is_booked = go.Bar(y=hour_share_is_booked, name='Booked')
data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis={'title': 'Search Hour', 'tickangle': 45, 'automargin': True},
    yaxis={'title': 'Frequency'},
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('./search_hour.json')

In [38]:
# Two-way bin of the search hour: 05:00-15:59 is 'morning', everything else 'night'.
is_daytime_search = train['search_date_hour'].between(5, 15)
train['search_time_of_day'] = np.where(is_daytime_search, 'morning', 'night')

# **illustration 2**

In [39]:
# Share of check-ins per weekday, normalised within each group.
# Bars are placed by position 0..6, which the tick labels map to Monday..Sunday.
weekday_share_not_booked = not_booked['checkIn_date_dayofweek'].value_counts().sort_index() / len(not_booked)
weekday_share_is_booked = is_booked['checkIn_date_dayofweek'].value_counts().sort_index() / len(is_booked)

trace_not_booked = go.Bar(y=weekday_share_not_booked, name='Not Booked')
trace_is_booked = go.Bar(y=weekday_share_is_booked, name='Booked')
data = [trace_is_booked, trace_not_booked]

ticktext = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

layout = go.Layout(
    xaxis={
        'title': 'Day of Week',
        'tickangle': 45,
        'automargin': True,
        'tickvals': list(range(7)),
        'ticktext': ticktext,
    },
    yaxis={'title': 'Frequency'},
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('./checkIn_day.json')

# **illustration 3**

In [40]:
# Share of check-ins per calendar month, normalised within each group.
# NOTE(review): bars are placed by position 0..11 and labelled Jan..Dec via
# the ticks; this assumes all twelve months occur in both groups.
month_share_not_booked = not_booked['checkIn_date_month'].value_counts().sort_index() / len(not_booked)
month_share_is_booked = is_booked['checkIn_date_month'].value_counts().sort_index() / len(is_booked)

trace_not_booked = go.Bar(y=month_share_not_booked, name='Not Booked')
trace_is_booked = go.Bar(y=month_share_is_booked, name='Booked')
data = [trace_is_booked, trace_not_booked]

ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

layout = go.Layout(
    xaxis={
        'title': 'Month',
        'tickangle': 45,
        'automargin': True,
        'ticktext': ticktext,
        'tickvals': np.arange(12),
    },
    yaxis={'title': 'Frequency'},
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('./checkIn_date_month.json')

In [41]:
# Map the check-in month to a season label.
months_by_season = {
    'Winter': (12, 1, 2),
    'Spring': (3, 4, 5),
    'Summer': (6, 7, 8),
    'Fall': (9, 10, 11),
}
seasons_dict = {month: season for season, months in months_by_season.items() for month in months}

train['checkIn_of_year'] = train['checkIn_date_month'].map(seasons_dict)

# **illustration 4**

In [42]:
# Distribution of the lead time (days between search and check-in),
# normalised within each group.
lead_share_not_booked = not_booked['days_between'].value_counts().sort_index() / len(not_booked)
lead_share_is_booked = is_booked['days_between'].value_counts().sort_index() / len(is_booked)

# Pass the day values explicitly as x. With only `y`, plotly places points by
# position (0, 1, 2, ...), which mislabels the axis whenever the observed
# values are not exactly 0..n-1 (e.g. negative or missing day counts).
trace_not_booked = go.Scatter(x=lead_share_not_booked.index, y=lead_share_not_booked,
                              name='Not Booked', opacity=0.5)
trace_is_booked = go.Scatter(x=lead_share_is_booked.index, y=lead_share_is_booked,
                             name='Booked')

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Days between search and checking time', tickangle=45, automargin=True),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('./days_between.json')

# **illustration 5**

In [43]:
# Distribution of the length of stay in days, normalised within each group.
stay_share_not_booked = not_booked['duration'].value_counts().sort_index() / len(not_booked)
stay_share_is_booked = is_booked['duration'].value_counts().sort_index() / len(is_booked)

# Pass the stay lengths explicitly as x. With only `y`, plotly places points
# by position (0, 1, 2, ...), so any gap in the observed durations would
# shift later points onto the wrong x value.
trace_not_booked = go.Scatter(x=stay_share_not_booked.index, y=stay_share_not_booked,
                              name='Not Booked', opacity=0.5)
trace_is_booked = go.Scatter(x=stay_share_is_booked.index, y=stay_share_is_booked,
                             name='Booked')

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Length of Stay', tickangle=45, automargin=True),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_json('./los.json')

In [44]:
# Release the plot traces and the per-class frames now that the plots are done.
del trace_not_booked, trace_is_booked, is_booked, not_booked, data

In [46]:
# is_abroad = 1 when the hotel's country differs from the user's country,
# 0 when they match. The previous version set the flag to 1 for *matching*
# countries, i.e. the opposite of what the column name says.
# NOTE(review): assumes both columns use the same country-id encoding — confirm.
train['is_abroad'] = (train['user_location_country'] != train['hotel_country']).astype(int)

In [47]:
# The raw timestamp columns are dropped here, after the derived time features were added.
train = train.drop(['search_date', 'checkIn_date', 'checkOut_date'], axis='columns')

In [48]:
# One-hot encode the listed categorical columns; each is replaced by one
# indicator column per distinct value.
dummy_columns = ['channel'] #TODO

train = pd.get_dummies(train, columns = dummy_columns)

In [49]:
# Turn the string-valued bin columns into integer codes.
le = LabelEncoder()

columns_to_encode = ['search_time_of_day','checkIn_of_year']

for column in columns_to_encode:
    # Plain column assignment replaces the column, so it takes the encoder's
    # integer dtype. `train.loc[:, column] = ...` instead writes the integers
    # into the existing string/object column, keeping (or clashing with) its dtype.
    train[column] = le.fit_transform(train[column])

In [50]:
# Fix the column set and order used for modelling. To drop more columns,
# extend `drop_columns`.
drop_columns = ['user_location_country','hotel_country','destination']
train_columns = train.drop(columns = drop_columns).columns

# Index.drop returns a new Index. The original bare call
# `train_columns.drop('is_booking')` threw its result away, so it was a no-op.
# The feature-only column list is kept under its own name; `train_columns`
# still contains the target because later cells read train['is_booking'].
feature_columns = train_columns.drop('is_booking')

train = train[train_columns]

In [51]:
# Balance the classes by randomly discarding most non-booked rows
# (class weights on the model would be an alternative).
NEGATIVE_DROP_FRACTION = 0.905  # share of non-booked rows to remove

not_booked_rows = train[train['is_booking'] == False]
# A fixed random_state makes the undersampling reproducible across re-runs;
# without it every run trains on a different subset.
rows_to_drop = not_booked_rows.sample(frac=NEGATIVE_DROP_FRACTION, random_state=42).index
train = train.drop(index=rows_to_drop)

# Cast every column to float32 for the network input.
train = train.astype(np.float32)

In [57]:
# Hold out 5% of the balanced data for the final evaluation. A fixed
# random_state keeps the split (and therefore the reported score)
# reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns = ['is_booking']),
    train['is_booking'],
    test_size = 0.05,
    random_state = 42,
)

# **Modeling**

In [58]:
# Feed-forward binary classifier: the leading BatchNormalization normalises
# the inputs (this could also be done during data preparation), followed by
# two ReLU hidden layers and a single sigmoid output unit.
model = keras.models.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dropout(0.1),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(50, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [59]:
# TODO : Compile the Model
# Adam optimizer with default settings and binary cross-entropy loss for the
# single sigmoid output. The 'auc' metric is what the training cell monitors
# as 'val_auc'.
model.compile(optimizer = keras.optimizers.Adam(),
              loss = keras.losses.BinaryCrossentropy(),
              metrics = ['auc'])

In [67]:
# Train the model, keeping the best weights by validation AUC.
epochs = 100
BATCH_SIZE = 4096

# The callbacks are referenced through `keras.callbacks` because the notebook
# never imports ModelCheckpoint / EarlyStopping by name; the bare names raise
# NameError on a fresh kernel.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'best_model.keras',
    save_best_only=True,
    monitor='val_auc',
    mode='max',
    verbose=1
)
# mode='max' is spelled out to match the checkpoint callback (higher AUC is better).
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=20,
    restore_best_weights=True
)

# 10% of the training rows are held out by Keras as the validation set that
# produces the monitored 'val_auc'.
history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=epochs,
                    validation_split=0.1,
                    callbacks=[checkpoint_cb, early_stopping_cb])

Epoch 1/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - auc: 0.7708 - loss: 0.5515
Epoch 1: val_auc improved from -inf to 0.77026, saving model to best_model.keras
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - auc: 0.7708 - loss: 0.5515 - val_auc: 0.7703 - val_loss: 0.5528
Epoch 2/100
[1m269/276[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - auc: 0.7704 - loss: 0.5518
Epoch 2: val_auc did not improve from 0.77026
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - auc: 0.7705 - loss: 0.5518 - val_auc: 0.7701 - val_loss: 0.5529
Epoch 3/100
[1m263/276[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - auc: 0.7709 - loss: 0.5511
Epoch 3: val_auc did not improve from 0.77026
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - auc: 0.7709 - loss: 0.5511 - val_auc: 0.7701 - val_loss: 0.5529
Epoch 4/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━

# **Prediction**

In [68]:
# ROC AUC of the model's predicted probabilities on the held-out split.
test_probabilities = model.predict(X_test)
roc_auc_score(y_test, test_probabilities)

[1m2066/2066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


0.7708919844804819