<a href="https://colab.research.google.com/github/AzadehZahedi/Deep-Learning/blob/main/Deep_Learning_Project1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1

When you want to travel to a city like Urmia for the Nowruz holidays, you probably enter a hotel booking site via mobile or laptop, via Google search, promotional SMS, or directly, and after creating an account, you search and enter your destination and check-in/check-out date. As a result, a list of hotels is shown to you, and finally, you compare them by clicking on different hotels and maybe (🙃) finally book one of them.
In this project, after preprocessing the data and solving the challenges related to it, we trained a model that can predict whether a user will book the viewed hotel or not, based on user search information and other related features. In this way, appropriate decisions can be made in real time for each user, based on the booking forecast, such as offering discounts or suggesting other hotels.

In [None]:
import numpy as np
import pandas as pd

In [None]:
import gc
# Inspect the garbage collector's current collection thresholds.
# gc.get_threshold() returns them as a (threshold0, threshold1, threshold2) tuple.
thresholds = gc.get_threshold()
print(thresholds)

(700, 10, 10)


In [None]:
# Run a full collection now; the value shown as the cell output is the
# number of unreachable objects the collector found.
gc.collect()

0

**Read data**

In [None]:
!pip install --upgrade --no-cache-dir gdown # gdown is library to read file in google drive



In [None]:
import gdown

# Step 1: download the dataset archive from Google Drive.
# Shell equivalent: !gdown 1jIl8uXsrdRz3ZBxljjQNo0ssDT8LbJBx
dataset_url = 'https://drive.google.com/uc?id=1jIl8uXsrdRz3ZBxljjQNo0ssDT8LbJBx'
archive_name = 'will_not_travel_again_data.zip'
gdown.download(dataset_url, archive_name, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1jIl8uXsrdRz3ZBxljjQNo0ssDT8LbJBx
From (redirected): https://drive.google.com/uc?id=1jIl8uXsrdRz3ZBxljjQNo0ssDT8LbJBx&confirm=t&uuid=f2ab3855-4f30-4016-91ce-c91e9c8fdd6a
To: /content/will_not_travel_again_data.zip
100%|██████████| 188M/188M [00:09<00:00, 19.1MB/s]


'will_not_travel_again_data.zip'

In [None]:
# unzip data
import zipfile
import os

extract_dir = '/content/will_not_travel_again_data'

# Step 2: unpack every member of the downloaded archive into extract_dir
with zipfile.ZipFile('will_not_travel_again_data.zip', mode='r') as archive:
    archive.extractall(path=extract_dir)

# Step 3: record the top-level entries that were extracted
extracted_files = os.listdir(extract_dir)

In [None]:
# Load the training split and preview its first rows
train_df = pd.read_csv('/content/will_not_travel_again_data/data/train.csv')
train_df.head()

Unnamed: 0,user,user_location_country,user_location_region,user_location_city,destination_distance,search_date,is_mobile,is_package,channel,search_count,...,n_adults,n_children,n_rooms,destination,destination_type,hotel_continent,hotel_country,hotel_market,hotel_category,is_booking
0,461899,3,50,5703,,2013-01-07 00:00:02,0,0,9,1,...,2,1,1,669,3,2,50,212,41,0
1,13796,66,174,21177,5713.6206,2013-01-07 00:00:06,0,0,9,3,...,1,0,1,8821,1,6,17,30,58,0
2,1128575,205,155,14703,795.7298,2013-01-07 00:00:06,0,0,9,1,...,1,0,1,25064,6,2,50,1230,91,0
3,1080476,69,761,41949,,2013-01-07 00:00:17,0,1,9,1,...,2,0,1,7635,3,2,50,675,10,0
4,1080476,69,761,41949,,2013-01-07 00:00:23,0,1,9,1,...,2,0,1,7635,3,2,50,675,10,0


In [None]:
# Load the test split and preview its first rows
test_df = pd.read_csv('/content/will_not_travel_again_data/data/test.csv')
test_df.head()

Unnamed: 0.1,Unnamed: 0,user,user_location_country,user_location_region,user_location_city,destination_distance,search_date,is_mobile,is_package,channel,...,checkOut_date,n_adults,n_children,n_rooms,destination,destination_type,hotel_continent,hotel_country,hotel_market,hotel_category
0,0,956535,215,817,19599,,2014-12-01 12:23:32,0,0,9,...,2014-12-14,4,0,1,1031,1,2,50,1107,39
1,1,1099321,66,321,47535,7322.6527,2014-12-01 13:00:29,0,0,0,...,2014-12-13,1,0,1,8796,1,3,104,1003,57
2,2,993646,109,0,21903,,2014-12-01 16:04:55,0,1,9,...,2014-12-19,1,0,1,8253,1,6,70,19,70
3,3,544256,66,174,50284,,2014-12-01 12:31:22,0,0,9,...,2014-12-05,2,0,2,8279,1,2,50,1230,21
4,4,26542,215,646,51733,2045.2943,2014-12-01 11:29:26,0,0,9,...,2014-12-19,2,0,1,59198,3,4,119,2064,53


**Data preprocessing**



In [None]:
# Drop the 'user' column from both sets.
# The test frame additionally has an 'Unnamed: 0' column (see its preview),
# which is removed here as well so both frames share the same feature columns.
train = train_df.drop(columns=['user'])
test = test_df.drop(columns=['Unnamed: 0', 'user'])

In [None]:
# Count the missing values of every column, train first and then test
for frame in (train, test):
    print(frame.isna().sum())

user_location_country          0
user_location_region           0
user_location_city             0
destination_distance     2727207
search_date                    0
is_mobile                      0
is_package                     0
channel                        0
search_count                   0
checkIn_date               14292
checkOut_date              14292
n_adults                       0
n_children                     0
n_rooms                        0
destination                    0
destination_type               0
hotel_continent                0
hotel_country                  0
hotel_market                   0
hotel_category                 0
is_booking                     0
dtype: int64
user_location_country        0
user_location_region         0
user_location_city           0
destination_distance     98670
search_date                  0
is_mobile                    0
is_package                   0
channel                      0
search_count                 0
checkIn_date   

Handle missing values

In [None]:
# Among the rows whose checkIn_date / checkOut_date is missing, measure the
# share that has is_booking == 0.
checkin_is_missing = train['checkIn_date'].isnull()
checkout_is_missing = train['checkOut_date'].isnull()

missing_checkin = checkin_is_missing.sum()
missing_checkout = checkout_is_missing.sum()

# Series.get(key, default): falls back to 0 when no such row has is_booking == 0
checkin_target_0 = train.loc[checkin_is_missing, 'is_booking'].value_counts().get(0, 0)
checkout_target_0 = train.loc[checkout_is_missing, 'is_booking'].value_counts().get(0, 0)

checkin_ratio = checkin_target_0 / missing_checkin
checkout_ratio = checkout_target_0 / missing_checkout

In [None]:
# checkIn_date / checkOut_date
# If (almost) every row with a missing date is a non-booking, those rows are
# dropped; otherwise the gap is filled with the most frequent date.
# checkOut_date gets the same treatment, driven by checkout_ratio, so that no
# missing check-out date survives into the date-derived features.
for date_col, zero_ratio in (('checkIn_date', checkin_ratio),
                             ('checkOut_date', checkout_ratio)):
    if zero_ratio >= 0.99:
        train = train.dropna(subset=[date_col])
    else:
        # Assign the result back: fillna(..., inplace=True) on a column
        # selection does not reliably update `train`.
        train[date_col] = train[date_col].fillna(train[date_col].mode()[0])


The `destination_distance` column holds the physical distance between the user and the hotel at the time of the search. Since the distance between two cities is constant (or at least stays within a narrow range), we can use `user_location_city` and `destination` — the city IDs of the user and of the searched hotel — to fill in the missing values: every row whose `destination_distance` is missing is filled with the average `destination_distance` of the rows that have the same `user_location_city` and `destination`.

In [None]:
# Number of train rows whose destination_distance is missing
missing_destination_distance = train['destination_distance'].isnull().sum()

In [None]:
# For every row, the mean destination_distance of that row's
# (user_location_city, destination) group.
#
# transform('mean') returns a Series with the same length (and the same index)
# as `train`: each value is the mean of the group the row belongs to.
#
# By contrast, .mean() would return one value per unique
# (user_location_city, destination) combination, indexed by a MultiIndex, and
# would therefore be shorter than `train`.
mean_distance = train.groupby(['user_location_city', 'destination'])['destination_distance'].transform('mean')

In [None]:
# Fill missing distances with the mean distance of the same
# (user_location_city, destination) pair.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the column selection acts on an intermediate object, which pandas warns
# about and which stops updating `train` in pandas 3.0.
train['destination_distance'] = train['destination_distance'].fillna(mean_distance)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['destination_distance'].fillna(mean_distance, inplace=True)


In [None]:
# Fill missing test distances with the mean distance observed in train for the
# same (user_location_city, destination) pair.
#
# `mean_distance` cannot be used for this: it is indexed by train's row labels,
# so fillna would align it by row label and fill a test row with the value of
# whichever train row happens to share its label. The lookup has to be keyed
# on the city pair instead.
pair_keys = ['user_location_city', 'destination']
pair_mean_distance = (
    train.groupby(pair_keys)['destination_distance']
    .mean()
    .rename('pair_mean_distance')
)
# join(on=...) keeps test's own index, so the looked-up values line up row by
# row; pairs that never occur in train (or have no known distance) stay NaN.
test_mean_distance = test[pair_keys].join(pair_mean_distance, on=pair_keys)['pair_mean_distance']
test['destination_distance'] = test['destination_distance'].fillna(test_mean_distance)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['destination_distance'].fillna(mean_distance, inplace=True)


In [None]:
# Report how many distances are still missing in each set
for frame in (train, test):
    print(frame['destination_distance'].isna().sum())

2537339
33277


For the values that are still missing, we use a constant such as 0, which does not otherwise occur in this column.

In [None]:
# Show the rows whose distance is exactly 0, to check whether 0 already occurs
train.loc[train['destination_distance'].eq(0)]

Unnamed: 0,user_location_country,user_location_region,user_location_city,destination_distance,search_date,is_mobile,is_package,channel,search_count,checkIn_date,...,n_adults,n_children,n_rooms,destination,destination_type,hotel_continent,hotel_country,hotel_market,hotel_category,is_booking


In [None]:
# Remaining gaps are filled with the placeholder value 0.
# The result is assigned back to the column: fillna(..., inplace=True) on a
# column selection acts on an intermediate object, which pandas warns about
# and which stops updating the frame in pandas 3.0.
train['destination_distance'] = train['destination_distance'].fillna(0)
test['destination_distance'] = test['destination_distance'].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['destination_distance'].fillna(0, inplace=True) #TODO
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['destination_distance'].fillna(0, inplace=True) #TODO


Date type data

In [None]:
# Parse search_date, checkIn_date and checkOut_date into datetime values
# in both the train and the test frame.
time_columns = ['search_date', 'checkIn_date', 'checkOut_date']
for frame in (train, test):
    for col in time_columns:
        frame[col] = pd.to_datetime(frame[col])

In [None]:
# Days of stay: whole days from check-in to check-out
train['duration'] = (train['checkOut_date'] - train['checkIn_date']).dt.days
test['duration'] = (test['checkOut_date'] - test['checkIn_date']).dt.days

In [None]:
# Lead time: whole days from the search timestamp to the check-in date
train_lead_time = train['checkIn_date'] - train['search_date']
train['days_between'] = train_lead_time.dt.days

test_lead_time = test['checkIn_date'] - test['search_date']
test['days_between'] = test_lead_time.dt.days

In [None]:
def _add_calendar_features(frame):
    """Add hour / weekday / year / month columns derived from the date columns.

    Mutates ``frame`` in place. The columns are always created in the same
    order, so train and test end up with an identical column layout.
    """
    search = frame['search_date']
    check_in = frame['checkIn_date']

    frame['search_date_hour'] = search.dt.hour
    frame['search_date_dayofweek'] = search.dt.day_of_week
    frame['checkIn_date_dayofweek'] = check_in.dt.day_of_week

    frame['search_date_year'] = search.dt.year
    frame['search_date_month'] = search.dt.month
    frame['checkIn_date_year'] = check_in.dt.year
    frame['checkIn_date_month'] = check_in.dt.month


_add_calendar_features(train)
_add_calendar_features(test)

Now, to examine the relationship between the features we obtained in the previous step and the target variable (whether the search resulted in a hotel reservation), create two data frames based on the training dataset, so that the first data frame includes only the examples that resulted in a reservation and the second data frame includes only the examples that did not result in a reservation.

In [None]:
# Split the training rows by outcome: searches that ended in a booking
# and searches that did not.
is_booked = train.loc[train['is_booking'].eq(1)]
not_booked = train.loc[train['is_booking'].eq(0)]

In [None]:
# Save the DataFrames as CSV files
for frame, csv_path in (
    (train, '/content/drive/My Drive/saved_data.csv'),
    (test, '/content/drive/My Drive/saved_testdata.csv'),
):
    frame.to_csv(csv_path, index=False)

**Visualization**

A bar chart comparing, for each hour of the day, the share of searches that resulted in a hotel reservation with the share that did not (each share is computed relative to its own category).

In [None]:
# Share of each group's searches made in each hour of the day
# (each count is divided by the size of its own group).
hourly_search_percentage_notbooked = (
    not_booked['search_date_hour'].value_counts().sort_index().div(len(not_booked))
)
hourly_search_percentage_booked = (
    is_booked['search_date_hour'].value_counts().sort_index().div(len(is_booked))
)

In [None]:
import plotly.graph_objects as go

# Bar chart of the hourly shares, one trace per outcome
trace_not_booked = go.Bar(y=hourly_search_percentage_notbooked, name='Not Booked')
trace_is_booked = go.Bar(y=hourly_search_percentage_booked, name='Booked')

# Booked is drawn first, Not Booked second
data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis={'title': 'Search Hour', 'tickangle': 45, 'automargin': True},
    yaxis={'title': 'Frequency'},
)

fig = go.Figure(data=data, layout=layout)
fig.show()
# Export the figure for the submission archive
fig.write_json('./search_hour.json')

A bar chart for the check in day

In [None]:
# Share of each group's searches per check-in weekday
# (each count is divided by the size of its own group).
daily_search_percentage_notbooked = (
    not_booked['checkIn_date_dayofweek'].value_counts().sort_index().div(len(not_booked))
)
daily_search_percentage_booked = (
    is_booked['checkIn_date_dayofweek'].value_counts().sort_index().div(len(is_booked))
)

In [None]:
# Bar chart of the check-in weekday shares, one trace per outcome
trace_not_booked = go.Bar(y=daily_search_percentage_notbooked, name='Not Booked')
trace_is_booked = go.Bar(y=daily_search_percentage_booked, name='Booked')

# Persian weekday names used as tick labels, Monday first
# (pandas day_of_week numbers Monday as 0).
ticktext = ['دوشنبه', 'سه‌شنبه', 'چهارشنبه', 'پنج‌شنبه', 'جمعه', 'شنبه', 'یکشنبه']

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis={
        'title': 'Day of Week',
        'tickangle': 45,
        'automargin': True,
        'tickvals': list(range(7)),
        'ticktext': ticktext,
    },
    yaxis={'title': 'Frequency'},
)

fig = go.Figure(data=data, layout=layout)
fig.show()
# Export the figure for the submission archive
fig.write_json('./checkIn_day.json')

A bar chart for the month of check in

In [None]:
# Share of each group's searches per check-in month
# (each count is divided by the size of its own group).
monthly_search_percentage_notbooked = (
    not_booked['checkIn_date_month'].value_counts().sort_index().div(len(not_booked))
)
monthly_search_percentage_booked = (
    is_booked['checkIn_date_month'].value_counts().sort_index().div(len(is_booked))
)

In [None]:
# Bar chart of the check-in month shares, one trace per outcome.
# NOTE(review): the traces carry no explicit x, so the tick positions 0..11
# line up with Jan..Dec only if all twelve months occur in both groups - confirm.
trace_not_booked = go.Bar(y=monthly_search_percentage_notbooked, name='Not Booked')
trace_is_booked = go.Bar(y=monthly_search_percentage_booked, name='Booked')

data = [trace_is_booked, trace_not_booked]

ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

layout = go.Layout(
    xaxis={
        'title': 'Month',
        'tickangle': 45,
        'automargin': True,
        'ticktext': ticktext,
        'tickvals': np.arange(0, 12),
    },
    yaxis={'title': 'Frequency'},
)

fig = go.Figure(data=data, layout=layout)
fig.show()
# Export the figure for the submission archive
fig.write_json('./checkIn_date_month.json')

A line graph for the time interval between search and check in.

In [None]:
# Share of each group's searches per lead time (days between search and check-in);
# each count is divided by the size of its own group.
days_search_percentage_notbooked = (
    not_booked['days_between'].value_counts().sort_index().div(len(not_booked))
)
days_search_percentage_booked = (
    is_booked['days_between'].value_counts().sort_index().div(len(is_booked))
)

In [None]:
# Line chart of the lead-time shares, one trace per outcome.
# x is set to each Series' index (the actual number of days). Without it the
# points are placed at positions 0..n-1, so the axis would not show day counts
# and the two traces would be shifted against each other whenever the groups
# do not contain exactly the same set of day values.
trace_not_booked = go.Scatter(x=days_search_percentage_notbooked.index,
                              y=days_search_percentage_notbooked,
                              name='Not Booked', opacity=0.5)
trace_is_booked = go.Scatter(x=days_search_percentage_booked.index,
                             y=days_search_percentage_booked,
                             name='Booked')

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Days between search and checking time', tickangle=45, automargin=True),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
# Export the figure for the submission archive
fig.write_json('./days_between.json')

A line graph for the duration of stay

In [None]:
# Share of each group's searches per length of stay (in days);
# each count is divided by the size of its own group.
stay_search_percentage_notbooked = (
    not_booked['duration'].value_counts().sort_index().div(len(not_booked))
)
stay_search_percentage_booked = (
    is_booked['duration'].value_counts().sort_index().div(len(is_booked))
)

In [None]:
# Line chart of the length-of-stay shares, one trace per outcome.
# x is set to each Series' index (the actual number of nights). Without it the
# points are placed at positions 0..n-1, so the axis would not show durations
# and the two traces would be shifted against each other whenever the groups
# do not contain exactly the same set of durations.
trace_not_booked = go.Scatter(x=stay_search_percentage_notbooked.index,
                              y=stay_search_percentage_notbooked,
                              name='Not Booked', opacity=0.5)
trace_is_booked = go.Scatter(x=stay_search_percentage_booked.index,
                             y=stay_search_percentage_booked,
                             name='Booked')

data = [trace_is_booked, trace_not_booked]

layout = go.Layout(
    xaxis=dict(title='Length of Stay', tickangle=45, automargin=True),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=data, layout=layout)
fig.show()
# Export the figure for the submission archive
fig.write_json('./los.json')

Foreign or Domestic?

In [None]:
# TODO: Add is_abroad column
import numpy as np
# is_abroad is 1 when the user's country differs from the hotel's country, else 0
train_is_foreign = train['user_location_country'].ne(train['hotel_country'])
train['is_abroad'] = np.where(train_is_foreign, 1, 0)

test_is_foreign = test['user_location_country'].ne(test['hotel_country'])
test['is_abroad'] = np.where(test_is_foreign, 1, 0)

In [None]:
# TODO: Preprocessing (Make One-hotted Columns) (Optional)
#dummy_columns = ['channel', 'destination_type', 'hotel_category']

#for col in dummy_columns:
#    print("column:", col, "unique_values:", len(train[col].unique()))
#    print("column:", col, "unique_values:", len(train[col].unique()))

In [None]:
#!pip install category_encoders

In [None]:
#import category_encoders as ce
#binary_encoder = ce.BinaryEncoder()

#train_binary_channel = binary_encoder.fit_transform(train['channel'])
#test_binary_channel = binary_encoder.transform(test['channel'])

#train_binary_destination_type = binary_encoder.fit_transform(train['destination_type'])
#test_binary_destination_type = binary_encoder.transform(test['destination_type'])

#train_binary_hotel_category = binary_encoder.fit_transform(train['hotel_category'])
#test_binary_hotel_category = binary_encoder.transform(test['hotel_category'])

In [None]:
#train = pd.concat([train, train_binary_channel, train_binary_destination_type, train_binary_hotel_category], axis = 1)
#test = pd.concat([test, test_binary_channel, test_binary_destination_type, test_binary_hotel_category], axis = 1)

In [None]:
# Preprocessing: drop the columns that are not passed on to the model.
# 'channel', 'destination_type' and 'hotel_category' are intentionally kept.
columns_to_del = [
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'checkIn_date',
    'checkOut_date',
    'hotel_continent',
    'hotel_country',
    'hotel_market',
    'search_date',
]

train = train.drop(columns_to_del, axis=1)
test = test.drop(columns_to_del, axis=1)

In [None]:
# Standardize every feature column (all columns of `test`; the target is not
# among them) using the mean and standard deviation measured on train only,
# and apply the same statistics to test.
# Done column by column so that only one temporary column exists at a time.
columns = test.columns
for column in columns:
    mean = train[column].mean()
    std = train[column].std()
    if std == 0:
        # A constant column has zero spread; dividing by 0 would turn it into
        # NaN/inf. Such a column is only centered.
        std = 1.0
    train[column] = (train[column] - mean) / std
    test[column] = (test[column] - mean) / std

In [None]:
# Class balance of the target: number of rows per is_booking value
train['is_booking'].value_counts()

Unnamed: 0_level_0,count
is_booking,Unnamed: 1_level_1
0,6946841
1,661867


In [None]:
# Separate the feature matrix from the target column
target_column = 'is_booking'
y = train[target_column]
X = train.drop(columns=[target_column])

In [None]:
# Balance the data by randomly under-sampling class 0 (no booking) down to
# 700,000 rows; classes not listed in sampling_strategy are left as they are.
# random_state is fixed so that the sampled subset, and every result that
# depends on it, is reproducible across runs.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy={0: 700000}, random_state=42)
X_ru, y_ru = rus.fit_resample(X, y)

#from imblearn.under_sampling import NearMiss
#undersample = NearMiss(version=3, n_neighbors=3)
#X_ru, y_ru = undersample.fit_resample(X, y)

#from imblearn.over_sampling import SMOTE
#oversample = SMOTE(sampling_strategy='minority', k_neighbors=3, random_state=42)
#X_ru, y_ru = oversample.fit_resample(X, y)

In [None]:
# Class balance after under-sampling
y_ru.value_counts()

Unnamed: 0_level_0,count
is_booking,Unnamed: 1_level_1
0,700000
1,661867


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled rows for validation (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_ru, y_ru, test_size=0.2, random_state=123
)

# Report the size of every split
for label, sizes in (
    ('Train examples:', (len(X_train), len(y_train))),
    ('Validation examples:', (len(X_valid), len(y_valid))),
    ('Test examples:', (len(test),)),
):
    print(label, *sizes)

Train examples: 1089493 1089493
Validation examples: 272374 272374
Test examples: 325821


**Modeling**

In [None]:
#import pandas as pd
#train = pd.read_csv('/content/drive/My Drive/saved_data.csv')
#test = pd.read_csv('/content/drive/My Drive/saved_testdata.csv')

In [None]:
!pip install keras --upgrade
# Uncomment the above line if you don't have keras 3 installed

Collecting keras
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Downloading keras-3.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.5.0
    Uninstalling keras-3.5.0:
      Successfully uninstalled keras-3.5.0
Successfully installed keras-3.6.0


In [None]:
import os
# Select TensorFlow as the Keras backend; set here, before keras is imported
os.environ["KERAS_BACKEND"] = "tensorflow"

In [None]:
import keras
# Show which Keras version is in use
print(keras.__version__)

3.6.0


In [None]:
# Model design: a fully connected binary classifier with three SELU hidden
# layers (400, 400, 100 units), each followed by layer normalization, and a
# single sigmoid output unit.
# LayerNormalization normalizes the activations of each sample across its
# features (it is not batch normalization).

# Input width is taken from the training data instead of being hard-coded,
# so the model keeps matching the features if the preprocessing changes.
n_features = X_train.shape[1]
hidden_units = (400, 400, 100)

model = keras.Sequential()
model.add(keras.layers.Input(shape=(n_features,)))

for units in hidden_units:
    # LecunNormal is the initializer paired with the SELU activation here;
    # every kernel carries a combined L1/L2 weight penalty.
    model.add(keras.layers.Dense(units,
                                 activation='selu',
                                 kernel_initializer=keras.initializers.LecunNormal(seed=123),
                                 kernel_regularizer=keras.regularizers.L1L2(l1=3e-5, l2=3e-4)))
    model.add(keras.layers.LayerNormalization())

# Output: one unit with a sigmoid activation
model.add(keras.layers.Dense(1,
                             activation='sigmoid',
                             kernel_regularizer=keras.regularizers.L1L2(l1=3e-5, l2=3e-4)))


In [None]:
# Compile the model: Adam optimizer, binary cross-entropy computed on the
# sigmoid outputs (from_logits=False), and AUC as the tracked metric.
bce_loss = keras.losses.BinaryCrossentropy(from_logits=False)
auc_metric = keras.metrics.AUC()
model.compile(optimizer='adam', loss=bce_loss, metrics=[auc_metric])

In [None]:
# Train the model

EPochs = 100
BATCH_SIZE = 32

# Save the model to disk, keeping only the best one seen so far.
checkpoint_cb = keras.callbacks.ModelCheckpoint(filepath="/content/drive/My Drive/my_keras_model.keras", save_best_only=True)
# Stop once val_loss has not improved for 10 epochs.
# restore_best_weights=True rolls the in-memory model back to its best epoch
# when training stops early; without it the model used for the predictions
# below would carry the weights of the last epoch, which is up to `patience`
# epochs past the best one.
early_stopping_cb = keras.callbacks.EarlyStopping(monitor='val_loss', mode='auto', patience=10,
                                                  restore_best_weights=True)

history = model.fit(X_train, y_train,
                    epochs=EPochs, batch_size=BATCH_SIZE,
                    validation_data=(X_valid, y_valid),
                    callbacks=[checkpoint_cb, early_stopping_cb])

Epoch 1/100
[1m34047/34047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 3ms/step - auc: 0.7422 - loss: 0.6625 - val_auc: 0.7564 - val_loss: 0.5700
Epoch 2/100
[1m34047/34047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 3ms/step - auc: 0.7552 - loss: 0.5701 - val_auc: 0.7571 - val_loss: 0.5677
Epoch 3/100
[1m34047/34047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 3ms/step - auc: 0.7567 - loss: 0.5684 - val_auc: 0.7582 - val_loss: 0.5667
Epoch 4/100
[1m34047/34047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 3ms/step - auc: 0.7565 - loss: 0.5682 - val_auc: 0.7585 - val_loss: 0.5670
Epoch 5/100
[1m34047/34047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 3ms/step - auc: 0.7584 - loss: 0.5670 - val_auc: 0.7576 - val_loss: 0.5666
Epoch 6/100
[1m34047/34047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 3ms/step - auc: 0.7580 - loss: 0.5674 - val_auc: 0.7578 - val_loss: 0.5670
Epoch 7/100
[1m34047/34047[0m [32m━━━━━━━━━━━━━

In [None]:
from sklearn.metrics import roc_auc_score

# Predict on the validation set and score the predictions with ROC AUC
y_pred = model.predict(X_valid, batch_size=BATCH_SIZE)
auc = roc_auc_score(y_valid, y_pred)

[1m8512/8512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step


In [None]:
# Report the validation AUC.
# The score was already computed into `auc` by the preceding cell, so it is
# printed here rather than imported and calculated a second time.
print("AUC Score:", auc)

AUC Score: 0.758474704741942


In [None]:
# Run the trained model on the preprocessed test frame
predictions = model.predict(test, batch_size=BATCH_SIZE)

[1m10182/10182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step


In [None]:
# Prepare the submission frame: a single 'prediction' column that holds the
# model output for every test row.
submission = pd.DataFrame({'prediction': predictions[:, 0]})
submission.head()

Unnamed: 0,prediction
0,0.514805
1,0.723857
2,0.631785
3,0.805812
4,0.525902


In [None]:
import zipfile
import joblib
import os

# If no file named will_not_travel_again.ipynb exists in the working directory,
# create one with the IPython %notebook magic (which writes the session's
# input history to a notebook file).
if not os.path.exists(os.path.join(os.getcwd(), 'will_not_travel_again.ipynb')):
    %notebook -e will_not_travel_again.ipynb

def compress(file_names, archive_name="result.zip"):
    """Bundle files from the current directory into a deflate-compressed ZIP.

    Args:
        file_names: names of the files to add, relative to the current working
            directory; each one is stored in the archive under the same name.
        archive_name: path of the ZIP file to create (overwritten if it already
            exists). Defaults to "result.zip".

    Raises:
        FileNotFoundError: if any listed file does not exist. This is checked
            before the archive is opened, so a failed call does not leave a
            partially written ZIP behind.
    """
    print("File Paths:")
    print(file_names)

    missing = [name for name in file_names if not os.path.exists('./' + name)]
    if missing:
        raise FileNotFoundError(f"Cannot compress, missing files: {missing}")

    compression = zipfile.ZIP_DEFLATED
    with zipfile.ZipFile(archive_name, mode="w") as zf:
        for file_name in file_names:
            zf.write('./' + file_name, file_name, compress_type=compression)

# Write the predictions to submission.csv, then bundle the notebook, the
# submission and the exported figure JSON files into result.zip.
submission.to_csv('submission.csv', index=False)
file_names = ['will_not_travel_again.ipynb', 'submission.csv',
              'search_hour.json', 'checkIn_date_month.json',
              'checkIn_day.json', 'days_between.json', 'los.json']
compress(file_names)

File Paths:
['will_not_travel_again.ipynb', 'submission.csv', 'search_hour.json', 'checkIn_date_month.json', 'checkIn_day.json', 'days_between.json', 'los.json']
