In [None]:
!git clone https://github.com/DorBernsohn/Booking-Challenge.git

In [None]:
import sys

# Make the cloned repository importable so that `utils` and `config` resolve.
REPO_DIR = '/content/Booking-Challenge'
if REPO_DIR not in sys.path:  # keep the cell idempotent: no duplicate entries on re-run
    sys.path.append(REPO_DIR)

In [None]:
# external
import numpy as np
import pandas as pd

# internal
from utils import (load_data,
                   extract_features,
                   build_time_features,
                   build_prev_city,
                   build_first_city,
                   split_features_label,
                   flatten_features,
                   LabelEncoderMapping)
from config import train_path, test_path

In [None]:
# Read both splits with the same minimum-trip-length threshold, then stack them
# into a single frame (train rows first, test rows after).
df_train, df_test = (
    load_data(split_path, min_trip_length_threshold=4)
    for split_path in (train_path, test_path)
)
data = pd.concat([df_train, df_test], sort=False)

In [None]:
# Run feature extraction on a copy so `data` stays untouched, then separate
# the model inputs from the targets.
df = extract_features(data.copy())
features, labels = split_features_label(df)

In [None]:
# Keep only trips whose label has a non-zero city_id; the two shapes printed
# show the feature frame before and after the filter.
print(features.shape)
labels = labels[labels.city_id != 0]
labeled_trip_ids = labels.utrip_id.values
features = features[features.utrip_id.isin(labeled_trip_ids)]
print(features.shape)

In [None]:
# Index every row by (trip, position within the trip).
position_in_trip = features.groupby('utrip_id').cumcount()
df1 = features.set_index(['utrip_id', position_in_trip])

# Reindex onto the full trip x position grid; positions a trip does not have
# are created with the value 1 in every column.
mux = pd.MultiIndex.from_product(df1.index.levels, names=df1.index.names)
df2 = df1.reindex(mux, fill_value=1)
df2 = df2.reset_index(level=1, drop=True).reset_index()

# Collapse each trip into a single row whose cells are equal-length lists.
embeded_features = df2.groupby(['utrip_id']).agg(list).reset_index()

In [None]:
# Preview the per-trip frame: one row per utrip_id, with list-valued columns.
embeded_features.head()

In [None]:
import tensorflow as tf

# Identifier / target columns are not model inputs.
feature_frame = embeded_features.loc[:, ~embeded_features.columns.isin(['utrip_id', 'user_id', 'city_id', 'length'])]

# `embeded_features` comes out of a groupby on utrip_id, which orders rows by
# trip id. Look each trip's label up by key instead of relying on `labels`
# happening to be in that same order.
label_by_trip = labels.set_index('utrip_id')['city_id_encode']
assert label_by_trip.index.is_unique, 'expected exactly one label row per utrip_id'
aligned_labels = label_by_trip.loc[embeded_features['utrip_id']].values

# Each element is (dict of per-feature sequences, encoded target city), served in batches of 16.
dict_slices = tf.data.Dataset.from_tensor_slices(
    (feature_frame.to_dict('list'), aligned_labels)
).batch(16)

In [None]:
# Model-input columns: everything except identifiers and the target.
keys = embeded_features.columns[~embeded_features.columns.isin(['utrip_id', 'user_id', 'city_id', 'length'])]


def _embedding_spec(sequences):
    """Return (vocabulary size, suggested embedding width) for one list-valued column.

    The vocabulary size is the largest value found in any sequence plus one.
    The width is ``int(size / 4) + 4`` when the size exceeds 4, else the size itself.
    """
    vocab_size = max(value for sequence in sequences for value in sequence) + 1
    if vocab_size > 4:
        width = int(vocab_size / 4) + 4
    else:
        width = vocab_size
    return (vocab_size, width)


emb_map = {key: _embedding_spec(embeded_features[key]) for key in keys}
emb_map

In [None]:
# Model-input columns: everything except identifiers and the target.
feature_keys = embeded_features.loc[:, ~embeded_features.columns.isin(['utrip_id', 'user_id', 'city_id', 'length'])].keys()

# Every cell of a feature column is a list of the same (padded) length, so read
# the sequence length from the data rather than hard-coding it.
seq_len = len(embeded_features[feature_keys[0]].iloc[0])

# One integer-sequence input per feature. `shape` must be a tuple: `(47)` is just the int 47.
inputs = {key: tf.keras.layers.Input(shape=(seq_len,), name=key) for key in feature_keys}

# Embed each feature sequence with a 20-wide embedding.
# NOTE(review): emb_map[key][1] holds a per-feature width that is not used here — confirm 20 is intended.
embeddings = [
    tf.keras.layers.Embedding(int(emb_map[key][0]), 20, name=f'embedding_{key}')(inputs[key])
    for key in inputs
]

# Concatenate the per-feature embeddings, flatten, and classify with an MLP.
h = tf.keras.layers.Concatenate()(embeddings)
h = tf.keras.layers.Flatten()(h)
h = tf.keras.layers.Dense(1028, activation='relu')(h)  # NOTE(review): 1028 may be a typo for 1024 — left unchanged
h = tf.keras.layers.Dropout(0.5)(h)
h = tf.keras.layers.Dense(2048, activation='relu')(h)
h = tf.keras.layers.Dropout(0.5)(h)
h = tf.keras.layers.Dense(4096, activation='relu')(h)

# One output unit per encoded city id; cast to a plain int because the pandas max is a numpy scalar.
num_classes = int(labels['city_id_encode'].max()) + 1
h = tf.keras.layers.Dense(num_classes, activation='softmax')(h)

model_func = tf.keras.Model(inputs=inputs, outputs=h)
model_func.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                   metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy(k=4)])
model_func.summary()

In [None]:
# Show one batch: the feature dict, a short separator, then the labels.
for row, label in dict_slices.take(1):
  print(row, '-' * 4, label, sep='\n')

In [None]:
# Number of batches in the dataset. Ask tf.data for it instead of iterating over
# every batch; a negative value means the cardinality is not statically known,
# in which case fall back to counting.
labeled_all_length = int(dict_slices.cardinality().numpy())
if labeled_all_length < 0:
    labeled_all_length = sum(1 for _ in dict_slices)

train_size = int(0.8 * labeled_all_length)
val_test_size = int(0.1 * labeled_all_length)

# Sequential split by batch: first 80% train, next 10% test, remainder validation.
# Note: df_train / df_test are rebound here from DataFrames to tf.data datasets.
df_train = dict_slices.take(train_size)
held_out = dict_slices.skip(train_size)
df_test = held_out.take(val_test_size)
df_val = held_out.skip(val_test_size)

In [None]:
# Train for 20 epochs on the training batches, using df_val as validation data.
model_func.fit(df_train, validation_data=df_val, epochs=20)