In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout, Embedding, Input
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [42]:
fraction_of_data_to_use = 1
embedding_dim = 50  # Dimensionality of embedding layer
lstm_units = 100    # Number of units in LSTM layers
dropout_rate = 0.3  # Dropout rate for regularization
batch_size = 64
epochs = 10
n_splits = 5

In [43]:
# Load the datasets
fraction_of_data_to_use = 0.01
train_set = pd.read_csv('train_set.csv')
train_set = train_set.sample(frac=fraction_of_data_to_use)
test_set = pd.read_csv('test_set.csv')
test_set = test_set.sample(frac=fraction_of_data_to_use)

In [44]:
# Load the datasets
try:
    train_set = pd.read_csv('train_set.csv').sample(frac=fraction_of_data_to_use)
    test_set = pd.read_csv('test_set.csv').sample(frac=fraction_of_data_to_use)
except Exception as e:
    print(f"Failed to load data: {e}")
    exit()

In [45]:
# Preprocessing steps
train_set['checkin'] = pd.to_datetime(train_set['checkin'])
train_set['checkout'] = pd.to_datetime(train_set['checkout'])
test_set['checkin'] = pd.to_datetime(test_set['checkin'])
test_set['checkout'] = pd.to_datetime(test_set['checkout'])
train_set['stay_duration'] = (train_set['checkout'] - train_set['checkin']).dt.days
test_set['stay_duration'] = (test_set['checkout'] - test_set['checkin']).dt.days

In [46]:
# Fill NaN values with 'Unknown' to avoid errors during encoding
train_set.fillna('Unknown', inplace=True)
test_set.fillna('Unknown', inplace=True)

In [47]:
# Feature Engineering
trip_sequences = train_set.groupby('utrip_id').agg(list).reset_index()
X = trip_sequences['city_id'].apply(lambda x: x[:-1]).tolist()
y = trip_sequences['city_id'].apply(lambda x: x[-1]).tolist()
X_padded = pad_sequences(X, padding='post')

In [48]:
# Encode target labels to integers
city_encoder = LabelEncoder()
unique_cities = train_set['city_id'].unique()
y_encoded = city_encoder.fit_transform(unique_cities)
y_encoded = city_encoder.transform(y)

# Pad sequences to have the same length for model input
X_padded = pad_sequences(X, padding='post')

In [49]:
# Fit the LabelEncoder on all unique city IDs from both training and testing datasets
all_cities = np.concatenate([train_set['city_id'].unique(), test_set['city_id'].unique()])
all_cities = np.unique(all_cities)  # Ensure unique cities
city_encoder = LabelEncoder()
city_encoder.fit(all_cities)
y_encoded = city_encoder.transform(trip_sequences['city_id'].apply(lambda x: x[-1]))

# Check if any encoded label index exceeds the configured input_dim of the embedding layer
if np.any(y_encoded >= len(city_encoder.classes_)):
    raise ValueError("Encoded city index out of bounds of the embedding layer input dimension.")


In [50]:
# Model Configuration
model = Sequential([
    Embedding(input_dim=len(city_encoder.classes_), output_dim=embedding_dim, input_length=X_padded.shape[1]),
    Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate)),
    LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate),
    Dense(128, activation='relu'),
    Dropout(dropout_rate),
    Dense(len(city_encoder.classes_), activation='softmax')])



In [51]:
# Compile the model with optimizer, loss function, and metrics
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [52]:
# Cross-Validation
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_no = 1
for train_index, val_index in kf.split(X_padded):
    X_train, X_val = X_padded[train_index], X_padded[val_index]
    y_train, y_val = y_encoded[train_index], y_encoded[val_index]

    print(f'Training fold {fold_no}...')
    model_checkpoint_callback = ModelCheckpoint(
        filepath=f'model_fold_{fold_no}.weights.h5',
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True)

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), callbacks=[model_checkpoint_callback])
    fold_no += 1

Training fold 1...
Epoch 1/10
[1m  1/143[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m20:53[0m 9s/step - accuracy: 0.0000e+00 - loss: 8.4526

2024-06-01 09:30:41.392370: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[17,0] = 11757 is not in [0, 4687)
	 [[{{function_node __inference_one_step_on_data_42886}}{{node sequential_6_1/embedding_6_1/GatherV2}}]]


InvalidArgumentError: Graph execution error:

Detected at node sequential_6_1/embedding_6_1/GatherV2 defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_6585/1535412224.py", line 16, in <module>

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 314, in fit

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 117, in one_step_on_iterator

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 104, in one_step_on_data

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 51, in train_step

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/layers/layer.py", line 846, in __call__

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/ops/operation.py", line 48, in __call__

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/models/sequential.py", line 209, in call

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/models/functional.py", line 202, in call

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/ops/function.py", line 155, in _run_through_graph

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/models/functional.py", line 592, in call

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/layers/layer.py", line 846, in __call__

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/ops/operation.py", line 48, in __call__

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/layers/core/embedding.py", line 146, in call

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/ops/numpy.py", line 4850, in take

  File "/home/ajay/Documents/projects/RecSys-local/.venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/numpy.py", line 1940, in take

indices[17,0] = 11757 is not in [0, 4687)
	 [[{{node sequential_6_1/embedding_6_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_42995]