In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split, KFold

2024-05-31 01:40:14.385604: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the datasets
train_set = pd.read_csv('train_set.csv')
test_set = pd.read_csv('test_set.csv')

In [3]:
# Preprocessing steps
# Convert dates to datetime objects
train_set['checkin'] = pd.to_datetime(train_set['checkin'])
train_set['checkout'] = pd.to_datetime(train_set['checkout'])
test_set['checkin'] = pd.to_datetime(test_set['checkin'])
test_set['checkout'] = pd.to_datetime(test_set['checkout'])

In [4]:
# Calculate stay duration
train_set['stay_duration'] = (train_set['checkout'] - train_set['checkin']).dt.days
test_set['stay_duration'] = (test_set['checkout'] - test_set['checkin']).dt.days

In [5]:
# Fill NaN values with 'Unknown' to avoid errors during encoding
train_set.fillna('Unknown', inplace=True)
test_set.fillna('Unknown', inplace=True)

In [6]:
# Encode categorical variables
columns_to_encode = ['device_class', 'booker_country', 'hotel_country', 'affiliate_id']
combined = pd.concat([train_set, test_set])  # Combine for consistent encoding
for column in columns_to_encode:
    le = LabelEncoder()
    combined[column] = le.fit_transform(combined[column].astype(str))  # Convert all to string to avoid type issues

# Split combined back into train and test sets
train_set = combined.iloc[:len(train_set)]
test_set = combined.iloc[len(train_set):]

In [7]:
# Feature Engineering
# Group by 'utrip_id' to create sequences of visits within the same trip
trip_sequences = train_set.groupby('utrip_id').agg(list).reset_index()

# Extract features and targets for modeling
# Input (features): All cities visited in a trip except the last one
# Target (output): The last visited city, which we want to predict
X = trip_sequences['city_id'].apply(lambda x: x[:-1]).tolist()
y = trip_sequences['city_id'].apply(lambda x: x[-1]).tolist()

# Pad sequences to have the same length for model input
X_padded = pad_sequences(X, padding='post')

In [8]:
# Encode target labels to one-hot vectors
unique_cities = train_set['city_id'].unique()
city_encoder = LabelEncoder()
city_encoder.fit(unique_cities)
y_encoded = city_encoder.transform(y)
y_one_hot = to_categorical(y_encoded, num_classes=len(unique_cities))

MemoryError: Unable to allocate 64.7 GiB for an array with shape (217686, 39901) and data type float64

In [None]:
# Model Configuration
embedding_dim = 50  # Dimensionality of embedding layer
lstm_units = 100    # Number of units in LSTM layers
dropout_rate = 0.3  # Dropout rate for regularization

# Define the model architecture
model = Sequential([
    Embedding(input_dim=len(unique_cities), output_dim=embedding_dim, input_length=X_padded.shape[1]),
    Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate)),
    LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate),
    Dense(128, activation='relu'),
    Dropout(dropout_rate),
    Dense(len(unique_cities), activation='softmax')
])

In [None]:
# Compile the model with optimizer, loss function, and metrics
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Cross-Validation
# Initialize KFold for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
for train_index, val_index in kf.split(X_padded):
    X_train, X_val = X_padded[train_index], X_padded[val_index]
    y_train, y_val = y_one_hot[train_index], y_one_hot[val_index]
    
    print(f'Training fold {fold_no}...')
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))
    fold_no += 1