#### We'll will compare the following models for their accuracy and precision. 

1. Collaborative Filtering (Item-Based)
2. Markov Chains
3. Random Forest
4. Gradient Boosting Machine
5. Recurrent Neural Networks (RNN) - LSTM

To get us started we will set up the preprocessing and helper functions that will be used by all models.

In [3]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import pickle 
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from dask import delayed, compute
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from dask.diagnostics import ProgressBar




2024-05-26 00:30:29.005665: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Define paths for checkpoints
train_data_path = 'train_data.pkl'
test_data_path = 'test_data.pkl'
processed_data_path = 'processed_data.pkl'
models_predictions_path = 'models_predictions.pkl'

In [5]:
# Function to save data
def save_data(data, path):
    with open(path, 'wb') as f:
        pickle.dump(data, f)

# Function to load data
def load_data(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [6]:
# Check if preprocessed data exists
try:
    train_data, test_data, train_sequences, test_sequences, encoder, X_train, y_train, X_test, y_test = load_data(processed_data_path)
    print("Loaded preprocessed data from checkpoint.")
except FileNotFoundError:
    print("Preprocessed data not found. Running preprocessing steps...")

    # Load the datasets using Dask
    train_data = dd.read_csv('train_set.csv')
    test_data = dd.read_csv('test_set.csv')

    # Sample a subset of the data (optional, adjust frac as needed)
    fraction_of_data_to_use = 0.1  # Adjust this value to suit your needs
    train_data = train_data.sample(frac=fraction_of_data_to_use)
    test_data = test_data.sample(frac=fraction_of_data_to_use)

    # Ensure 'checkin' is in string format
    train_data['checkin'] = train_data['checkin'].astype(str)
    test_data['checkin'] = test_data['checkin'].astype(str)

    # Create a new column that combines 'utrip_id' and 'checkin'
    train_data['utrip_id_checkin'] = train_data['utrip_id'].astype(str) + '_' + train_data['checkin']
    test_data['utrip_id_checkin'] = test_data['utrip_id'].astype(str) + '_' + test_data['checkin']

    # Set the new column as index (if necessary for your logic)
    # Note: This may not be necessary for the overall logic.
    # train_data = train_data.set_index('utrip_id_checkin')
    # test_data = test_data.set_index('utrip_id_checkin')

    # Create a city_country column
    train_data['city_country'] = train_data['city_id'].astype(str) + '_' + train_data['hotel_country'].astype(str)
    test_data['city_country'] = test_data['city_id'].astype(str) + '_' + test_data['hotel_country'].astype(str)

    # Handle missing values
    train_data['city_country'] = train_data['city_country'].fillna('missing')
    test_data['city_country'] = test_data['city_country'].fillna('missing')

    # Convert city_country to category type for efficient encoding
    train_data = train_data.categorize(columns=['city_country'])
    test_data = test_data.categorize(columns=['city_country'])

    # Group by utrip_id to create sequences
    with ProgressBar():
        train_sequences = train_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()
        test_sequences = test_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()

    # Encode city_country strings as integers
    all_sequences = train_sequences + test_sequences
    all_cities_countries = [city_country for seq in all_sequences for city_country in seq]
    encoder = LabelEncoder()
    encoder.fit(all_cities_countries)

    encoded_train_sequences = [encoder.transform(seq).tolist() for seq in train_sequences]
    encoded_test_sequences = [encoder.transform(seq).tolist() for seq in test_sequences]

    # Prepare data for training models
    def prepare_data(sequences, sequence_length=None):
        if sequence_length is None:
            sequence_length = max(len(seq) for seq in sequences)
        X, y = [], []
        for seq in tqdm(sequences, desc="Preparing data"):
            for i in range(1, len(seq)):
                X.append(seq[:i])
                y.append(seq[i])
        X = pad_sequences(X, maxlen=sequence_length, padding='pre')
        y = np.array(y)
        return X, y

    X_train, y_train = prepare_data(encoded_train_sequences)
    X_test, y_test = prepare_data(encoded_test_sequences, sequence_length=X_train.shape[1])

    # Save preprocessed data to checkpoint
    save_data((train_data, test_data, train_sequences, test_sequences, encoder, X_train, y_train, X_test, y_test), processed_data_path)
    print("Saved preprocessed data to checkpoint.")

    # Print shapes to verify the data preparation
    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_test shape: {y_test.shape}")

Preprocessed data not found. Running preprocessing steps...
[                                        ] | 0% Completed | 208.67 us

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  train_sequences = train_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()


[########################################] | 100% Completed | 7.44 sms
[                                        ] | 0% Completed | 308.59 us

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  test_sequences = test_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()


[########################################] | 100% Completed | 2.35 sms


In [None]:
# Print the first 5 rows of X_train and y_train
print("First 5 rows of X_train:")
print(X_train[:5])

print("\nFirst 5 elements of y_train:")
print(y_train[:5])

# Convert to Dask DataFrame for better readability
X_train_dd = dd.from_pandas(pd.DataFrame(X_train), npartitions=5)
y_train_dd = dd.from_pandas(pd.Series(y_train, name='Target'), npartitions=5)

X_test_dd = dd.from_pandas(pd.DataFrame(X_test), npartitions=5)
y_test_dd = dd.from_pandas(pd.Series(y_test, name='Target'), npartitions=5)

# Display the first 5 rows
print("\nFirst 5 rows of X_train (as DataFrame):")
print(X_train_dd.head())

print("\nFirst 5 elements of y_train (as DataFrame):")
print(y_train_dd.head())

print("\nFirst 5 rows of X_test (as DataFrame):")
print(X_test_dd.head())

print("\nFirst 5 elements of y_test (as DataFrame):")
print(y_test_dd.head())

First 5 rows of X_train:
[[    0     0     0     0     0     0     0 14852]
 [    0     0     0     0     0     0 14852 16088]
 [    0     0     0     0     0     0     0 11497]
 [    0     0     0     0     0     0 11497 11184]
 [    0     0     0     0     0     0     0  4523]]

First 5 elements of y_train:
[16088 14852 11184  1718  4611]

First 5 rows of X_train (as DataFrame):
   0  1  2  3  4  5      6      7
0  0  0  0  0  0  0      0  14852
1  0  0  0  0  0  0  14852  16088
2  0  0  0  0  0  0      0  11497
3  0  0  0  0  0  0  11497  11184
4  0  0  0  0  0  0      0   4523

First 5 elements of y_train (as DataFrame):
0    16088
1    14852
2    11184
3     1718
4     4611
Name: Target, dtype: int64

First 5 rows of X_test (as DataFrame):
   0  1  2  3  4  5     6      7
0  0  0  0  0  0  0     0   2630
1  0  0  0  0  0  0  2630  13409
2  0  0  0  0  0  0     0      0
3  0  0  0  0  0  0     0      0
4  0  0  0  0  0  0     0   2443

First 5 elements of y_test (as DataFrame):
0  

In [None]:
unique_city_country = set(city_country for seq in all_sequences for city_country in seq)
print("Unique city_country values:", len(unique_city_country))

Unique city_country values: 17196


### Collaborative Filtering (Item-Based)

In [None]:
# Collaborative Filtering (Item-Based)
# Create a co-occurrence matrix
item_cooccurrence_matrix = np.zeros((len(encoder.classes_), len(encoder.classes_)))

for seq in encoded_train_sequences:
    for i in range(len(seq)):
        for j in range(i + 1, len(seq)):
            item_cooccurrence_matrix[seq[i], seq[j]] += 1
            item_cooccurrence_matrix[seq[j], seq[i]] += 1

# Use TruncatedSVD for dimensionality reduction
embedding_dim = 50
svd = TruncatedSVD(n_components=embedding_dim)
item_embeddings = svd.fit_transform(item_cooccurrence_matrix)

# Calculate cosine similarity matrix for embeddings
item_sim_matrix = cosine_similarity(item_embeddings)

def collaborative_filtering_predict(current_place):
    if current_place in encoder.classes_:
        current_idx = encoder.transform([current_place])[0]
        similarity_scores = item_sim_matrix[current_idx]
        most_similar_idx = similarity_scores.argsort()[-2]
        return encoder.inverse_transform([most_similar_idx])[0]
    else:
        return None

# Collaborative Filtering Predictions
collab_preds = [collaborative_filtering_predict(encoder.inverse_transform([seq[-1]])[0]) for seq in encoded_test_sequences]

print("Collaborative Filtering Complete")
# Print the first 5 collaborative filtering predictions

print("First 5 collaborative filtering predictions:")
print(collab_preds[:5])

NameError: name 'np' is not defined

### Markov Chains

In [None]:
# Markov Chains
# Create transition pairs from the city_country chains
transitions = []

for chain in encoded_train_sequences:
    for i in range(len(chain) - 1):
        transitions.append((chain[i], chain[i + 1]))

# Create a DataFrame for transitions
transitions_df = pd.DataFrame(transitions, columns=['current_place', 'next_place'])

# Calculate transition probabilities
transition_counts = transitions_df.groupby('current_place')['next_place'].value_counts(normalize=True).unstack(fill_value=0)

# Function to predict the next place based on the current place
def markov_chain_predict(current_place):
    if current_place in transition_counts.index:
        return transition_counts.loc[current_place].idxmax()
    else:
        return None

# Generate predictions using the Markov chain model
markov_preds = [markov_chain_predict(seq[-1]) for seq in encoded_test_sequences]

# Convert predictions back to original city_country labels
markov_preds = [encoder.inverse_transform([pred])[0] if pred is not None else 'unknown' for pred in markov_preds]

print("Markov Complete")

# Print the first 5 Markov chain predictions
print("First 5 Markov chain predictions:")


Markov Complete
First 5 Markov chain predictions:
['5325_Cobra Island', 'unknown', '23921_Cobra Island', '26235_Alvonia', '17087_Glubbdubdrib']


In [None]:
# Function to ensure that y_test and predictions have consistent lengths
def filter_valid_predictions(y_true, y_pred):
    y_true_filtered, y_pred_filtered = [], []
    for true, pred in zip(y_true, y_pred):
        if pred is not None:  # Filter out 'None' predictions
            y_true_filtered.append(true)
            y_pred_filtered.append(pred)
    return y_true_filtered, y_pred_filtered


# Function to evaluate models
def evaluate_model(y_true, y_pred):
    # Ensure y_pred contains only labels present in encoder.classes_
    y_pred_mapped = []
    for label in y_pred:
        if label in encoder.classes_:
            y_pred_mapped.append(label)
        else:
            # Handle previously unseen labels, e.g., by mapping to a default label
            y_pred_mapped.append('unknown')  # Replace with appropriate handling

    # Filter y_true and y_pred to only include valid pairs
    y_true_filtered, y_pred_filtered = filter_valid_predictions(y_true, y_pred_mapped)

    # Transform y_true and y_pred_filtered
    y_true_encoded = encoder.transform(y_true_filtered)
    y_pred_encoded = encoder.transform(y_pred_filtered)

    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    precision = precision_score(y_true_encoded, y_pred_encoded, average='weighted', zero_division=1)
    return accuracy, precision

# Random Forest Model
@delayed
def train_rf(X_train, y_train, X_test):
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_preds = rf_model.predict(X_test)
    rf_preds = encoder.inverse_transform(rf_preds)
    return rf_preds

# Gradient Boosting Model
@delayed
def train_gbm(X_train, y_train, X_test):
    gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gbm_model.fit(X_train, y_train)
    gbm_preds = gbm_model.predict(X_test)
    gbm_preds = encoder.inverse_transform(gbm_preds)
    return gbm_preds

# LSTM Model
def train_lstm(X_train, y_train, X_test):
    lstm_model = Sequential()
    lstm_model.add(Embedding(input_dim=len(encoder.classes_), output_dim=50, input_length=X_train.shape[1]))
    lstm_model.add(LSTM(100, return_sequences=False))
    lstm_model.add(Dense(len(encoder.classes_), activation='softmax'))
    lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    lstm_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)
    
    # Predict probabilities using softmax output
    lstm_preds = lstm_model.predict(X_test)
    
    # Use np.argmax to get the index of the class with the highest probability
    lstm_preds_idx = np.argmax(lstm_preds, axis=1)
    
    # Convert the predicted indices back to original labels
    lstm_preds_labels = encoder.inverse_transform(lstm_preds_idx)
    
    return lstm_preds_labels

In [None]:
# Train and predict using Dask delayed functions
rf_preds, gbm_preds, lstm_preds = compute(train_rf(X_train, y_train, X_test), train_gbm(X_train, y_train, X_test), train_lstm(X_train, y_train, X_test))

# Save model predictions to checkpoint
save_data((collab_preds, markov_preds, rf_preds, gbm_preds, lstm_preds), models_predictions_path)
print("Saved model predictions to checkpoint.")


Epoch 1/10


2024-05-26 00:18:29.411464: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-26 00:18:29.606420: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 44ms/step - accuracy: 0.0085 - loss: 8.8288 - val_accuracy: 0.0094 - val_loss: 8.2178
Epoch 2/10
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 42ms/step - accuracy: 0.0079 - loss: 7.7582 - val_accuracy: 0.0100 - val_loss: 8.2739
Epoch 3/10
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 45ms/step - accuracy: 0.0075 - loss: 7.5849 - val_accuracy: 0.0166 - val_loss: 8.3873
Epoch 4/10
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 43ms/step - accuracy: 0.0129 - loss: 7.4662 - val_accuracy: 0.0158 - val_loss: 8.4638
Epoch 5/10
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 45ms/step - accuracy: 0.0168 - loss: 7.2872 - val_accuracy: 0.0170 - val_loss: 8.4665
Epoch 6/10
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 43ms/step - accuracy: 0.0204 - loss: 7.0905 - val_accuracy: 0.0343 - val_loss: 8.5325
Epoch 7/10
[1m314/314[0m 

2024-05-26 00:20:50.637380: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 563684880 exceeds 10% of free system memory.


: 

In [None]:
# Evaluating Predictions
collab_accuracy, collab_precision = evaluate_model(y_test, collab_preds)
markov_accuracy, markov_precision = evaluate_model(y_test, markov_preds)
rf_accuracy, rf_precision = evaluate_model(encoder.inverse_transform(y_test), rf_preds)
gbm_accuracy, gbm_precision = evaluate_model(encoder.inverse_transform(y_test), gbm_preds)
lstm_accuracy, lstm_precision = evaluate_model(encoder.inverse_transform(y_test), lstm_preds)

# Print the results
print(f"Collaborative Filtering - Accuracy: {collab_accuracy:.2f}, Precision: {collab_precision:.2f}")
print(f"Markov Chains - Accuracy: {markov_accuracy:.2f}, Precision: {markov_precision:.2f}")
print(f"Random Forest - Accuracy: {rf_accuracy:.2f}, Precision: {rf_precision:.2f}")
print(f"Gradient Boosting - Accuracy: {gbm_accuracy:.2f}, Precision: {gbm_precision:.2f}")
print(f"LSTM - Accuracy: {lstm_accuracy:.2f}, Precision: {lstm_precision:.2f}")

ValueError: Found input variables with inconsistent numbers of samples: [110, 3677]

In [None]:
# Function to save predictions to CSV
def save_predictions(predictions, filename, current_city):
    preds_df = pd.DataFrame(predictions, columns=['predicted_next_city_country'])
    preds_df['current_city_country'] = current_city
    preds_df.to_csv(filename, index=False)
    print(f'Predictions written to {filename}')

# Prepare current city data for reference
current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]

# Save the predictions for each model
model_predictions = {
    'collab_predictions.csv': collab_preds,
    'markov_predictions.csv': markov_preds,
    'rf_predictions.csv': rf_preds,
    'gbm_predictions.csv': gbm_preds,
    'lstm_predictions.csv': lstm_preds
}

for filename, preds in model_predictions.items():
    save_predictions(preds, filename, current_city, encoded_test_sequences)