#### We'll will compare the following models for their accuracy and precision. 

1. Collaborative Filtering (Item-Based)
2. Markov Chains
3. Random Forest
4. Gradient Boosting Machine
5. Recurrent Neural Networks (RNN) - LSTM

To get us started we will set up the preprocessing and helper functions that will be used by all models.

In [14]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from dask.diagnostics import ProgressBar
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from dask import delayed, compute
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD




In [15]:
# Load the datasets
train_data = dd.read_csv('train_set.csv')
test_data = dd.read_csv('test_set.csv')

# Sample a subset of the data (optional, adjust frac as needed)
fraction_of_data_to_use = 0.01  # Adjust this value to suit your needs
train_data = train_data.sample(frac=fraction_of_data_to_use)
test_data = test_data.sample(frac=fraction_of_data_to_use)

# Ensure 'checkin' is in string format
train_data['checkin'] = train_data['checkin'].astype(str)
test_data['checkin'] = test_data['checkin'].astype(str)

# Create a new column that combines 'utrip_id' and 'checkin'
train_data['utrip_id_checkin'] = train_data['utrip_id'].astype(str) + '_' + train_data['checkin']
test_data['utrip_id_checkin'] = test_data['utrip_id'].astype(str) + '_' + test_data['checkin']

# Set the new column as index (if necessary for your logic)
# Note: This may not be necessary for the overall logic.
# train_data = train_data.set_index('utrip_id_checkin')
# test_data = test_data.set_index('utrip_id_checkin')

# Create a city_country column
train_data['city_country'] = train_data['city_id'].astype(str) + '_' + train_data['hotel_country'].astype(str)
test_data['city_country'] = test_data['city_id'].astype(str) + '_' + test_data['hotel_country'].astype(str)

# Handle missing values
train_data['city_country'] = train_data['city_country'].fillna('missing')
test_data['city_country'] = test_data['city_country'].fillna('missing')

# Convert city_country to category type for efficient encoding
train_data = train_data.categorize(columns=['city_country'])
test_data = test_data.categorize(columns=['city_country'])

# Group by utrip_id to create sequences
with ProgressBar():
    train_sequences = train_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()
    test_sequences = test_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()

# Encode city_country strings as integers
encoder = LabelEncoder()
all_sequences = train_sequences + test_sequences
all_cities_countries = [city_country for seq in all_sequences for city_country in seq]
encoder.fit(all_cities_countries)

encoded_train_sequences = [encoder.transform(seq).tolist() for seq in train_sequences]
encoded_test_sequences = [encoder.transform(seq).tolist() for seq in test_sequences]

# Prepare data for training models
def prepare_data(sequences, sequence_length=10):
    X, y = [], []
    for seq in tqdm(sequences, desc="Preparing data"):
        for i in range(1, min(len(seq), sequence_length + 1)):
            X.append(seq[:i])
            y.append(seq[i])
    X = pad_sequences(X, maxlen=sequence_length, padding='pre')
    y = np.array(y)
    return X, y

X_train, y_train = prepare_data(encoded_train_sequences)
X_test, y_test = prepare_data(encoded_test_sequences)

# Print shapes to verify the data preparation
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

[######                                  ] | 16% Completed | 101.93 ms

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  train_sequences = train_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()


[########################################] | 100% Completed | 2.04 sms
[######                                  ] | 16% Completed | 102.24 ms

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  test_sequences = test_data.groupby('utrip_id')['city_country'].apply(list).compute().tolist()


[########################################] | 100% Completed | 780.22 ms


In [None]:
# Print the first 5 rows of X_train and y_train
print("First 5 rows of X_train:")
print(X_train[:5])

print("\nFirst 5 elements of y_train:")
print(y_train[:5])

# Convert to Dask DataFrame for better readability
X_train_dd = dd.from_pandas(pd.DataFrame(X_train), npartitions=5)
y_train_dd = dd.from_pandas(pd.Series(y_train, name='Target'), npartitions=5)

X_test_dd = dd.from_pandas(pd.DataFrame(X_test), npartitions=5)
y_test_dd = dd.from_pandas(pd.Series(y_test, name='Target'), npartitions=5)

# Display the first 5 rows
print("\nFirst 5 rows of X_train (as DataFrame):")
print(X_train_dd.head())

print("\nFirst 5 elements of y_train (as DataFrame):")
print(y_train_dd.head())

print("\nFirst 5 rows of X_test (as DataFrame):")
print(X_test_dd.head())

print("\nFirst 5 elements of y_test (as DataFrame):")
print(y_test_dd.head())

First 5 rows of X_train:
[[   0    0    0    0    0    0    0    0    0 4041]
 [   0    0    0    0    0    0    0    0    0 3963]
 [   0    0    0    0    0    0    0    0    0 2142]
 [   0    0    0    0    0    0    0    0    0 2520]
 [   0    0    0    0    0    0    0    0    0 1392]]

First 5 elements of y_train:
[2071 2636 4098  162 1554]

First 5 rows of X_train (as DataFrame):
   0  1  2  3  4  5  6  7  8     9
0  0  0  0  0  0  0  0  0  0  4041
1  0  0  0  0  0  0  0  0  0  3963
2  0  0  0  0  0  0  0  0  0  2142
3  0  0  0  0  0  0  0  0  0  2520
4  0  0  0  0  0  0  0  0  0  1392

First 5 elements of y_train (as DataFrame):
0    2071
1    2636
2    4098
3     162
4    1554
Name: Target, dtype: int64

First 5 rows of X_test (as DataFrame):
   0  1  2  3  4  5  6  7  8     9
0  0  0  0  0  0  0  0  0  0     0
1  0  0  0  0  0  0  0  0  0  1320
2  0  0  0  0  0  0  0  0  0  1229
3  0  0  0  0  0  0  0  0  0  2957
4  0  0  0  0  0  0  0  0  0     0

First 5 elements of y_test (

In [None]:
unique_city_country = set(city_country for seq in all_sequences for city_country in seq)
print("Unique city_country values:", len(unique_city_country))

Unique city_country values: 4757


### Collaborative Filtering (Item-Based)

In [None]:
# Collaborative Filtering (Item-Based)
# Create a co-occurrence matrix
item_cooccurrence_matrix = np.zeros((len(encoder.classes_), len(encoder.classes_)))

for seq in encoded_train_sequences:
    for i in range(len(seq)):
        for j in range(i + 1, len(seq)):
            item_cooccurrence_matrix[seq[i], seq[j]] += 1
            item_cooccurrence_matrix[seq[j], seq[i]] += 1

# Use TruncatedSVD for dimensionality reduction
embedding_dim = 50  # Adjust as needed
svd = TruncatedSVD(n_components=embedding_dim)
item_embeddings = svd.fit_transform(item_cooccurrence_matrix)

# Calculate cosine similarity matrix for embeddings
item_sim_matrix = cosine_similarity(item_embeddings)

def collaborative_filtering_predict(current_place):
    if current_place in encoder.classes_:
        current_idx = encoder.transform([current_place])[0]
        similarity_scores = item_sim_matrix[current_idx]
        most_similar_idx = similarity_scores.argsort()[-2]  # -2 to avoid the same item
        return encoder.inverse_transform([most_similar_idx])[0]
    else:
        return None

collab_preds = [collaborative_filtering_predict(encoder.inverse_transform([seq[-1]])[0]) for seq in encoded_test_sequences]
print("Collaborative Filtering Complete")
# Print the first 5 collaborative filtering predictions

print("First 5 collaborative filtering predictions:")
print(collab_preds[:5])

KeyboardInterrupt: 

### Markov Chains

In [None]:
# Markov Chains
# Create transition pairs from the city_country chains
transitions = []

for chain in encoded_train_sequences:
    for i in range(len(chain) - 1):
        transitions.append((chain[i], chain[i + 1]))

# Create a DataFrame for transitions
transitions_df = pd.DataFrame(transitions, columns=['current_place', 'next_place'])

# Calculate transition probabilities
transition_counts = transitions_df.groupby('current_place')['next_place'].value_counts(normalize=True).unstack(fill_value=0)

# Function to predict the next place based on the current place
def markov_chain_predict(current_place):
    if current_place in transition_counts.index:
        return transition_counts.loc[current_place].idxmax()
    else:
        return None

# Generate predictions using the Markov chain model
markov_preds = [markov_chain_predict(seq[-1]) for seq in encoded_test_sequences]

# Convert predictions back to original city_country labels
markov_preds = [encoder.inverse_transform([pred])[0] if pred is not None else 'unknown' for pred in markov_preds]

print("Markov Complete")

# Print the first 5 Markov chain predictions
print("First 5 Markov chain predictions:")
print(markov_preds[:5])

Markov Complete
First 5 Markov chain predictions:
['5241_Cobra Island', '13260_Osterlich', '18417_Glubbdubdrib', '29319_Cobra Island', '64876_Fook Island']


In [None]:
# Function to evaluate models
def evaluate_model(y_true, y_pred):
    y_true_encoded = encoder.transform(y_true)
    y_pred_encoded = encoder.transform(y_pred)
    accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
    precision = precision_score(y_true_encoded, y_pred_encoded, average='weighted', zero_division=1)
    return accuracy, precision

# Random Forest Model
@delayed
def train_rf(X_train, y_train, X_test):
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_preds = rf_model.predict(X_test)
    rf_preds = encoder.inverse_transform(rf_preds)
    return rf_preds

# Gradient Boosting Model
@delayed
def train_gbm(X_train, y_train, X_test):
    gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gbm_model.fit(X_train, y_train)
    gbm_preds = gbm_model.predict(X_test)
    gbm_preds = encoder.inverse_transform(gbm_preds)
    return gbm_preds

# LSTM Model
@delayed
def train_lstm(X_train, y_train, X_test):
    lstm_model = Sequential()
    lstm_model.add(Embedding(input_dim=len(encoder.classes_), output_dim=50, input_length=X_train.shape[1]))
    lstm_model.add(LSTM(100, return_sequences=False))
    lstm_model.add(Dense(len(encoder.classes_), activation='softmax'))
    lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    lstm_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)
    lstm_preds = lstm_model.predict(X_test)
    lstm_preds = np.argmax(lstm_preds, axis=1)
    lstm_preds = encoder.inverse_transform(lstm_preds)
    return lstm_preds

In [None]:
# Train and predict using Dask delayed functions
rf_preds, gbm_preds, lstm_preds = compute(train_rf(X_train, y_train, X_test), train_gbm(X_train, y_train, X_test), train_lstm(X_train, y_train, X_test))

NameError: name 'X_train' is not defined

In [None]:
# Evaluate all models
collab_accuracy, collab_precision = evaluate_model(encoder.inverse_transform(y_test), collab_preds)
markov_accuracy, markov_precision = evaluate_model(encoder.inverse_transform(y_test), markov_preds)
rf_accuracy, rf_precision = evaluate_model(encoder.inverse_transform(y_test), rf_preds)
gbm_accuracy, gbm_precision = evaluate_model(encoder.inverse_transform(y_test), gbm_preds)
lstm_accuracy, lstm_precision = evaluate_model(encoder.inverse_transform(y_test), lstm_preds)

# Print the results
print(f"Collaborative Filtering - Accuracy: {collab_accuracy:.2f}, Precision: {collab_precision:.2f}")
print(f"Markov Chains - Accuracy: {markov_accuracy:.2f}, Precision: {markov_precision:.2f}")
print(f"Random Forest - Accuracy: {rf_accuracy:.2f}, Precision: {rf_precision:.2f}")
print(f"Gradient Boosting - Accuracy: {gbm_accuracy:.2f}, Precision: {gbm_precision:.2f}")
print(f"LSTM - Accuracy: {lstm_accuracy:.2f}, Precision: {lstm_precision:.2f}")

In [None]:
# Function to save predictions to CSV
def save_predictions(predictions, filename, current_city):
    preds_df = pd.DataFrame(predictions, columns=['predicted_next_city_country'])
    preds_df['current_city_country'] = current_city
    preds_df.to_csv(filename, index=False)
    print(f'Predictions written to {filename}')

# Prepare current city data for reference
current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]

# Save the predictions for each model
model_predictions = {
    'collab_predictions.csv': collab_preds,
    'markov_predictions.csv': markov_preds,
    'rf_predictions.csv': rf_preds,
    'gbm_predictions.csv': gbm_preds,
    'lstm_predictions.csv': lstm_preds
}

for filename, preds in model_predictions.items():
    save_predictions(preds, filename, current_city, encoded_test_sequences)

### Random Forest

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# # Create and train the random forest model
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# # Predict the next city_country
# rf_preds = rf_model.predict(X_test)
# rf_preds = encoder.inverse_transform(rf_preds)

# print("Random Forest Complete")


NameError: name 'X_train' is not defined

### Gradient Boost

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# # Create and train the gradient boosting model
# gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
# gbm_model.fit(X_train, y_train)

# # Predict the next city_country
# gbm_preds = gbm_model.predict(X_test)
# gbm_preds = encoder.inverse_transform(gbm_preds)

# print("GBM Complete")

ValueError: X has 21 features, but GradientBoostingClassifier is expecting 20 features as input.

### Recurrent Neural Networks (RNN) - LSTM

In [None]:
# # Define the LSTM model
# lstm_model = Sequential()
# lstm_model.add(Embedding(input_dim=len(encoder.classes_), output_dim=50, input_length=X_train.shape[1]))
# lstm_model.add(LSTM(100, return_sequences=False))
# lstm_model.add(Dense(len(encoder.classes_), activation='softmax'))

# # Compile the model
# lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# lstm_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# # Predict the next city_country
# lstm_preds = lstm_model.predict(X_test)
# lstm_preds = np.argmax(lstm_preds, axis=1)
# lstm_preds = encoder.inverse_transform(lstm_preds)

# print("LSTM Complete")


### Model Comparison

In [None]:
# # Helper function to evaluate models
# def evaluate_model(y_true, y_pred):
#     y_true_encoded = encoder.transform(y_true)
#     y_pred_encoded = encoder.transform(y_pred)
#     accuracy = accuracy_score(y_true_encoded, y_pred_encoded)
#     precision = precision_score(y_true_encoded, y_pred_encoded, average='weighted', zero_division=1)
#     return accuracy, precision


# # Evaluate all models
# #collab_accuracy, collab_precision = evaluate_model(y_test, collab_preds)
# markov_accuracy, markov_precision = evaluate_model(y_test, markov_preds)
# #rf_accuracy, rf_precision = evaluate_model(y_test, rf_preds)
# #gbm_accuracy, gbm_precision = evaluate_model(y_test, gbm_preds)
# #lstm_accuracy, lstm_precision = evaluate_model(y_test, lstm_preds)

# # Print the results
# #print(f"Collaborative Filtering - Accuracy: {collab_accuracy:.2f}, Precision: {collab_precision:.2f}")
# print(f"Markov Chains - Accuracy: {markov_accuracy:.2f}, Precision: {markov_precision:.2f}")
# #print(f"Random Forest - Accuracy: {rf_accuracy:.2f}, Precision: {rf_precision:.2f}")
# #print(f"Gradient Boosting - Accuracy: {gbm_accuracy:.2f}, Precision: {gbm_precision:.2f}")
# #print(f"LSTM - Accuracy: {lstm_accuracy:.2f}, Precision: {lstm_precision:.2f}")


ValueError: y contains previously unseen labels: '3452'

In [None]:
# # Convert collab_preds to DataFrame
# collab_preds_df = pd.DataFrame(collab_preds, columns=['predicted_next_city_country'])

# # Optionally, add a column for the original test sequences for reference
# # Assuming you want to add the last element from each sequence as the current city
# current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
# collab_preds_df['current_city_country'] = current_city

# # Save the DataFrame to a CSV file
# output_file = 'collab_predictions.csv'
# collab_preds_df.to_csv(output_file, index=False)
# print(f'Predictions written to {output_file}')

In [None]:
# # Convert markov_preds to DataFrame
# markov_preds_df = pd.DataFrame(markov_preds, columns=['predicted_next_city_country'])

# # Optionally, add a column for the original test sequences for reference
# # Assuming you want to add the last element from each sequence as the current city
# current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
# markov_preds_df['current_city_country'] = current_city

# # Save the DataFrame to a CSV file
# output_file = 'markov_predictions.csv'
# markov_preds_df.to_csv(output_file, index=False)
# print(f'Predictions written to {output_file}')


In [None]:

# # Convert gbm_preds to DataFrame
# gbm_preds_df = pd.DataFrame(gbm_preds, columns=['predicted_next_city_country'])

# # Optionally, add a column for the original test sequences for reference
# # Assuming you want to add the last element from each sequence as the current city
# current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
# gbm_preds_df['current_city_country'] = current_city

# # Save the DataFrame to a CSV file
# output_file = 'gbm_predictions.csv'
# gbm_preds_df.to_csv(output_file, index=False)
# print(f'Predictions written to {output_file}')


In [None]:
# # Convert lstm_preds to DataFrame
# lstm_preds_df = pd.DataFrame(lstm_preds, columns=['predicted_next_city_country'])

# # Optionally, add a column for the original test sequences for reference
# # Assuming you want to add the last element from each sequence as the current city
# current_city = [encoder.inverse_transform([seq[-1]])[0] for seq in encoded_test_sequences]
# lstm_preds_df['current_city_country'] = current_city

# # Save the DataFrame to a CSV file
# output_file = 'lstm_predictions.csv'
# lstm_preds_df.to_csv(output_file, index=False)
# print(f'Predictions written to {output_file}')