In [1]:
# Preprocessing and Helper Functions
# First, let's set up the preprocessing and helper functions that will be used by the RecSys models.

import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import timedelta


In [2]:
# Load the dataset
train_data = pd.read_csv('train_set.csv')
fraction_of_data_to_use = 0.01   
train_data = train_data.sample(frac=fraction_of_data_to_use)

# Convert date columns to datetime format
train_data['checkin'] = pd.to_datetime(train_data['checkin'])
train_data['checkout'] = pd.to_datetime(train_data['checkout'])

# Sort the data by user trip ID and check-in date to maintain the chronological order
train_data.sort_values(by=['utrip_id', 'checkin'], inplace=True)

# Create a city_country column
train_data['city_country'] = train_data['city_id'].astype(str) + '_' + train_data['hotel_country'].astype(str)


In [None]:
#Create city and country chains with additional features

In [3]:
# Initialize tqdm for progress tracking
tqdm.pandas()

# Function to calculate trip duration
def calculate_trip_duration(checkin, checkout):
    return (checkout - checkin).days

# Function to calculate stay duration
def calculate_stay_duration(checkin, checkout):
    return (checkout - checkin).days

# Group by utrip_id and create the city and country chains with additional features
trip_chains = train_data.groupby('utrip_id').progress_apply(lambda group: pd.Series({
    'user_id': group['user_id'].iloc[0],
    'cities_chain': list(group['city_id']),
    'countries_chain': list(group['hotel_country']),
    'trip_duration': calculate_trip_duration(group['checkin'].iloc[0], group['checkout'].iloc[-1]),
    'stay_durations': list(group.apply(lambda row: calculate_stay_duration(row['checkin'], row['checkout']), axis=1)),
    'device_classes': list(group['device_class']),
    'affiliate_ids': list(group['affiliate_id']),
    'checkin_months': list(group['checkin'].dt.month),
    'checkin_days_of_week': list(group['checkin'].dt.dayofweek)
})).reset_index()

# Rename columns for clarity
trip_chains.columns = ['utrip_id', 'user_id', 'cities_chain', 'countries_chain', 'trip_duration', 'stay_durations',
                       'device_classes', 'affiliate_ids', 'checkin_months', 'checkin_days_of_week']

# Create a new DataFrame for trip chains
trip_chains_df = trip_chains.copy()

# Add city_country chains to the DataFrame
trip_chains_df['city_country_chain'] = trip_chains_df.apply(
    lambda row: [f"{city}_{country}" for city, country in zip(row['cities_chain'], row['countries_chain'])], axis=1)

# Display the first few rows of the trip_chains_df DataFrame
print(trip_chains_df.head())


  0%|          | 0/11385 [00:00<?, ?it/s]

100%|██████████| 11385/11385 [00:37<00:00, 302.87it/s]


    utrip_id  user_id cities_chain countries_chain  trip_duration  \
0  1000643_1  1000643      [62185]       [Axphain]              2   
1   100160_2   100160      [49540]    [Novistrana]              2   
2  1001776_1  1001776       [4608]  [Glubbdubdrib]              1   
3  1001838_2  1001838      [43666]      [Leutonia]              1   
4  1002275_1  1002275      [36805]       [Alvonia]              1   

  stay_durations device_classes affiliate_ids checkin_months  \
0            [2]      [desktop]         [384]           [10]   
1            [2]      [desktop]        [8132]            [8]   
2            [1]      [desktop]        [3134]            [8]   
3            [1]      [desktop]        [9924]            [6]   
4            [1]      [desktop]        [4541]            [9]   

  checkin_days_of_week   city_country_chain  
0                  [5]      [62185_Axphain]  
1                  [1]   [49540_Novistrana]  
2                  [0]  [4608_Glubbdubdrib]  
3               

In [5]:
# Write the trip_chains_df DataFrame to an Excel file
output_file = 'trip_chains_enhanced.xlsx'
trip_chains_df.to_excel(output_file, index=False)
print(f'Trip chains written to {output_file}')


Trip chains written to trip_chains_enhanced.xlsx


In [6]:
#Generating Transition Pairs and Calculating Transition Probabilities:
# MARKOV CHAINS
    
transitions = []

for chain in trip_chains_df['city_country_chain']:
    for i in range(len(chain) - 1):
        transitions.append((chain[i], chain[i + 1]))

transitions_df = pd.DataFrame(transitions, columns=['current_place', 'next_place'])
transition_counts = transitions_df.groupby('current_place')['next_place'].value_counts(normalize=True).unstack(fill_value=0)
print(transition_counts.head())

next_place          10060_Novistrana  10061_Bozatta  10283_Bozatta  \
current_place                                                        
10013_Fook Island                0.0            0.0            0.0   
10064_Glubbdubdrib               0.0            0.0            0.0   
1034_Gondal                      0.0            0.0            0.0   
10392_El Othar                   0.0            0.0            0.0   
1046_Glubbdubdrib                0.0            0.0            0.0   

next_place          1046_Glubbdubdrib  10743_Cobra Island  10839_Gondal  \
current_place                                                             
10013_Fook Island                 0.0                 0.0           0.0   
10064_Glubbdubdrib                0.0                 0.0           0.0   
1034_Gondal                       0.0                 0.0           0.0   
10392_El Othar                    0.0                 0.0           0.0   
1046_Glubbdubdrib                 0.0                 0.0  

In [7]:
#Predicting the Next City-Country and Adding Predictions to DataFrame:

def predict_next_place(current_place):
    if current_place in transition_counts.index:
        return transition_counts.loc[current_place].idxmax()
    else:
        return None

trip_chains_df['predict_next_city_country'] = trip_chains_df['city_country_chain'].apply(
    lambda chain: predict_next_place(chain[-1]) if len(chain) > 0 else None)
print(trip_chains_df.head())

    utrip_id  user_id cities_chain countries_chain  trip_duration  \
0  1000643_1  1000643      [62185]       [Axphain]              2   
1   100160_2   100160      [49540]    [Novistrana]              2   
2  1001776_1  1001776       [4608]  [Glubbdubdrib]              1   
3  1001838_2  1001838      [43666]      [Leutonia]              1   
4  1002275_1  1002275      [36805]       [Alvonia]              1   

  stay_durations device_classes affiliate_ids checkin_months  \
0            [2]      [desktop]         [384]           [10]   
1            [2]      [desktop]        [8132]            [8]   
2            [1]      [desktop]        [3134]            [8]   
3            [1]      [desktop]        [9924]            [6]   
4            [1]      [desktop]        [4541]            [9]   

  checkin_days_of_week   city_country_chain predict_next_city_country  
0                  [5]      [62185_Axphain]             16612_Axphain  
1                  [1]   [49540_Novistrana]             

In [8]:
# Writing the output of the predicted next city to an Excel file
output_file = 'trip_chains_enhanced_withprednextcity.xlsx'
trip_chains_df.to_excel(output_file, index=False)
print(f'Trip chains written to {output_file}')

Trip chains written to trip_chains_enhanced_withprednextcity.xlsx


In [9]:
# Simple RNN model

# Additional Imports
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score

# Function to preprocess data and prepare for RNN
def preprocess_data(data):
    # Encode the city_country column
    label_encoder = LabelEncoder()
    data['city_country_encoded'] = label_encoder.fit_transform(data['city_country'])

    # Create sequences for RNN
    sequences = []
    targets = []
    for utrip_id, group in data.groupby('utrip_id'):
        sequences.append(group['city_country_encoded'].values[:-1])
        targets.append(group['city_country_encoded'].values[1:])
    
    # Pad sequences
    max_seq_length = max(len(seq) for seq in sequences)
    X = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    y = pad_sequences(targets, maxlen=max_seq_length, padding='post')
    
    return X, y, label_encoder

# Split the data into training and test sets
X, y, label_encoder = preprocess_data(train_data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=42)

# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(label_encoder.classes_), output_dim=50))#, input_length=X_train.shape[1]))
model.add(SimpleRNN(100, return_sequences=True))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=-1)

# Calculate accuracy and precision
accuracy = accuracy_score(y_test.flatten(), y_pred_classes.flatten())
precision = precision_score(y_test.flatten(), y_pred_classes.flatten(), average='macro',zero_division=0)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")

2024-06-01 08:32:17.950301: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-01 08:32:24.315335: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.9561 - loss: 2.7967 - val_accuracy: 0.9870 - val_loss: 0.1104
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step
Accuracy: 1.00
Precision: 0.33


In [10]:
# Decode the predictions back to city_country names

y_test_flat = y_test.flatten()
y_pred_flat = y_pred_classes.flatten()
y_test_decoded = label_encoder.inverse_transform(y_test_flat)
y_pred_decoded = label_encoder.inverse_transform(y_pred_flat)

# Create a DataFrame to store the results

results_df = pd.DataFrame({
    'Actual': y_test_decoded,
    'Predicted': y_pred_decoded
})

# Write the results to an Excel file
results_df.to_excel('predicted_city_results.xlsx', index=False)
print("Predicted results written to predicted_city_results.xlsx")

Predicted results written to predicted_city_results.xlsx
