In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel

In [2]:
# Raw inputs: both tables are read from Excel workbooks in ../data.
users_path = "../data/Visitors Preference Dataset.xlsx"
places_path = "../data/Places Dataset.xlsx"

users_df = pd.read_excel(users_path)
places_df = pd.read_excel(places_path)

In [3]:
# Preview the first rows of the user table to check which columns were loaded.
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."


In [4]:
# Preview the first rows of the places table to check which columns were loaded.
places_df.head()

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...


In [5]:
# A single checkpoint name feeds both loaders, so the tokenizer and the
# encoder are always taken from the same pretrained model.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)



In [6]:
def generate_embedding(texts):
    """Embed a batch of texts with BERT and return their averaged [CLS] vector.

    The texts are tokenized as one padded, truncated batch, run through the
    module-level ``model`` without gradient tracking, and each text is
    represented by the hidden state of its first token. Those per-text
    vectors are then averaged into a single vector.

    Parameters
    ----------
    texts : list of str
        Texts to embed. Must contain at least one item.

    Returns
    -------
    numpy.ndarray
        1-D array (one value per hidden dimension): the mean of the
        first-token embeddings over all ``texts``.

    Raises
    ------
    ValueError
        If ``texts`` is an empty list or tuple.
    """
    # Fail early with a clear message instead of an obscure tokenizer/model error.
    if isinstance(texts, (list, tuple)) and not texts:
        raise ValueError("generate_embedding() needs at least one text to embed")

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    # Keep the batch on the same device as the model so the function still
    # works after the model has been moved to an accelerator.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 of every sequence is the [CLS] token. `.cpu()` changes nothing
    # for CPU tensors but is required before `.numpy()` for tensors elsewhere.
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token embeddings
    return embeddings.mean(axis=0)  # Average across all embeddings

In [7]:
def embed_users(df):
    """Attach BERT embedding columns to the user table.

    Three columns are written into ``df`` itself, and that same frame is
    returned:

    * ``activities_embedding`` - embedding of 'Preferred Activities'
    * ``bucketlist_embedding`` - embedding of 'Bucket list destinations Sri Lanka'
    * ``user_embedding`` - element-wise mean of the two vectors above

    NOTE(review): in the previewed data these cells look like stringified
    Python lists; each cell is embedded verbatim as one text - confirm that
    this is intended.
    """
    def _embed_cell(cell):
        # Every cell goes to the encoder as a one-item batch.
        return generate_embedding([cell])

    def _average_pair(row):
        pair = [row['activities_embedding'], row['bucketlist_embedding']]
        return np.mean(pair, axis=0)

    df['activities_embedding'] = df['Preferred Activities'].apply(_embed_cell)
    df['bucketlist_embedding'] = df['Bucket list destinations Sri Lanka'].apply(_embed_cell)

    # Row-wise combination of the two per-user vectors.
    df['user_embedding'] = df.apply(_average_pair, axis=1)

    return df

In [8]:
# embed_users writes the embedding columns into the frame it is given and
# returns that same frame, so users_df keeps referring to the same table.
users_df = embed_users(users_df)

In [9]:
def embed_places(df):
    """Attach a BERT embedding of each place's review text.

    A ``reviews_embedding`` column, computed from the 'latest_reviews'
    column, is written into ``df`` itself; that same frame is returned.
    """
    def _embed_reviews(reviews_text):
        # Every cell goes to the encoder as a one-item batch.
        return generate_embedding([reviews_text])

    review_vectors = df['latest_reviews'].apply(_embed_reviews)
    df['reviews_embedding'] = review_vectors

    return df

In [10]:
# embed_places writes 'reviews_embedding' into the frame it is given and
# returns that same frame, so places_df keeps referring to the same table.
places_df = embed_places(places_df)

In [11]:
# Preview again: the three embedding columns written by embed_users should now be present.
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka,activities_embedding,bucketlist_embedding,user_embedding
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell...","[-0.24655655, -0.11371875, 0.23508331, -0.2954...","[-0.46765202, 0.27859274, -0.16576946, -0.4926...","[-0.3571043, 0.08243699, 0.034656927, -0.39404..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati...","[-0.1571686, -0.08411879, -0.15007162, -0.6868...","[-0.4141515, 0.14644617, -0.29376894, -0.66102...","[-0.28566003, 0.031163689, -0.22192028, -0.673..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba...","[-0.2384069, -0.34387514, 0.33534482, -0.29578...","[-0.39419883, -0.17076102, -0.019418191, -0.44...","[-0.31630287, -0.25731808, 0.15796332, -0.3711..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C...","[-0.25459453, 0.1686801, -0.18967749, -0.30293...","[-0.32803655, 0.25178826, -0.09374596, -0.2751...","[-0.29131556, 0.21023418, -0.14171173, -0.2890..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle...","[-0.20334406, 0.08428154, 0.15153216, -0.31344...","[-0.5467531, 0.25982797, -0.1583143, -0.256559...","[-0.37504858, 0.17205475, -0.0033910722, -0.28..."


In [12]:
# Preview again: the 'reviews_embedding' column written by embed_places should now be present.
places_df.head()

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,reviews_embedding
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...,"[0.10242607, -0.07738076, -0.2346223, -0.32018..."
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...,"[-0.12108319, -0.13072953, -0.45508036, -0.218..."
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...,"[0.41627577, -0.23499888, -0.5015071, -0.12829..."
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...,"[0.17558983, 0.07595525, -0.73735404, -0.10944..."
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...,"[0.14856876, -0.35530174, -0.7921747, -0.02979..."


In [13]:
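# Optional export of the embedded tables (left disabled).
# NOTE(review): the embedding columns hold NumPy arrays; CSV would presumably
# store them as text, so confirm they can be read back (or use a binary format
# such as pickle) before enabling these lines.
# users_df.to_csv("../data/users_df_embedded.csv", index=False)
# places_df.to_csv("../data/places_df_embedded.csv", index=False)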