In [1]:
import ast

import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [2]:
# Load both workbooks in one pass: visitor preferences first, then the places catalogue.
users_df, places_df = (
    pd.read_excel(workbook)
    for workbook in (
        "../data/Visitors Preference Dataset.xlsx",
        "../data/Places Dataset.xlsx",
    )
)

In [3]:
# Preview the first rows of the visitor preferences table as loaded from the workbook.
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."


In [4]:
# Preview the first rows of the places table as loaded from the workbook.
places_df.head()

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...


In [5]:
# Columns kept from the visitors table: both hold stringified Python lists.
selected_features_user = [
    'Preferred Activities',
    'Bucket list destinations Sri Lanka',
]
# Columns kept from the places table: the place name plus its review text.
selected_features_places = [
    'name',
    'latest_reviews',
]

In [6]:
# Keep only the modelling columns. The explicit .copy() detaches the result from the
# frame it was sliced from, so the column assignments in later cells write to an
# independent frame instead of triggering SettingWithCopyWarning.
users_df = users_df[selected_features_user].copy()
places_df = places_df[selected_features_places].copy()

In [7]:
# Replace missing cells with empty strings so the string operations below never see NaN.
# Rebinding the result (instead of inplace=True) avoids mutating a column-sliced frame in
# place, which pandas warns about and which is not reliable under copy-on-write.
users_df = users_df.fillna("")
places_df = places_df.fillna("")

In [8]:
# Collapse every run of whitespace (including newlines) in the raw cell text to a single space.
for column in ('Preferred Activities', 'Bucket list destinations Sri Lanka'):
    users_df[column] = users_df[column].apply(lambda raw: ' '.join(raw.split()))

In [9]:
def _parse_list_cell(cell):
    """Turn a stringified list such as "['a', 'b']" into a real list.

    ast.literal_eval only accepts Python literals, so a crafted spreadsheet cell
    cannot execute arbitrary code the way eval() would.

    Values that are already lists/tuples are passed through, which keeps this cell
    safe to re-run; blank strings and any other value yield an empty list.
    Malformed text raises ValueError or SyntaxError from ast.literal_eval.
    """
    if isinstance(cell, (list, tuple)):
        return list(cell)
    if isinstance(cell, str) and cell.strip() != '':
        return ast.literal_eval(cell)
    return []


users_df['Preferred Activities'] = users_df['Preferred Activities'].apply(_parse_list_cell)
users_df['Bucket list destinations Sri Lanka'] = users_df['Bucket list destinations Sri Lanka'].apply(
    _parse_list_cell
)

In [10]:
# Flatten each parsed list into one space-separated string.
users_df['Preferred Activities'] = users_df['Preferred Activities'].apply(" ".join)
users_df['Bucket list destinations Sri Lanka'] = (
    users_df['Bucket list destinations Sri Lanka'].apply(" ".join)
)

In [11]:
# Preview users_df after both list columns were flattened into plain strings.
users_df.head()

Unnamed: 0,Preferred Activities,Bucket list destinations Sri Lanka
0,cycling historical monuments village homestays,Polonnaruwa Hatton Anuradhapura Ella Haputale
1,butterfly watching hot springs wildlife viewing,Madunagala Hot Water Spring Wilpattu National ...
2,sea cruises themed parks craft workshops,Mirissa Beach Negombo Lagoon Batadombalena Cra...
3,fishing hot springs sailing,Maha Oya Hot Water Springs Colombo Port City N...
4,history tours sailing literary tours,Negombo Lagoon Colombo Port City Galle Dutch F...


In [12]:
# Keep only alphabetic and whitespace characters; digits and punctuation are dropped
# without inserting a replacement character.
places_df['latest_reviews'] = places_df['latest_reviews'].apply(
    lambda review: ''.join(filter(lambda ch: ch.isalpha() or ch.isspace(), review))
)

In [13]:
# Lowercase every text column that feeds the embeddings.
for frame, column in (
    (places_df, 'latest_reviews'),
    (places_df, 'name'),
    (users_df, 'Preferred Activities'),
    (users_df, 'Bucket list destinations Sri Lanka'),
):
    frame[column] = frame[column].apply(lambda text: text.lower())

In [14]:
# English stop-word list from NLTK, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Rebuild each review from its whitespace-separated tokens, skipping stop words.
places_df['latest_reviews'] = places_df['latest_reviews'].apply(
    lambda review: ' '.join(token for token in review.split() if token not in stop_words)
)

In [15]:
# Preview places_df after lowercasing and stop-word removal.
places_df.head()

Unnamed: 0,name,latest_reviews
0,arugam bay beach,arugam bay beach surfers paradise spent incred...
1,mirissa beach,mirissa beach truly gem sri lankaãââs southern...
2,weligama beach (surf and stay),weligama beach fantastic spot beginner experie...
3,ahangama,ahangama bit disappointing solo traveler surfi...
4,hikkaduwa beach,hikkaduwa beach delightful escape solo travele...


In [16]:
# One text document per place: its name followed by the cleaned review text.
place_names = places_df['name']
place_reviews = places_df['latest_reviews']
combined_features_places = place_names + ' ' + place_reviews

# One text document per user: preferred activities followed by bucket-list destinations.
user_activities = users_df['Preferred Activities']
user_destinations = users_df['Bucket list destinations Sri Lanka']
combined_features_users = user_activities + ' ' + user_destinations

In [17]:
# Spot check: show the first combined document for a place and for a user.
combined_features_places[0], combined_features_users[0]

('arugam bay beach arugam bay beach surfers paradise spent incredible days riding waves local surf schools fantastic beginners like atmosphere laidback friendly locals fellow travelers long day surfing sunsets simply magical beach bit crowded especially peak season adds lively vibe canãâât wait return friends unforgettable time arugam bay beach surfing conditions excellent managed catch great waves beach beautiful soft sand clear waters perfect swimming however noticed litter beach bit disappointing overall vibrant nightlife delicious food made definitely worth visit couple looking relaxation arugam bay beach offered perfect blend tranquility excitement enjoyed lazy days lounging beach indulging fresh seafood beachside restaurants surf scene lively easy find quieter spots unwind downside occasional noise nearby parties didnãâât detract much experience lovely getaway visited arugam bay beach family children loved surf lessons found beach bit overcrowded atmosphere vibrant locals warm we

In [18]:
# Tokenizer and encoder must come from the same checkpoint, so name it once.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)



In [19]:
def get_bert_embeddings(text, pooling="cls"):
    """Encode text with the notebook-level ``tokenizer``/``model`` into one vector per input.

    Parameters
    ----------
    text : str or list of str
        Passed straight to the tokenizer with padding and truncation enabled.
    pooling : {"cls", "mean"}, default "cls"
        "cls" returns the hidden state at sequence position 0 (the original behaviour).
        "mean" averages the hidden states over the positions whose attention mask is 1,
        so padding does not dilute the result.

    Returns
    -------
    torch.Tensor
        Tensor of shape (batch, hidden_size); batch is 1 for a single string.

    Raises
    ------
    ValueError
        If ``pooling`` is not one of the supported modes.
    """
    # Validate before tokenizing so a typo fails fast instead of after a forward pass.
    if pooling not in ("cls", "mean"):
        raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

    if pooling == "cls":
        return hidden_states[:, 0, :]

    # Mask-aware mean: zero out padded positions, then divide by the real token count.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden_states * mask).sum(dim=1) / token_counts

In [20]:
# Embed every place document one at a time and stack the per-document arrays.
places_embeddings = np.array(
    [
        get_bert_embeddings(document).detach().numpy()
        for document in combined_features_places
    ]
)

In [21]:
# Embed every user document one at a time and stack the per-document arrays.
users_embeddings = np.array(
    [
        get_bert_embeddings(document).detach().numpy()
        for document in combined_features_users
    ]
)

In [22]:
# Stack the per-place arrays row-wise into a single 2-D matrix.
places_embeddings = np.vstack(places_embeddings)

# Take the first user's embedding and make sure it is a flat vector.
user_embedding = np.array(users_embeddings[0])
if user_embedding.ndim == 2:
    user_embedding = user_embedding.ravel()

# cosine_similarity expects 2-D inputs, so present the user vector as a single row.
similarity = cosine_similarity(user_embedding.reshape(1, -1), places_embeddings)
print(similarity)

[[0.8402272  0.8344376  0.7671116  0.8141402  0.82462764 0.83331627
  0.80389076 0.83578026 0.841722   0.7939187  0.84546614 0.7953314
  0.8168553  0.81634253 0.8417226  0.85229504 0.8323041  0.82890975
  0.8072432  0.8168755  0.8477305  0.8185252  0.82947874 0.8077965
  0.8589698  0.8254177  0.833706   0.8356146  0.8348495  0.8461016
  0.7680709  0.7984809  0.81921154 0.80103064 0.8377528  0.8517184
  0.8135715  0.81345344 0.8193511  0.8088815  0.83535683 0.85182595
  0.8317849  0.8238195  0.8091043  0.82858026 0.77683955 0.8325143
  0.78420377 0.8607769  0.8359536  0.8004573  0.814752   0.83567595
  0.8023353  0.8464005  0.84508264 0.8226776  0.84554374 0.82747996
  0.80766696 0.7784786  0.7721653  0.8015765  0.819438   0.80714726
  0.8043567  0.8002417  0.8356118  0.8482743  0.85267633 0.83728784
  0.8000858  0.8403493  0.8394454  0.8177206  0.8368954  0.847592
  0.8328368  0.79148275 0.83155286 0.82697475 0.8225511  0.83757186
  0.6890092  0.8166372  0.8243923  0.8188045  0.8229067

In [23]:
# Reload the untouched workbooks so results can be reported with the original column values.
places_original, users_original = map(
    pd.read_excel,
    (
        "../data/Places Dataset.xlsx",
        "../data/Visitors Preference Dataset.xlsx",
    ),
)

In [24]:
# Sort the single similarity row ascending, reverse it, and keep the five best place positions.
top_5_places = np.argsort(similarity[0])[::-1][:5]

In [25]:
# Echo the first user's raw preferences, then list the recommended place names by rank.
print("User's Preferred Activities: ", users_original.loc[0, 'Preferred Activities'])
print("User's Bucket list destinations Sri Lanka: ", users_original.loc[0, 'Bucket list destinations Sri Lanka'])
print("\n")
print("Top 5 Recommended Places: ")
for rank, place_idx in enumerate(top_5_places, start=1):
    print(f"{rank}. {places_original.loc[place_idx, 'name']}")

User's Preferred Activities:  ['cycling', 'historical monuments', 'village homestays']
User's Bucket list destinations Sri Lanka:  ['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ella', 'Haputale']


Top 5 Recommended Places: 
1. Thummulla Wewa
2. Ram setu
3. Ruwanweli Maha Seya
4. Dimiyawa View Point
5. Kandy Lake View Point
