In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel

In [2]:
users_df = pd.read_excel("../data/Visitors Preference Dataset.xlsx")
places_df = pd.read_excel("../data/Places Dataset.xlsx")

In [3]:
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."


In [4]:
places_df.head()

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...


In [5]:
selected_features_user = ['Preferred Activities', 'Bucket list destinations Sri Lanka']
selected_features_places = ['name', 'latest_reviews']

In [6]:
users_df = users_df[selected_features_user]
places_df = places_df[selected_features_places]

In [7]:
users_df.fillna("", inplace=True)
places_df.fillna("", inplace=True)

In [8]:
users_df['Preferred Activities'] = users_df['Preferred Activities'].apply(lambda x: ' '.join(x.split()))
users_df['Bucket list destinations Sri Lanka'] = users_df['Bucket list destinations Sri Lanka'].apply(lambda x: ' '.join(x.split()))

In [9]:
users_df['Preferred Activities'] = users_df['Preferred Activities'].apply(
    lambda x: eval(x) if isinstance(x, str) and x.strip() != '' else ''
)
users_df['Bucket list destinations Sri Lanka'] = users_df['Bucket list destinations Sri Lanka'].apply(
    lambda x: eval(x) if isinstance(x, str) and x.strip() != '' else ''
)

In [10]:
users_df['Preferred Activities'] = users_df['Preferred Activities'].apply(lambda x: " ".join(x))
users_df['Bucket list destinations Sri Lanka'] = users_df['Bucket list destinations Sri Lanka'].apply(lambda x: " ".join(x))

In [11]:
users_df.head()

Unnamed: 0,Preferred Activities,Bucket list destinations Sri Lanka
0,cycling historical monuments village homestays,Polonnaruwa Hatton Anuradhapura Ella Haputale
1,butterfly watching hot springs wildlife viewing,Madunagala Hot Water Spring Wilpattu National ...
2,sea cruises themed parks craft workshops,Mirissa Beach Negombo Lagoon Batadombalena Cra...
3,fishing hot springs sailing,Maha Oya Hot Water Springs Colombo Port City N...
4,history tours sailing literary tours,Negombo Lagoon Colombo Port City Galle Dutch F...


In [12]:
places_df['latest_reviews'] = places_df['latest_reviews'].apply(
    lambda x: ''.join([i if i.isalpha() or i.isspace() else '' for i in x])
)

In [13]:
places_df['latest_reviews'] = places_df['latest_reviews'].apply(lambda x: x.lower())
places_df['name'] = places_df['name'].apply(lambda x: x.lower())
users_df['Preferred Activities'] = users_df['Preferred Activities'].apply(lambda x: x.lower())
users_df['Bucket list destinations Sri Lanka'] = users_df['Bucket list destinations Sri Lanka'].apply(lambda x: x.lower())

In [14]:
stop_words = set(stopwords.words('english'))

places_df['latest_reviews'] = places_df['latest_reviews'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

In [15]:
places_df.head()

Unnamed: 0,name,latest_reviews
0,arugam bay beach,arugam bay beach surfers paradise spent incred...
1,mirissa beach,mirissa beach truly gem sri lankaãââs southern...
2,weligama beach (surf and stay),weligama beach fantastic spot beginner experie...
3,ahangama,ahangama bit disappointing solo traveler surfi...
4,hikkaduwa beach,hikkaduwa beach delightful escape solo travele...


In [16]:
combined_features_places = places_df['name'] + ' ' + places_df['latest_reviews']
combined_features_users = users_df['Preferred Activities'] + ' ' + users_df['Bucket list destinations Sri Lanka']

In [17]:
combined_features_places[0], combined_features_users[0]

('arugam bay beach arugam bay beach surfers paradise spent incredible days riding waves local surf schools fantastic beginners like atmosphere laidback friendly locals fellow travelers long day surfing sunsets simply magical beach bit crowded especially peak season adds lively vibe canãâât wait return friends unforgettable time arugam bay beach surfing conditions excellent managed catch great waves beach beautiful soft sand clear waters perfect swimming however noticed litter beach bit disappointing overall vibrant nightlife delicious food made definitely worth visit couple looking relaxation arugam bay beach offered perfect blend tranquility excitement enjoyed lazy days lounging beach indulging fresh seafood beachside restaurants surf scene lively easy find quieter spots unwind downside occasional noise nearby parties didnãâât detract much experience lovely getaway visited arugam bay beach family children loved surf lessons found beach bit overcrowded atmosphere vibrant locals warm we

In [18]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")



In [19]:
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings

In [20]:
places_embeddings = []
for text in combined_features_places:
    embeddings = get_bert_embeddings(text)
    places_embeddings.append(embeddings.detach().numpy())
    
places_embeddings = np.array(places_embeddings)

In [21]:
users_embeddings = []
for text in combined_features_users:
    embeddings = get_bert_embeddings(text)
    users_embeddings.append(embeddings.detach().numpy())
    
users_embeddings = np.array(users_embeddings)

In [22]:
places_embeddings[0].shape, places_embeddings[1].shape

((1, 768), (1, 768))

In [23]:
users_embeddings[0].shape, users_embeddings[1].shape

((1, 768), (1, 768))

In [42]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

In [43]:
def reduce_dimensions(embeddings, encoding_dim=128, epochs=50, batch_size=64):
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)

    dataset = TensorDataset(embeddings_tensor, embeddings_tensor)  
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    input_dim = 768
    autoencoder = Autoencoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters(), lr=1e-3)

    for epoch in range(epochs):
        for data, _ in loader:
            optimizer.zero_grad()
            reconstructed, _ = autoencoder(data)
            loss = criterion(reconstructed, data)
            loss.backward()
            optimizer.step()
            
        print(f"Epoch: {epoch+1}, Loss: {loss.item()}")
            
    with torch.no_grad():
        _, reduced_embeddings = autoencoder(embeddings_tensor)
    
    return reduced_embeddings.numpy()

In [44]:
places_embeddings_reduced = reduce_dimensions(places_embeddings, encoding_dim=128)

Epoch: 1, Loss: 0.35723382234573364
Epoch: 2, Loss: 0.2037438005208969
Epoch: 3, Loss: 0.20589686930179596
Epoch: 4, Loss: 0.20761318504810333
Epoch: 5, Loss: 0.20041348040103912
Epoch: 6, Loss: 0.2026900053024292
Epoch: 7, Loss: 0.19401061534881592
Epoch: 8, Loss: 0.19820939004421234
Epoch: 9, Loss: 0.19752812385559082
Epoch: 10, Loss: 0.19871024787425995
Epoch: 11, Loss: 0.20053039491176605
Epoch: 12, Loss: 0.19320732355117798
Epoch: 13, Loss: 0.1977614164352417
Epoch: 14, Loss: 0.1956159621477127
Epoch: 15, Loss: 0.19608354568481445
Epoch: 16, Loss: 0.19008773565292358
Epoch: 17, Loss: 0.19105343520641327
Epoch: 18, Loss: 0.1888892948627472
Epoch: 19, Loss: 0.18813614547252655
Epoch: 20, Loss: 0.18561722338199615
Epoch: 21, Loss: 0.18860819935798645
Epoch: 22, Loss: 0.18623840808868408
Epoch: 23, Loss: 0.18560248613357544
Epoch: 24, Loss: 0.18593695759773254
Epoch: 25, Loss: 0.1864466667175293
Epoch: 26, Loss: 0.18590085208415985
Epoch: 27, Loss: 0.18766456842422485
Epoch: 28, Loss:

In [45]:
places_embeddings_reduced.shape

(411, 1, 128)

In [46]:
users_embeddings_reduced = reduce_dimensions(users_embeddings, encoding_dim=128)

Epoch: 1, Loss: 0.2090817242860794
Epoch: 2, Loss: 0.20518498122692108
Epoch: 3, Loss: 0.20904715359210968
Epoch: 4, Loss: 0.20405130088329315
Epoch: 5, Loss: 0.20530976355075836
Epoch: 6, Loss: 0.20181910693645477
Epoch: 7, Loss: 0.2054326981306076
Epoch: 8, Loss: 0.20369493961334229
Epoch: 9, Loss: 0.20293186604976654
Epoch: 10, Loss: 0.20079953968524933
Epoch: 11, Loss: 0.20225290954113007
Epoch: 12, Loss: 0.20300595462322235
Epoch: 13, Loss: 0.20154933631420135
Epoch: 14, Loss: 0.2010996788740158
Epoch: 15, Loss: 0.20065034925937653
Epoch: 16, Loss: 0.200552836060524
Epoch: 17, Loss: 0.20148074626922607
Epoch: 18, Loss: 0.19955630600452423
Epoch: 19, Loss: 0.20157022774219513
Epoch: 20, Loss: 0.19820167124271393
Epoch: 21, Loss: 0.20074070990085602
Epoch: 22, Loss: 0.2011839598417282
Epoch: 23, Loss: 0.2005639225244522
Epoch: 24, Loss: 0.19909369945526123
Epoch: 25, Loss: 0.20037685334682465
Epoch: 26, Loss: 0.20066677033901215
Epoch: 27, Loss: 0.19679921865463257
Epoch: 28, Loss: 

In [47]:
users_embeddings_reduced.shape

(10000, 1, 128)

In [50]:
users_embeddings_reduced[0].shape

(1, 128)

In [52]:
user_embedding = np.array(users_embeddings_reduced[0])  # Convert to 1D array if necessary
places_embeddings = np.vstack(places_embeddings_reduced)  # Ensure 2D array


print(f"User embedding shape: {user_embedding.shape}")  
print(f"Places embeddings shape: {places_embeddings.shape}")  

if user_embedding.ndim == 2:
    user_embedding = user_embedding.flatten()  


similarity = cosine_similarity([user_embedding], places_embeddings)
print(similarity)

User embedding shape: (1, 128)
Places embeddings shape: (411, 128)
[[0.22000822 0.22416669 0.21383332 0.2188088  0.2176635  0.22902298
  0.21646044 0.22188377 0.2078129  0.20123702 0.21962975 0.20220163
  0.20945288 0.2092567  0.1946919  0.21471001 0.20736882 0.21485125
  0.20936601 0.21685061 0.2112755  0.21434015 0.20831157 0.20050055
  0.20631531 0.19489984 0.22397767 0.21402976 0.21743461 0.2120898
  0.21188267 0.21823472 0.22270843 0.22175978 0.21294922 0.21814
  0.22050864 0.21436268 0.18832934 0.19278035 0.19369179 0.20345822
  0.19078702 0.2019707  0.20638423 0.20622893 0.22973126 0.21635363
  0.20969525 0.21271475 0.21561159 0.208705   0.21659373 0.22271559
  0.19257271 0.21594638 0.20868412 0.21036841 0.21551876 0.21808004
  0.223169   0.2140134  0.21183816 0.21344939 0.21587786 0.21884027
  0.2213657  0.2023616  0.22518617 0.21788584 0.21047227 0.2150505
  0.20466405 0.22109257 0.22015381 0.22618237 0.22185667 0.21399887
  0.21949199 0.22584347 0.2164698  0.21031867 0.210445

In [53]:
places_original = pd.read_excel("../data/Places Dataset.xlsx")

In [54]:
top_5_indices = similarity[0].argsort()[-5:][::-1]

In [56]:
print("Top 5 recommended places:")
for i, idx in enumerate(top_5_indices):
    place = places_original.iloc[idx]
    print(f"{i+1}. {place['name']} - {place['latest_reviews']}")

Top 5 recommended places:
1. Lake side cabana - ['The Lake Side Cabana has a stunning view, but the experience fell short for me. The service was slow, and the amenities felt neglected. I expected a relaxing getaway, but the noise from nearby activities ruined the peaceful atmosphere. It could be a lovely spot if they improved their maintenance and service.', 'I visited with friends, hoping for a fun day by the water. Unfortunately, the cabana was overcrowded, and the staff seemed overwhelmed. We enjoyed the scenery but spent more time waiting for service than actually enjoying the activities. ItÃ¢Â€Â™s a decent place, but it definitely needs better management.', 'As a couple, we hoped for a romantic escape at Lake Side Cabana, but it was rather disappointing. The setting is beautiful, but the lack of attention to cleanliness and the disorganization of the staff made it hard to enjoy our time. We found ourselves looking for other places to relax.', 'Our family trip to Lake Side Cabana 