# Sapphire Recommender

## Project Initialization

In [1]:
# Packages to install (not on Colab)

!pip install keybert -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/249.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Imports

import pandas as pd
import numpy as np
import string
import pandas.api.types as ptypes
import ast
import re
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer, TFBertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from keybert import KeyBERT

In [3]:
# Download NLTK stopwords

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Exploratory Data Analysis - Places

Let's load `places.xlsx` to see what we're working with.

In [4]:
# Load places data from datasets

places_df = pd.read_excel("./places.xlsx")
places_df.head(10)

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...
5,Tangalle,6.024338,80.794073,"Tangalle, Sri Lanka",,,['Tangalle was a bit of a letdown for me. The ...
6,Unawatuna Beach,6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,['Unawatuna Beach is a slice of paradise! The ...
7,Pigeon Island,8.721837,81.204071,"Pigeon Island, Sri Lanka",4.5,174.0,['Pigeon Island is a gem! Snorkeling here was ...
8,Galle Dutch Fort,6.030459,80.215021,"Galle 80000, Sri Lanka",4.6,16934.0,"[""Galle Dutch Fort is a stunning blend of hist..."
9,Polonnaruwa Ancient City,7.945942,81.000329,"Polonnaruwa, Sri Lanka",4.3,878.0,['Polonnaruwa Ancient City is a stunning place...


First, we'll inspect the properties of this dataframe.

In [5]:
places_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411 entries, 0 to 410
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                411 non-null    object 
 1   lat                 410 non-null    float64
 2   lng                 410 non-null    float64
 3   formatted_address   411 non-null    object 
 4   rating              355 non-null    float64
 5   user_ratings_total  355 non-null    float64
 6   latest_reviews      411 non-null    object 
dtypes: float64(4), object(3)
memory usage: 22.6+ KB


We'll handle the missing / null values appropriately.

In [6]:
# Fill NaN values in 'rating' and 'user_ratings_total' columns with the mean of their respective columns
places_df['rating'].fillna(places_df['rating'].mean(), inplace=True)
places_df['user_ratings_total'].fillna(places_df['user_ratings_total'].mean(), inplace=True)

# Drop rows where 'lat' and 'lng' have NaN values
places_df = places_df.dropna(subset=['lat'])
places_df = places_df.dropna(subset=['lng'])

places_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 410 entries, 0 to 410
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                410 non-null    object 
 1   lat                 410 non-null    float64
 2   lng                 410 non-null    float64
 3   formatted_address   410 non-null    object 
 4   rating              410 non-null    float64
 5   user_ratings_total  410 non-null    float64
 6   latest_reviews      410 non-null    object 
dtypes: float64(4), object(3)
memory usage: 25.6+ KB


Let's look at what is stored in a single cell of the `latest_reviews` column.

In [7]:
places_df.loc[3]["latest_reviews"]

"['Ahangama was a bit disappointing for me as a solo traveler. The surfing conditions were not as great as I expected, with inconsistent waves. The beach was nice, but it felt overcrowded at times. I was hoping for a more laid-back atmosphere. ItÃ¢Â€Â™s decent for a quick visit, but I wouldnÃ¢Â€Â™t recommend staying long.', 'As a couple, we found Ahangama quite charming, but it had its downsides. While the beach itself was beautiful, the facilities were lacking. Finding clean showers and restrooms was a challenge. We enjoyed a couple of surf lessons, but the instructors seemed rushed and not very attentive. Overall, it was an okay experience.', 'Our family trip to Ahangama was mixed. The kids loved the beach and the idea of learning to surf, but the surf school was disorganized. We spent a lot of time waiting around. The beach was pretty, but there was quite a bit of trash around. It could be a great spot with a bit more care and management.', 'I visited Ahangama with friends, and whil

Here, we can see that lists of strings, each holding what appears to be a tourist's review of the place, are stored under `latest_reviews`. These reviews can be used to glean information about these places.

## Preprocessing the Places Dataset

Here, we'll first preprocess the data in the `latest_reviews` column by

1. Removing stopwords
2. Removing square brackets and punctuation
3. Lower casing the text
4. Removing garbled words

In [8]:
def clean_latest_reviews(df):
    # Check if the dataframe contains the 'latest_reviews' column
    if "latest_reviews" not in df.columns:
        raise ValueError("The dataframe does not have a 'latest_reviews' column.")

    # Get the set of English stopwords
    stop_words = set(stopwords.words("english"))

    # Define a function to clean and fix each review
    def clean_review(review):
        # Remove square brackets and punctuation
        review = review.translate(str.maketrans("", "", string.punctuation))
        review = review.replace("[", "").replace("]", "")

        # Lowercase the text
        review = review.lower()

        # Split the review into words
        words = review.split()

        # Filter out words with unusual symbols or garbled text
        cleaned_words = [word for word in words if re.match(r"^[a-z]+$", word)]

        # Remove stopwords
        filtered_words = [word for word in cleaned_words if word not in stop_words]

        # Join the cleaned words back into a string
        cleaned_review = " ".join(filtered_words)
        return cleaned_review

    # Apply the clean_review function to the 'latest_reviews' column
    df["latest_reviews"] = df["latest_reviews"].apply(clean_review)

    return df


new_places_df = clean_latest_reviews(places_df)
new_places_df.head(10)

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,arugam bay beach surfers paradise spent incred...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,mirissa beach truly gem sri southern coast sof...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,weligama beach fantastic spot beginner experie...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",4.459437,1608.639437,ahangama bit disappointing solo traveler surfi...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,hikkaduwa beach delightful escape solo travele...
5,Tangalle,6.024338,80.794073,"Tangalle, Sri Lanka",4.459437,1608.639437,tangalle bit letdown beaches beautiful felt ov...
6,Unawatuna Beach,6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,unawatuna beach slice paradise water crystal c...
7,Pigeon Island,8.721837,81.204071,"Pigeon Island, Sri Lanka",4.5,174.0,pigeon island gem snorkeling highlight trip un...
8,Galle Dutch Fort,6.030459,80.215021,"Galle 80000, Sri Lanka",4.6,16934.0,galle dutch fort stunning blend history archit...
9,Polonnaruwa Ancient City,7.945942,81.000329,"Polonnaruwa, Sri Lanka",4.3,878.0,polonnaruwa ancient city stunning place steepe...


Now, we can inspect a single review string to see what it looks like.

In [9]:
new_places_df.loc[0]["latest_reviews"]

'arugam bay beach surfers paradise spent incredible days riding waves local surf schools fantastic beginners like atmosphere laidback friendly locals fellow travelers long day surfing sunsets simply magical beach bit crowded especially peak season adds lively vibe wait return friends unforgettable time arugam bay beach surfing conditions excellent managed catch great waves beach beautiful soft sand clear waters perfect swimming however noticed litter beach bit disappointing overall vibrant nightlife delicious food made definitely worth visit couple looking relaxation arugam bay beach offered perfect blend tranquility excitement enjoyed lazy days lounging beach indulging fresh seafood beachside restaurants surf scene lively easy find quieter spots unwind downside occasional noise nearby parties detract much experience lovely getaway visited arugam bay beach family children loved surf lessons found beach bit overcrowded atmosphere vibrant locals warm welcoming spent time exploring nearby

Let's also clean up the `name` column which also has garbled characters (discovered during manual EDA).

In [10]:
def clean_column_garbled(df, column_name):
    # Check if the dataframe contains the specified column
    if column_name not in df.columns:
        raise ValueError(f"The dataframe does not have a '{column_name}' column.")

    # Define a function to remove garbled characters but keep the original text structure
    def clean_text(text):
        # Remove garbled characters (anything that's not a letter, number, space, or standard punctuation)
        cleaned_text = re.sub(r"[^A-Za-z0-9\s\.\,\!\?\-\']", "", text)
        return cleaned_text

    # Apply the clean_text function to the specified column and store the result in a new column
    df[column_name] = df[column_name].apply(clean_text)

    return df

# Clean the 'name' column and store the result in 'clean_name' column
new_places_df = clean_column_garbled(new_places_df, "name")

# Display the first 10 rows of the updated new_places_df
new_places_df.head(10)

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,arugam bay beach surfers paradise spent incred...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,mirissa beach truly gem sri southern coast sof...
2,Weligama Beach surf and stay,5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,weligama beach fantastic spot beginner experie...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",4.459437,1608.639437,ahangama bit disappointing solo traveler surfi...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,hikkaduwa beach delightful escape solo travele...
5,Tangalle,6.024338,80.794073,"Tangalle, Sri Lanka",4.459437,1608.639437,tangalle bit letdown beaches beautiful felt ov...
6,Unawatuna Beach,6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,unawatuna beach slice paradise water crystal c...
7,Pigeon Island,8.721837,81.204071,"Pigeon Island, Sri Lanka",4.5,174.0,pigeon island gem snorkeling highlight trip un...
8,Galle Dutch Fort,6.030459,80.215021,"Galle 80000, Sri Lanka",4.6,16934.0,galle dutch fort stunning blend history archit...
9,Polonnaruwa Ancient City,7.945942,81.000329,"Polonnaruwa, Sri Lanka",4.3,878.0,polonnaruwa ancient city stunning place steepe...


Next, let's extract the main activities mentioned inside the reviews using `TfidVectorizer`.

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Function to clean the review text
def clean_text(text):
    # Remove non-alphanumeric characters and lowercase the text
    cleaned_text = re.sub('[^A-Za-z0-9]+', ' ', text.lower())
    return cleaned_text

# Function to extract top 5 keywords using TF-IDF and filter out irrelevant activities
def extract_keywords_tfidf(review_text, unique_activities, tfidf_vectorizer, location_type):
    # Clean the review text
    cleaned_text = clean_text(review_text)

    # Transform the cleaned review text
    tfidf_matrix = tfidf_vectorizer.transform([cleaned_text])

    # Extract the TF-IDF scores for the activities in the vocabulary
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_scores = dict(zip(feature_names, tfidf_matrix.toarray().flatten()))

    # Sort activities by TF-IDF score
    sorted_activities = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)

    # Filter activities based on location type (for example, avoid "amusement parks" for religious sites)
    if location_type == "religious":
        forbidden_activities = ["amusement parks", "nightlife", "casinos"]  # Add more if needed
    else:
        forbidden_activities = []

    # Return the top activities that are allowed and in the vocabulary
    filtered_activities = [
        activity for activity, score in sorted_activities[:7]
        if activity in unique_activities and activity not in forbidden_activities
    ]

    return filtered_activities

# Initialize the TF-IDF Vectorizer with the unique activities as the vocabulary
tfidf_vectorizer = TfidfVectorizer(vocabulary=unique_activities)

# Fit the vectorizer on the cleaned reviews
tfidf_vectorizer.fit(new_places_df['latest_reviews'].apply(lambda x: clean_text(x)))

# Example: Define a mapping for location types (this can be improved with more categories)
location_types = {
    "Lahugala Magul Maha Viharaya": "religious",
    # Add more mappings for other places
}

# Apply the TF-IDF extraction function to the 'latest_reviews' column with filtering based on location type
new_places_df['main_activities'] = new_places_df.apply(
    lambda row: extract_keywords_tfidf(row['latest_reviews'], unique_activities, tfidf_vectorizer, location_types.get(row['name'], 'general')),
    axis=1
)

# Convert lists into comma-separated strings for readability
new_places_df['main_activities'] = new_places_df['main_activities'].apply(lambda x: ' '.join(x))

# Display the updated DataFrame with relevant activities
new_places_df[['name', 'latest_reviews', 'main_activities']].head(10)

Unnamed: 0,name,latest_reviews,main_activities
0,Arugam Bay Beach,arugam bay beach surfers paradise spent incred...,surfing amusement parks animal encounters arch...
1,Mirissa Beach,mirissa beach truly gem sri southern coast sof...,surfing snorkeling amusement parks animal enco...
2,Weligama Beach surf and stay,weligama beach fantastic spot beginner experie...,surfing amusement parks animal encounters arch...
3,Ahangama,ahangama bit disappointing solo traveler surfi...,surfing amusement parks animal encounters arch...
4,Hikkaduwa Beach,hikkaduwa beach delightful escape solo travele...,snorkeling surfing amusement parks animal enco...
5,Tangalle,tangalle bit letdown beaches beautiful felt ov...,surfing amusement parks animal encounters arch...
6,Unawatuna Beach,unawatuna beach slice paradise water crystal c...,snorkeling paddleboarding amusement parks anim...
7,Pigeon Island,pigeon island gem snorkeling highlight trip un...,snorkeling amusement parks animal encounters a...
8,Galle Dutch Fort,galle dutch fort stunning blend history archit...,photography amusement parks animal encounters ...
9,Polonnaruwa Ancient City,polonnaruwa ancient city stunning place steepe...,amusement parks animal encounters archaeologic...


This is what the main activities for a single location looks like.

In [37]:
print(new_places_df.loc[83]["name"])
print(new_places_df.loc[83]["main_activities"])

Batadombalena
waterfalls amusement parks animal encounters archaeological sites architecture photography architecture tours art classes


## Exploratory Data Analysis - Visitors

We'll now load the `visitors.xlsx` to see its contents.

In [12]:
# Load visitors data from datasets

visitors_df = pd.read_excel("./visitors.xlsx")
visitors_df.head(10)

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."
5,6,Michelle Anderson,michelle.anderson@example.com,"['public art installations', 'temple pilgrimag...","['Colombo', 'Sigiriya', 'Mihintale', 'Galle Du..."
6,7,Louis Ramsey,louis.ramsey@example.com,"['fishing', 'golfing', 'historical monuments']","['Hikkaduwa', 'Kalpitiya', 'Polonnaruwa', 'Neg..."
7,8,Dominique Hammond,dominique.hammond@example.com,"['sailing', 'hot air ballooning', 'spiritual r...","['Trincomalee Harbour', 'Kandalama', ""Sri Pada..."
8,9,Tara Reilly,tara.reilly@example.com,"['cultural experiences', 'botanical gardens', ...","['Seethawaka Wet Zone Botanical Gardens', 'Sig..."
9,10,Stacy Anderson MD,stacy.md@example.com,"['boat safaris', 'sailing', 'caving']","['Batatotalena (Batadombalena) Cave', 'Colombo..."


Let's check the properties of this dataframe.

In [13]:
visitors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   User ID                             10000 non-null  int64 
 1   Name                                10000 non-null  object
 2   Email                               10000 non-null  object
 3   Preferred Activities                10000 non-null  object
 4   Bucket list destinations Sri Lanka  10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


No null or missing values are present in any columns.

Let's analyze the data stored in the `Preferred Activities` column to see if we can extract a list of unique activities to work with.

In [14]:
# Convert the pseudolist strings into actual lists
def convert_to_list(pseudo_list):
    try:
        return ast.literal_eval(pseudo_list)
    except ValueError:
        return []

# Apply the conversion function to the 'Preferred Activities' column
visitors_df['Preferred Activities List'] = visitors_df['Preferred Activities'].apply(convert_to_list)

# Flatten the list of activities into a single list
all_activities = [activity for sublist in visitors_df['Preferred Activities List'] for activity in sublist]

# Get the unique activities by converting to a set and back to a list
unique_activities = sorted(list(set(all_activities)))

# Display the unique activities
for activity in unique_activities:
  print(f"{activity}")
print(f"Length: {len(unique_activities)}")

amusement parks
animal encounters
archaeological sites
architecture photography
architecture tours
art classes
arts and culture
ayurvedic spa treatments
beach visits
beachfront dining
bird watching
boat safaris
botanical gardens
butterfly watching
camping
caving
city tours
craft workshops
cultural experiences
cultural festivals
cycling
elephant rides
fishing
golfing
hiking
historic sites
historic walks
historical monuments
history tours
horse shows
horseback riding
hot air ballooning
hot springs
kayaking
landscape photography
literary tours
local crafts
mountain biking
museum visits
outdoor adventures
paddleboarding
photography
planetarium visits
public art installations
river cruises
rock climbing
safaris
sailing
sailing lessons
scuba diving
sea cruises
sightseeing
snorkeling
spiritual retreats
surfing
tea tasting
temple pilgrimages
theater
themed parks
traditional ceremonies
turtle watching
village homestays
water parks
waterfalls
whale watching
wildlife viewing
yoga retreats
zip-lin

## Preprocessing the Visitor's Dataset

First, we'll clean and simplify the text in the rows of the `Preferred Activities` column by converting them into single strings.

In [17]:
def clean_preferred_activities(df):
    # Check if the dataframe contains the 'latest_reviews' column
    if "Preferred Activities" not in df.columns:
        raise ValueError("The dataframe does not have a 'Preferred Activities' column.")

    # Get the set of English stopwords
    stop_words = set(stopwords.words("english"))

    # Define a function to clean and fix each review
    def clean_review(review):
        # Remove square brackets and punctuation
        review = review.translate(str.maketrans("", "", string.punctuation))
        review = review.replace("[", "").replace("]", "")

        # Lowercase the text
        review = review.lower()

        # Split the review into words
        words = review.split()

        # Filter out words with unusual symbols or garbled text
        cleaned_words = [word for word in words if re.match(r"^[a-z]+$", word)]

        # Remove stopwords
        filtered_words = [word for word in cleaned_words if word not in stop_words]

        # Join the cleaned words back into a string
        cleaned_review = " ".join(filtered_words)
        return cleaned_review

    # Apply the clean_review function to the 'latest_reviews' column
    df["Preferred Activities"] = df["Preferred Activities"].apply(clean_review)

    return df


new_visitors_df = clean_preferred_activities(visitors_df)
new_visitors_df.head(10)

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka,Preferred Activities List
0,1,Jennifer Quinn,jennifer.quinn@example.com,cycling historical monuments village homestays,"['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell...","[cycling, historical monuments, village homest..."
1,2,Emily Perry,emily.perry@example.com,butterfly watching hot springs wildlife viewing,"['Madunagala Hot Water Spring', 'Wilpattu Nati...","[butterfly watching, hot springs, wildlife vie..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,sea cruises themed parks craft workshops,"['Mirissa Beach', 'Negombo Lagoon', 'Batadomba...","[sea cruises, themed parks, craft workshops]"
3,4,Angelica Wilson,angelica.wilson@example.com,fishing hot springs sailing,"['Maha Oya Hot Water Springs', 'Colombo Port C...","[fishing, hot springs, sailing]"
4,5,Laurie Powers,laurie.powers@example.com,history tours sailing literary tours,"['Negombo Lagoon', 'Colombo Port City', 'Galle...","[history tours, sailing, literary tours]"
5,6,Michelle Anderson,michelle.anderson@example.com,public art installations temple pilgrimages ar...,"['Colombo', 'Sigiriya', 'Mihintale', 'Galle Du...","[public art installations, temple pilgrimages,..."
6,7,Louis Ramsey,louis.ramsey@example.com,fishing golfing historical monuments,"['Hikkaduwa', 'Kalpitiya', 'Polonnaruwa', 'Neg...","[fishing, golfing, historical monuments]"
7,8,Dominique Hammond,dominique.hammond@example.com,sailing hot air ballooning spiritual retreats,"['Trincomalee Harbour', 'Kandalama', ""Sri Pada...","[sailing, hot air ballooning, spiritual retreats]"
8,9,Tara Reilly,tara.reilly@example.com,cultural experiences botanical gardens history...,"['Seethawaka Wet Zone Botanical Gardens', 'Sig...","[cultural experiences, botanical gardens, hist..."
9,10,Stacy Anderson MD,stacy.md@example.com,boat safaris sailing caving,"['Batatotalena (Batadombalena) Cave', 'Colombo...","[boat safaris, sailing, caving]"


## Model Initialization

Let's use the BERT embedding model and tokenizer.

In [18]:
# Setup tokenizer and embedding model

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

## Embedding Generation

Let's define a function to tokenize and generate embeddings for various texts.

In [19]:
def generate_bert_embeddings(text, tokenizer, model):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='tf', padding=True, truncation=True, max_length=512)

    # Use the Hugging Face TFBertModel to get the embeddings
    outputs = model(inputs)

    # Return the pooled output (embedding for the [CLS] token)
    return outputs.pooler_output.numpy()

Now, we can get the embeddings of the main activities for each location.

In [38]:
# Apply the function to the relevant column in the DataFrame
new_places_df['activity_embeddings'] = new_places_df['main_activities'].apply(lambda x: generate_bert_embeddings(x, tokenizer, model))

new_places_df.head(10)

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,main_activities,activity_embeddings,similarity
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,arugam bay beach surfers paradise spent incred...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516...",1.117848
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,mirissa beach truly gem sri southern coast sof...,surfing snorkeling amusement parks animal enco...,"[[-0.81544983, -0.5721724, -0.914304, 0.686624...",1.089932
2,Weligama Beach surf and stay,5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,weligama beach fantastic spot beginner experie...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516...",1.117848
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",4.459437,1608.639437,ahangama bit disappointing solo traveler surfi...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516...",1.117848
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,hikkaduwa beach delightful escape solo travele...,snorkeling surfing amusement parks animal enco...,"[[-0.82038665, -0.5541179, -0.88742596, 0.6797...",1.09901
5,Tangalle,6.024338,80.794073,"Tangalle, Sri Lanka",4.459437,1608.639437,tangalle bit letdown beaches beautiful felt ov...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516...",1.117848
6,Unawatuna Beach,6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,unawatuna beach slice paradise water crystal c...,snorkeling paddleboarding amusement parks anim...,"[[-0.820215, -0.5479467, -0.9273488, 0.679735,...",1.092951
7,Pigeon Island,8.721837,81.204071,"Pigeon Island, Sri Lanka",4.5,174.0,pigeon island gem snorkeling highlight trip un...,snorkeling amusement parks animal encounters a...,"[[-0.8361708, -0.582354, -0.8358872, 0.6795785...",1.080616
8,Galle Dutch Fort,6.030459,80.215021,"Galle 80000, Sri Lanka",4.6,16934.0,galle dutch fort stunning blend history archit...,photography amusement parks animal encounters ...,"[[-0.8600141, -0.53999096, -0.66660404, 0.7306...",1.103564
9,Polonnaruwa Ancient City,7.945942,81.000329,"Polonnaruwa, Sri Lanka",4.3,878.0,polonnaruwa ancient city stunning place steepe...,amusement parks animal encounters archaeologic...,"[[-0.8953587, -0.66461426, -0.972703, 0.838640...",1.124259


In [39]:
new_places_df = new_places_df.drop("similarity", axis="columns")
new_places_df

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,main_activities,activity_embeddings
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.800000,1591.000000,arugam bay beach surfers paradise spent incred...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516..."
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.600000,1748.000000,mirissa beach truly gem sri southern coast sof...,surfing snorkeling amusement parks animal enco...,"[[-0.81544983, -0.5721724, -0.914304, 0.686624..."
2,Weligama Beach surf and stay,5.972486,80.435714,"Weligama, Sri Lanka",4.400000,325.000000,weligama beach fantastic spot beginner experie...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516..."
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",4.459437,1608.639437,ahangama bit disappointing solo traveler surfi...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516..."
4,Hikkaduwa Beach,6.137727,80.099060,"Hikkaduwa Beach, Sri Lanka",4.700000,1438.000000,hikkaduwa beach delightful escape solo travele...,snorkeling surfing amusement parks animal enco...,"[[-0.82038665, -0.5541179, -0.88742596, 0.6797..."
...,...,...,...,...,...,...,...,...,...
406,Uppuveli Beach,8.607956,81.220013,"Trincomalee, Sri Lanka",4.300000,399.000000,uppuveli beach stunning escape soft sands clea...,snorkeling sailing amusement parks animal enco...,"[[-0.83731663, -0.5463664, -0.9070858, 0.71020..."
407,Koggala Beach,5.992272,80.310691,"Koggala Beach, Sri Lanka",4.300000,353.000000,koggala beach hidden gem soft sand clear water...,snorkeling amusement parks animal encounters a...,"[[-0.8361708, -0.582354, -0.8358872, 0.6795785..."
408,Marakolliya Beach,6.042222,80.823073,"Kapuhenwala Road, Sri Lanka",4.300000,180.000000,marakolliya beach hidden gem waves perfect sur...,surfing amusement parks animal encounters arch...,"[[-0.8109372, -0.52255476, -0.6692454, 0.64516..."
409,Pasikuda Beach,7.929994,81.561185,"Pasikuda Beach, Sri Lanka",4.400000,1142.000000,pasikuda beach hidden gem pristine waters perf...,amusement parks animal encounters archaeologic...,"[[-0.8953587, -0.66461426, -0.972703, 0.838640..."


Let's save the dataframe with embeddings.

In [21]:
# Save dataframe with embeddings to CSV file
new_places_df.to_csv('places_with_embeddings.csv', index=False)

## Recommendation Retrieval

First, we define a function to get cosine similarity.

In [22]:
def compute_cosine_similarity(embedding, embeddings):
    """
    Compute cosine similarity between a single embedding and all other embeddings.

    Args:
    - embedding (np.ndarray): The embedding vector for the input set of tourist interests.
    - embeddings (list of np.ndarray): List of all location embedding vectors in the dataset.

    Returns:
    - similarities (np.ndarray): Array of cosine similarities.
    """
    similarities = cosine_similarity(embedding.reshape(1, -1), np.vstack(embeddings)).flatten()
    return similarities

Next, we can define a function to retrieve the most relevant places for a tourist's interests.

In [41]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend_places(activities, df, tokenizer, model, top_n=5):
    """
    Recommend top N locations based on the embeddings of user preferred activities.

    Args:
    - activities (str): A string representing tourist's preferred activities.
    - df (pd.DataFrame): DataFrame containing place data and embeddings.
    - tokenizer (BertTokenizer): Hugging Face tokenizer.
    - model (BertModel): Hugging Face BERT model.
    - top_n (int): Number of similar places to recommend.

    Returns:
    - recommendations (pd.DataFrame): DataFrame of recommended places.
    """
    # Preprocess and get the embedding for the user's preferred activities
    cleaned_activities = re.sub('[^A-Za-z0-9]+', ' ', activities.lower())
    activity_embedding = generate_bert_embeddings(cleaned_activities, tokenizer, model)

    # Compute similarities between the input embedding and all place embeddings
    similarities = compute_cosine_similarity(activity_embedding, df['activity_embeddings'].tolist())

    # Add the similarity scores to the DataFrame
    df['similarity'] = similarities

    # Sort the DataFrame based on similarity scores in descending order
    df_sorted = df.sort_values(by='similarity', ascending=False)

    # Return the top N recommendations (excluding places with zero similarity)
    recommendations = df_sorted[df_sorted['similarity'] > 0].head(top_n)

    return recommendations[['name', 'formatted_address', 'rating', 'similarity']]


recommend_places("hiking safaris waterfalls", new_places_df, tokenizer, model)

Unnamed: 0,name,formatted_address,rating,similarity
246,Sandaraja Wana Arana x008f,"Aranayake, Sri Lanka",4.3,0.991033
205,Heyna camping site,"Weligatta, Sri Lanka",4.459437,0.991033
68,Meemure,"Meemure, Sri Lanka",4.459437,0.991033
111,Bambarakanda Falls,"Kalupahana - Ohiya Rd, Sri Lanka",4.5,0.991033
290,Blue moon camping,"Kandakuliya, Sri Lanka",4.5,0.991033


Since that did not yield appropriate or satisfactory results, let's try a different approach where we add "weight" to each interest.

In [50]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend_places(activities, df, tokenizer, model, top_n=5, activity_weighting=None):
    total_similarities = np.zeros(len(df))

    # Loop through each activity, process it separately, and compute similarity
    for activity in activities:
        # Preprocess and get the embedding for each individual activity
        cleaned_activity = re.sub('[^A-Za-z0-9]+', ' ', activity.lower())
        activity_embedding = generate_bert_embeddings(cleaned_activity, tokenizer, model)

        # Compute cosine similarities between the activity embedding and all place embeddings
        similarities = compute_cosine_similarity(activity_embedding, df['activity_embeddings'].tolist())

        # Apply activity weighting if provided (more important activities get higher influence)
        weight = activity_weighting.get(activity, 1) if activity_weighting else 1
        total_similarities += similarities * weight

    # Add the similarity scores to the DataFrame
    df['similarity'] = total_similarities / len(activities)  # Normalize by number of activities

    # Sort the DataFrame based on similarity scores in descending order
    df_sorted = df.sort_values(by='similarity', ascending=False)

    # Return the top N recommendations (excluding places with zero similarity)
    recommendations = df_sorted[df_sorted['similarity'] > 0].head(top_n)

    return recommendations[['name', 'formatted_address', 'rating', 'similarity']]


# Example usage:
activities = ["snorkelling", "safaris", "waterfalls"]

# Dynamically create the activity weighting dictionary
# Default weight is 1.0; you can adjust this as needed
default_weight = 1.0
activity_weighting = {activity: default_weight for activity in activities}

# Optionally adjust specific weights
# For example, if you want to give "snorkelling" a higher weight
activity_weighting[activities[0]] = 1.5

# Call the recommend_places function with the dynamically created activity weighting
recommend_places(activities, new_places_df, tokenizer, model, activity_weighting=activity_weighting)

Unnamed: 0,name,formatted_address,rating,similarity
61,Trincomalee,"Trincomalee, Sri Lanka",4.459437,1.125662
60,Negombo,"Negombo, Sri Lanka",4.459437,1.125301
74,Kalpitiya Lagoon,"Kalpitiya Lagoon, Sri Lanka",4.8,1.124877
70,Makandawa Conservation Center Makandawa Forest,"Kitulgala, Sri Lanka",4.9,1.123883
253,Menik Ganga - Dunhinda Falls,"Buttala Rd, Sri Lanka",2.7,1.123883


The above approach has yielded significantly better results.

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Stack the embeddings for KNN
place_embeddings = np.vstack(new_places_df['activity_embeddings'].values)
visitor_embeddings = np.vstack(visitors_df['activity_embeddings'].values)

# Initialize KNN
knn = NearestNeighbors(n_neighbors=5, metric='cosine')

# Fit the KNN model with place embeddings
knn.fit(place_embeddings)

def recommend_places_for_tourist(visitor_embedding, knn, places_df):
    """
    Recommend places for a tourist based on their embedding.

    Args:
    - visitor_embedding: The embedding vector for a tourist.
    - knn: The trained KNN model.
    - places_df: The DataFrame with place information and embeddings.

    Returns:
    - DataFrame of top recommended places.
    """
    distances, indices = knn.kneighbors(visitor_embedding.reshape(1, -1))
    recommendations = places_df.iloc[indices[0]].copy()
    recommendations['distance'] = distances[0]
    return recommendations[['name', 'formatted_address', 'rating', 'distance']]

# Example usage for recommending places to a specific tourist
tourist_idx = 0  # Choose a tourist
tourist_embedding = visitor_embeddings[tourist_idx]

recommended_places = recommend_places_for_tourist(tourist_embedding, knn, places_df)
print(recommended_places)