# Sapphire Recommender

## Project Initialization

In [21]:
# Imports

import pandas as pd
import numpy as np
import string
import pandas.api.types as ptypes
import ast
import re
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer, TFBertModel

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [17]:
# Fetch the NLTK stopword corpus only when it is not already installed.
# An unconditional nltk.download() contacts the NLTK index on every run,
# which makes an offline "Restart & Run All" fail even though the corpus is
# already on disk.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/ravindu-
[nltk_data]     aratchige/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Exploratory Data Analysis - Places

In [2]:
# Read the places spreadsheet into a DataFrame and preview its first ten rows

places_df = pd.read_excel(io="./data/places.xlsx")
places_df.head(n=10)

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...
5,Tangalle,6.024338,80.794073,"Tangalle, Sri Lanka",,,['Tangalle was a bit of a letdown for me. The ...
6,Unawatuna Beach,6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,['Unawatuna Beach is a slice of paradise! The ...
7,Pigeon Island,8.721837,81.204071,"Pigeon Island, Sri Lanka",4.5,174.0,['Pigeon Island is a gem! Snorkeling here was ...
8,Galle Dutch Fort,6.030459,80.215021,"Galle 80000, Sri Lanka",4.6,16934.0,"[""Galle Dutch Fort is a stunning blend of hist..."
9,Polonnaruwa Ancient City,7.945942,81.000329,"Polonnaruwa, Sri Lanka",4.3,878.0,['Polonnaruwa Ancient City is a stunning place...


In [4]:
# Show the raw review text stored for the row labelled 3
places_df.loc[3, "latest_reviews"]

"['Ahangama was a bit disappointing for me as a solo traveler. The surfing conditions were not as great as I expected, with inconsistent waves. The beach was nice, but it felt overcrowded at times. I was hoping for a more laid-back atmosphere. ItÃ¢Â€Â™s decent for a quick visit, but I wouldnÃ¢Â€Â™t recommend staying long.', 'As a couple, we found Ahangama quite charming, but it had its downsides. While the beach itself was beautiful, the facilities were lacking. Finding clean showers and restrooms was a challenge. We enjoyed a couple of surf lessons, but the instructors seemed rushed and not very attentive. Overall, it was an okay experience.', 'Our family trip to Ahangama was mixed. The kids loved the beach and the idea of learning to surf, but the surf school was disorganized. We spent a lot of time waiting around. The beach was pretty, but there was quite a bit of trash around. It could be a great spot with a bit more care and management.', 'I visited Ahangama with friends, and whil

## Preprocessing Latest Reviews

1. Remove square brackets and punctuation
2. Lowercase the text
3. Remove garbled words (any token that is not purely a–z)
4. Remove stopwords

In [19]:
def clean_latest_reviews(df: pd.DataFrame, stop_words=None) -> pd.DataFrame:
    """Return a copy of ``df`` whose 'latest_reviews' text has been normalized.

    Every review string is processed token by token (tokens are split on
    whitespace):

    1. ASCII punctuation is removed (this also strips the square brackets and
       quotes left over from the stringified list).
    2. The text is lowercased.
    3. Tokens that are not made up purely of ``a-z`` are dropped whole; this
       discards digits, accented words and mis-encoded (garbled) tokens.
    4. Stopwords are removed.

    Args:
        df: DataFrame that contains a 'latest_reviews' column.
        stop_words: Optional iterable of stopwords to remove. When omitted,
            NLTK's English stopword list is used (the 'stopwords' corpus must
            already be downloaded).

    Returns:
        A new DataFrame with the cleaned 'latest_reviews' column. The input
        frame is not modified. Cells that do not hold a string (e.g. NaN)
        become the empty string.

    Raises:
        ValueError: If ``df`` has no 'latest_reviews' column.
    """
    # Validate first so a wrong frame fails before any stopword lookup happens
    if "latest_reviews" not in df.columns:
        raise ValueError("The dataframe does not have a 'latest_reviews' column.")

    if stop_words is None:
        stop_words = stopwords.words("english")
    # Tokens are lowercased before comparison, so the stopwords must be too
    stop_set = {word.lower() for word in stop_words}

    # Built once here instead of once per review
    strip_punctuation = str.maketrans("", "", string.punctuation)
    alpha_only = re.compile(r"[a-z]+")

    def clean_review(review):
        # Missing values (NaN) and other non-string cells have no text to clean
        if not isinstance(review, str):
            return ""

        kept = []
        for raw_token in review.lower().split():
            # Trim surrounding punctuation only, so contractions such as
            # "wouldn't" still match their stopword entry. Deleting the
            # apostrophe first would turn it into "wouldnt", which is not a
            # stopword and would slip through.
            token = raw_token.strip(string.punctuation)
            if token in stop_set:
                continue

            # Drop any punctuation left inside the token (e.g. "laid-back")
            token = token.translate(strip_punctuation)

            # Keep purely alphabetic, non-stopword tokens; anything else is
            # treated as garbled text.
            if alpha_only.fullmatch(token) and token not in stop_set:
                kept.append(token)

        return " ".join(kept)

    # Work on a copy so the caller's raw DataFrame stays available and
    # re-running the notebook cell always starts from the same input.
    cleaned_df = df.copy()
    cleaned_df["latest_reviews"] = cleaned_df["latest_reviews"].apply(clean_review)
    return cleaned_df


# Run the review-cleaning step over the places table and display the result
new_places_df = clean_latest_reviews(df=places_df)
new_places_df

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,arugam bay beach surfers paradise spent incred...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,mirissa beach truly gem sri southern coast sof...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,weligama beach fantastic spot beginner experie...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,ahangama bit disappointing solo traveler surfi...
4,Hikkaduwa Beach,6.137727,80.099060,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,hikkaduwa beach delightful escape solo travele...
...,...,...,...,...,...,...,...
406,Uppuveli Beach,8.607956,81.220013,"Trincomalee, Sri Lanka",4.3,399.0,uppuveli beach stunning escape soft sands clea...
407,Koggala Beach,5.992272,80.310691,"Koggala Beach, Sri Lanka",4.3,353.0,koggala beach hidden gem soft sand clear water...
408,Marakolliya Beach,6.042222,80.823073,"Kapuhenwala Road, Sri Lanka",4.3,180.0,marakolliya beach hidden gem waves perfect sur...
409,Pasikuda Beach,7.929994,81.561185,"Pasikuda Beach, Sri Lanka",4.4,1142.0,pasikuda beach hidden gem pristine waters perf...


In [20]:
# Show the cleaned review text stored for the row labelled 0
new_places_df.loc[0, "latest_reviews"]

'arugam bay beach surfers paradise spent incredible days riding waves local surf schools fantastic beginners like atmosphere laidback friendly locals fellow travelers long day surfing sunsets simply magical beach bit crowded especially peak season adds lively vibe wait return friends unforgettable time arugam bay beach surfing conditions excellent managed catch great waves beach beautiful soft sand clear waters perfect swimming however noticed litter beach bit disappointing overall vibrant nightlife delicious food made definitely worth visit couple looking relaxation arugam bay beach offered perfect blend tranquility excitement enjoyed lazy days lounging beach indulging fresh seafood beachside restaurants surf scene lively easy find quieter spots unwind downside occasional noise nearby parties detract much experience lovely getaway visited arugam bay beach family children loved surf lessons found beach bit overcrowded atmosphere vibrant locals warm welcoming spent time exploring nearby

## Model Initialization

In [None]:
# Set up the tokenizer and embedding model

