# Import libraries and data

In [41]:
import numpy as np 
import pandas as pd 
import re
from fuzzywuzzy import fuzz # for string matching
from fuzzywuzzy import process # for string matching
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # Import tqdm for progress logging
import logging
from textblob import TextBlob
import google.generativeai as genai
from collections import defaultdict
import time

In [43]:
places_df = pd.read_csv('Places Dataset.xlsx - places_final_dataset.csv')
users_df = pd.read_csv('Visitors Preference Dataset.xlsx - user_data_version_3_10K_Users.csv')

# Exploratory data analysis

In [44]:
places_df.head()

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...


In [45]:
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."


In [46]:
#Function to analyze data
def customDescription(df: pd.DataFrame, numeric_only: bool = False):
    if numeric_only:
        df = df.select_dtypes(include=np.number)
    
    desc = pd.DataFrame(index=df.columns.to_list())
    desc['type'] = df.dtypes
    desc['count'] = df.count()
    desc['nunique'] = df.nunique()
    desc['null'] = df.isnull().sum()
    
    # Handle numeric columns separately
    if not df.select_dtypes(include=np.number).empty:
        numeric_desc = df.describe().T.drop(columns=['count','std','25%','50%','75%'], axis=1)
        for col in df.select_dtypes(include=np.number).columns:
            desc.loc[col, 'mean'] = numeric_desc.loc[col, 'mean']
            desc.loc[col, 'min'] = numeric_desc.loc[col, 'min']
            desc.loc[col, 'max'] = numeric_desc.loc[col, 'max']
    
    # Handle non-numeric columns separately
    if not df.select_dtypes(exclude=np.number).empty:
        non_numeric_desc = df.select_dtypes(exclude=np.number).describe().T
        desc = pd.concat([desc, non_numeric_desc.drop(columns=['count', 'unique', 'top', 'freq'], axis=1, errors='ignore')], axis=1)
    
    return desc

In [47]:
customDescription(users_df)

Unnamed: 0,type,count,nunique,null,mean,min,max
User ID,int64,10000,10000,0,5000.5,1.0,10000.0
Name,object,10000,9419,0,,,
Email,object,10000,9369,0,,,
Preferred Activities,object,10000,9830,0,,,
Bucket list destinations Sri Lanka,object,10000,9995,0,,,


In [48]:
customDescription(places_df)

Unnamed: 0,type,count,nunique,null,mean,min,max
name,object,411,398,0,,,
lat,float64,410,402,1,7.304668,5.941381,9.820859
lng,float64,410,398,1,80.583211,79.694183,81.859583
formatted_address,object,411,220,0,,,
rating,float64,355,24,56,4.459437,0.9,5.0
user_ratings_total,float64,355,316,56,1608.639437,27.0,26736.0
latest_reviews,object,411,411,0,,,


In [49]:
print("Shape of places table:",places_df.shape)
print("Shape of users table:",users_df.shape)

Shape of places table: (411, 7)
Shape of users table: (10000, 5)


##### Notes:

* places dataset includes 1 null value in lat & lng and 56 null values in rating & user_ratings_total.
* Seems like there are some duplicated data also.
* Need to check whether both prefferd activities and bucket list are taken from a dropdown list or simply typed by each user.
* If they are selected from a drop down, totally fine.
* Otherwise we need to identify and merge identical activities across different forms to streamline data.

In [50]:
# Find duplicated names
duplicated_names = places_df[places_df.duplicated(subset='name', keep=False)]
duplicated_names_sorted = duplicated_names.sort_values('name')

# Display the results
print(f"Number of entries with duplicated names: {len(duplicated_names)}")
print("\nEntries with duplicated names:")
duplicated_names_sorted

Number of entries with duplicated names: 26

Entries with duplicated names:


Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
104,Ambuluwawa Biodiversity Complex,7.161627,80.547197,"Gampola , Sri Lanka",4.5,3248.0,['Ambuluwawa Biodiversity Complex is a hidden ...
398,Ambuluwawa Biodiversity Complex,7.161629,80.547189,"Gampola, Sri Lanka",4.7,3254.0,['The Ambuluwawa Biodiversity Complex is a hid...
19,Bundala National Park,6.199086,81.210493,"bundala , Sri Lanka",4.3,654.0,['Bundala National Park is a hidden gem for na...
405,Bundala National Park,6.193822,81.187854,"Weligatta, Sri Lanka",4.3,561.0,['Bundala National Park is a hidden gem! I arr...
52,Coral Sanctuary Boat Ticket Issue Center,6.137342,80.099253,ÃƒÂ Ã‚Â·Ã‚Â„ÃƒÂ Ã‚Â·Ã‚Â’ÃƒÂ Ã‚Â¶Ã‚ÂšÃƒÂ Ã‚Â·Ã‚...,4.1,209.0,['I had a fantastic time at the Coral Sanctuar...
391,Coral Sanctuary Boat Ticket Issue Center,6.137358,80.099252,"Hikkaduwa, Sri Lanka",4.4,131.0,['The Coral Sanctuary Boat Ticket Issue Center...
22,Dambulla Royal Cave Temple and Golden Temple,7.854914,80.65057,"Dambulla, Sri Lanka",4.2,11598.0,['The Dambulla Royal Cave Temple is a breathta...
395,Dambulla Royal Cave Temple and Golden Temple,7.854914,80.65057,"Dambulla, Sri Lanka",4.3,11690.0,['Dambulla Royal Cave Temple is a must-visit f...
127,Elephant Transit Home,6.426689,80.816536,"Udawalawa, Sri Lanka",4.6,1849.0,['Visiting the Elephant Transit Home was a hea...
400,Elephant Transit Home,6.426689,80.816536,"Udawalawa, Sri Lanka",4.6,1784.0,['The Elephant Transit Home is a heartwarming ...


##### Notes:
* It appears that the places dataset have some non english characters. We need to clean the data in the future steps. Then we can look into a solution for these duplicates. Let's look at it in the later parts.
* It seems like we need to do some modifications on the `formatted_address` also.

In [51]:
# Find duplicated emails
duplicated_emails = users_df[users_df.duplicated(subset='Email', keep=False)]

# Sort by email to group duplicates together
duplicated_emails_sorted = duplicated_emails.sort_values('Email')

# Display the results
print(f"Number of entries with duplicated emails: {len(duplicated_emails)}")
print("\nEntries with duplicated emails:")
duplicated_emails_sorted

Number of entries with duplicated emails: 1132

Entries with duplicated emails:


Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
1009,1010,Aaron Phillips,aaron.phillips@example.com,"['butterfly watching', 'sailing', 'elephant ri...","['Colombo Port City', 'Knuckles', 'Udawalawe',..."
3779,3780,Aaron Phillips,aaron.phillips@example.com,"['local crafts', 'hot air ballooning', 'hiking']","[""Sri Pada / Adam's Peak"", 'Ambalangoda', 'Kan..."
2876,2877,Aaron Walker,aaron.walker@example.com,"['boat safaris', 'yoga retreats', 'museum visi...","['Colombo National Museum', 'Ratnapura Gem Mus..."
7371,7372,Aaron Walker,aaron.walker@example.com,"['scuba diving', 'yoga retreats', 'photography']","['Kalpitiya', 'Yala National Park', 'Sinharaja..."
1845,1846,Alicia Jones,alicia.jones@example.com,"['historic sites', 'animal encounters', 'arts ...","['Jaffna Public Library', 'Mihintale', 'Gangar..."
...,...,...,...,...,...
6529,6530,William Wilson,william.wilson@example.com,"['sailing lessons', 'sightseeing', 'public art...","['Pidurangala Rock', 'Negombo', 'Horton Plains..."
4365,4366,Zachary Jones,zachary.jones@example.com,"['river cruises', 'literary tours', 'public ar...","['Madu River', 'Kandy', 'Bentota River', 'Mart..."
5122,5123,Zachary Jones,zachary.jones@example.com,"['beachfront dining', 'landscape photography',...","['Ella Gap', 'Galle', 'Sigiriya', 'Ahangama', ..."
3641,3642,Zachary Martin,zachary.martin@example.com,"['caving', 'kayaking', 'temple pilgrimages']","['Belilena Caves', 'Jaya Sri Maha Bodhi', ""Sri..."


##### Notes:
* It appears that the same person has entered their preferences multiple times. We'll retain these entries,assuming they represent different time frames when the individual visits the country on separate occasions.


In [52]:
unique_activities = users_df['Preferred Activities'].unique()
activities = set()

for activity in unique_activities:
    activity_set = activity.strip("[]'").replace("'", "").split(", ")
    activities.update(activity_set)

sorted_activities = sorted(activities)

print(f"Number of unique activities: {len(sorted_activities)}")
print("Unique activities in alphabetical order:")
for activity in sorted_activities:
    print(activity)

Number of unique activities: 68
Unique activities in alphabetical order:
amusement parks
animal encounters
archaeological sites
architecture photography
architecture tours
art classes
arts and culture
ayurvedic spa treatments
beach visits
beachfront dining
bird watching
boat safaris
botanical gardens
butterfly watching
camping
caving
city tours
craft workshops
cultural experiences
cultural festivals
cycling
elephant rides
fishing
golfing
hiking
historic sites
historic walks
historical monuments
history tours
horse shows
horseback riding
hot air ballooning
hot springs
kayaking
landscape photography
literary tours
local crafts
mountain biking
museum visits
outdoor adventures
paddleboarding
photography
planetarium visits
public art installations
river cruises
rock climbing
safaris
sailing
sailing lessons
scuba diving
sea cruises
sightseeing
snorkeling
spiritual retreats
surfing
tea tasting
temple pilgrimages
theater
themed parks
traditional ceremonies
turtle watching
village homestays
wat

##### Notes:
* This analysis shows that the activities are from a dropdown selection or a selection of activities that the user can choose from. All good!


In [53]:
unique_bucket_places = users_df['Bucket list destinations Sri Lanka'].unique()
bucket_places = set()

for bucket_place in unique_bucket_places:
    bucket_place_set = bucket_place.strip("[]'").replace("'", "").split(", ")
    bucket_places.update(bucket_place_set)

# Remove unwanted signs and whitespace at the beginning and end of place names
cleaned_bucket_places = {place.strip('"').strip() for place in bucket_places}

sorted_bucket_places = sorted(cleaned_bucket_places)

print(f"Number of unique bucket list destinations: {len(sorted_bucket_places)}")
print("Unique bucket list destinations in alphabetical order:")
for bucket_place in sorted_bucket_places:
    print(bucket_place)

Number of unique bucket list destinations: 159
Unique bucket list destinations in alphabetical order:
Ahangama
Ahungalla
Ambalangoda
Ambalangoda Mask Workshop
Ambuluwawa Tower
Anawilundawa Wetlands
Anuradapura
Anuradhapura
Arankelle Forest Monastery
Arugam Bay Beach
Bakers Falls
Bambarakanda Falls
Bambarakiri Ella
Batadombalena Craft Centre
Batatotalena (Batadombalena) Cave
Belihuloya
Belilena Caves
Bentota
Bentota Beach
Bentota River
Bolgoda Lake
Bomburu Ella Waterfall
Bopath Falls
Bundala National Park
Colombo
Colombo City Tour
Colombo National Museum
Colombo Port City
Dambulla
Dambulla Royal Cave Temple and Golden Temple
Devon Falls
Diyaluma Falls
Dry Zone Botanic Gardens
Dunhinda Waterfall
Dutch Museum
Elephant Transit Home
Ella
Ella Gap
Ella Rock
Excel World
Folk Museum
Galle
Galle City Tour
Galle Dutch Fort
Galle Fort
Galle Lighthouse
Gangaramaya Temple
Hakgala Botanical Garden
Hambantota
Haputale
Hatton
Hikkaduwa
Hikkaduwa Beach
Hikkaduwa Coral Sanctuary
Hiriketiya
Hiriketiya Be

##### Notes:
* This analysis shows that the bucket list places are manually entered by the user.
* We need to identify and merge identical places across different forms to streamline data.

In [54]:
def find_similar_places(places, threshold=90):
    similar_places = {}
    for place in places:
        matches = process.extract(place, places, limit=len(places), scorer=fuzz.token_sort_ratio)
        similar = [match for match in matches if match[1] >= threshold and match[0] != place]
        if similar:
            similar_places[place] = similar
    return similar_places

# Use the bucket_places set directly
unique_places = list(bucket_places)  

# Find similar places with similarity > 90%
similar_places = find_similar_places(unique_places, threshold=90)

print("Places with similarity")
for place, matches in similar_places.items():
    print(f"\nSimilar to '{place}':")
    for match in matches:
        print(f"  - '{match[0]}' (similarity: {match[1]}%)")

# Print total count of similar place groups
print(f"\nTotal number of place groups with high similarity: {len(similar_places)}")

Places with similarity

Similar to 'Pearl Bay':
  - 'Perl Bay' (similarity: 94%)

Similar to 'Perl Bay':
  - 'Pearl Bay' (similarity: 94%)

Similar to 'Jungle Beach':
  - 'Jungle beach' (similarity: 100%)

Similar to 'Anuradhapura':
  - 'Anuradapura' (similarity: 96%)

Similar to 'Kitulgala':
  - 'Kithulgala' (similarity: 95%)

Similar to 'Jungle beach':
  - 'Jungle Beach' (similarity: 100%)

Similar to 'Kithulgala':
  - 'Kitulgala' (similarity: 95%)

Similar to 'Polonnaruwa':
  - 'Polonaruwa' (similarity: 95%)

Similar to 'Anuradapura':
  - 'Anuradhapura' (similarity: 96%)

Similar to 'Polonaruwa':
  - 'Polonnaruwa' (similarity: 95%)

Total number of place groups with high similarity: 10


##### Notes:
* We'll try to solve this problem in the later parts.

# Data Preprocessing

#### Let's remove non-English characters in places dataset

In [55]:
import re

def clean_text(text):
    # Remove all characters except alphabets, numbers, and punctuation
    #cleaned = re.sub(r'[^\w\s.,!?;:()-]', '', str(text))
    cleaned = re.sub(r'[^A-Za-z\s.,]', '', str(text))
    # Normalize spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

# Apply the cleaning function to the 'name', 'latest_reviews' and 'formatted_address' columns in the places dataframe
places_df['name'] = places_df['name'].apply(clean_text)
places_df['latest_reviews'] = places_df['latest_reviews'].apply(clean_text)
places_df['formatted_address'] = places_df['formatted_address'].apply(clean_text)

#### Let's manipulate the duplicates in places dataset

In [56]:
# Find duplicated names
duplicated_names = places_df[places_df.duplicated(subset='name', keep=False)]
duplicated_names_sorted = duplicated_names.sort_values('name')

# Display the results
print(f"Number of entries with duplicated names: {len(duplicated_names)}")

# Group duplicates and aggregate
def aggregate_duplicates(group):
    first_record = group.iloc[0]
    second_record = group.iloc[1] if len(group) > 1 else None
    
    # Take lat, lng, formatted_address from the first record, if present, otherwise from the second
    lat = first_record['lat'] if pd.notna(first_record['lat']) else (second_record['lat'] if second_record is not None else None)
    lng = first_record['lng'] if pd.notna(first_record['lng']) else (second_record['lng'] if second_record is not None else None)
    formatted_address = first_record['formatted_address'] if pd.notna(first_record['formatted_address']) else (second_record['formatted_address'] if second_record is not None else None)
    
    # Calculate weighted average rating
    total_ratings = group['user_ratings_total'].sum()
    if total_ratings > 0:
        weighted_rating = (group['rating'] * group['user_ratings_total']).sum() / total_ratings
    else:
        weighted_rating = group['rating'].mean()
    
    # Combine latest_reviews
    combined_reviews = ' '.join(group['latest_reviews'].dropna())
    
    return pd.Series({
        'lat': lat,
        'lng': lng,
        'formatted_address': formatted_address,
        'rating': weighted_rating,
        'user_ratings_total': total_ratings,
        'latest_reviews': combined_reviews
    })

# Apply the aggregation
places_deduplicated = places_df.groupby('name').apply(aggregate_duplicates).reset_index()

# Print the results
print(f"Number of entries before deduplication: {len(places_df)}")
print(f"Number of entries after deduplication: {len(places_deduplicated)}")

# Update the original places dataframe
places_df = places_deduplicated

Number of entries with duplicated names: 28
Number of entries before deduplication: 411
Number of entries after deduplication: 397


  places_deduplicated = places_df.groupby('name').apply(aggregate_duplicates).reset_index()


In [57]:
# Removing 'Anuradhapura' and 'Colombo' because we have 'Anuradhapura New Town' and 'Colombo City Tour'
places_df = places_df[~places_df['name'].isin(['Anuradhapura', 'Colombo'])]
places_df = places_df.reset_index(drop=True)


#### Let's fix the `formatted_address` section in places dataset

In [58]:
# Remove 'Sri Lanka' from the formatted_address column
places_df['formatted_address'] = places_df['formatted_address'].str.replace(', Sri Lanka', '', regex=False)

In [59]:
places_df[places_df['formatted_address'] == ""]

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
56,Coral Sanctuary Boat Ticket Issue Center,6.137342,80.099253,,4.215588,340.0,I had a fantastic time at the Coral Sanctuary ...
93,Fort Entrance Old Gate,6.028114,80.218447,,4.8,231.0,"The Fort Entrance, or Old Gate, is a captivati..."
171,Kudawa Beach Kalpitiya,8.227305,79.728156,,4.3,329.0,Kudawa Beach Kalpitiya is a hidden gem The sof...
203,Mayan Water Park,7.068894,79.902236,,3.7,562.0,Mayan Water Park was a decent outing for my fa...


In [60]:
places_df[places_df['formatted_address'] == "Sri"]

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
150,Kanneliya National Rain Forest Reserve,6.257642,80.359621,Sri,4.754603,315.0,Kanneliya National Rain Forest Reserve is a hi...


In [61]:
places_df[places_df['formatted_address'] == "Sri Lanka"]

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
5,Alankuda Casuarina Beach,8.048135,79.709916,Sri Lanka,4.3,99.0,Alankuda Casuarina Beach is a hidden gem The p...
26,Bambarakiri Ella,7.495246,80.699001,Sri Lanka,4.4,2004.0,"Bambarakiri Ella is a beautiful, serene spot t..."
29,Bathalagoda Tank,7.531868,80.450986,Sri Lanka,4.7,60.0,Bathalagoda Tank is such a hidden gem I visite...
82,Dutch Bay Beach,8.573372,81.239508,Sri Lanka,4.3,396.0,Dutch Bay Beach is an absolute gem The cleanli...
121,Horton Plains National Park,6.809446,80.802333,Sri Lanka,4.7,8564.0,Horton Plains National Park was an incredible ...
153,Kattukkarai Kulam,8.862747,80.015727,Sri Lanka,4.2,121.0,Kattukkarai Kulam is a hidden gem in the North...
167,Kokkilai Beach,9.064594,80.918706,Sri Lanka,3.9,140.0,Kokkilai Beach is a hidden gem The tranquil at...
168,Korathota Royal Temple,6.915038,80.002135,Sri Lanka,4.8,1226.0,Korathota Royal Temple is a hidden gem The cli...
177,Leisure World,,,Sri Lanka,,0.0,"Leisure World has potential, but my experience..."
187,Madunagala Hot Water Spring,6.25363,80.981711,Sri Lanka,4.54925,2268.0,Madunagala Hot Water Spring is a hidden gem Th...


In [62]:
places_df.loc[places_df['name'] == 'Coral Sanctuary Boat Ticket Issue Center', 'formatted_address'] = 'Hikkaduwa'
places_df.loc[places_df['name'] == 'Fort Entrance Old Gate', 'formatted_address'] = 'Galle'
places_df.loc[places_df['name'] == 'Kudawa Beach Kalpitiya', 'formatted_address'] = 'Kalpitiya'
places_df.loc[places_df['name'] == 'Mayan Water Park', 'formatted_address'] = 'Negombo'
places_df.loc[places_df['name'] == 'Kanneliya National Rain Forest Reserve', 'formatted_address'] = 'Kanneliya'
places_df.loc[places_df['name'] == 'Alankuda Casuarina Beach', 'formatted_address'] = 'Kalpitiya'
places_df.loc[places_df['name'] == 'Bambarakiri Ella', 'formatted_address'] = 'Kandy'
places_df.loc[places_df['name'] == 'Bathalagoda Tank', 'formatted_address'] = 'Kurunegala'
places_df.loc[places_df['name'] == 'Dutch Bay Beach', 'formatted_address'] = 'Trincomalee'
places_df.loc[places_df['name'] == 'Horton Plains National Park', 'formatted_address'] = 'Nuwara Eliya'
places_df.loc[places_df['name'] == 'Kattukkarai Kulam', 'formatted_address'] = 'Vavuniya'
places_df.loc[places_df['name'] == 'Kokkilai Beach', 'formatted_address'] = 'Kokkilai'
places_df.loc[places_df['name'] == 'Korathota Royal Temple', 'formatted_address'] = 'Korathota'
places_df.loc[places_df['name'] == 'Leisure World', 'formatted_address'] = 'Hanwella'
places_df.loc[places_df['name'] == 'Madunagala Hot Water Spring', 'formatted_address'] = 'Hambantota'
places_df.loc[places_df['name'] == 'Manelwatta Temple', 'formatted_address'] = 'Gampaha'
places_df.loc[places_df['name'] == 'Midigama Right', 'formatted_address'] = 'Midigama'
places_df.loc[places_df['name'] == 'Minneriya National Park', 'formatted_address'] = 'Habarana'
places_df.loc[places_df['name'] == 'Rajanawa Waterfall', 'formatted_address'] = 'Ratnapura'
places_df.loc[places_df['name'] == 'Sinharaja Rain Forest', 'formatted_address'] = 'Sinharaja Forest Reserve'
places_df.loc[places_df['name'] == 'Thalawila Beach', 'formatted_address'] = 'Kalpitiya'
places_df.loc[places_df['name'] == 'Udawalawe National Park', 'formatted_address'] = 'Udawalawe'
places_df.loc[places_df['name'] == 'Vankalai Bird Sanctuary', 'formatted_address'] = 'Mannar'
places_df.loc[places_df['name'] == 'Wilpattu National Park', 'formatted_address'] = 'Puttalam'	
places_df.loc[places_df['name'] == 'Yala National Park', 'formatted_address'] = 'Hambantota'
places_df.loc[places_df['name'] == 'Aanda Ella Fall', 'formatted_address'] = 'Ratnapura'	

#### Handeling missing values

In [63]:
# Check for null values in lat or lng columns
null_lat_lng = places_df[places_df['lat'].isnull() | places_df['lng'].isnull()]
null_lat_lng

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
177,Leisure World,,,Hanwella,,0.0,"Leisure World has potential, but my experience..."


In [64]:
# Manually add lat and lng
places_df.loc[places_df['name'] == 'Leisure World', 'lat'] = 6.9167
places_df.loc[places_df['name'] == 'Leisure World', 'lng'] = 80.0667

In [65]:
# Check for null values in the rating column
null_ratings = places_df[places_df['rating'].isnull()]
num_null_ratings = null_ratings.shape[0]
print(f"\nNumber of places with null ratings: {num_null_ratings}")
null_ratings


Number of places with null ratings: 53


Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
2,Ahangama,5.973975,80.362159,Ahangama,,0.0,Ahangama was a bit disappointing for me as a s...
3,Ahungalla,6.313278,80.040918,Ahungalla,,0.0,"Ahungalla seemed promising, but I found it a b..."
9,Ambalangoda,6.244152,80.05908,Ambalangoda,,0.0,"Ambalangoda had potential, but it fell short o..."
12,Anawilundawa Wetland,7.709595,79.82127,Anawilundawa Wetland,,0.0,I visited Anawilundawa Wetland expecting a vib...
34,Belihuloya,6.718399,80.774091,Belihuloya,,0.0,Belihuloya was a bit disappointing for me. The...
62,Dambulla,7.874217,80.651129,Dambulla,,0.0,Dambulla was a bit of a letdown for me. The ho...
76,Diyagala Escape,6.706992,80.439074,Ratnapura,,0.0,Diyagala Escape was a bit disappointing overal...
83,Dutch Cannon,7.711577,81.702434,Batticaloa,,0.0,Visiting the Dutch Cannon was a bit disappoint...
98,Galle,6.032895,80.216791,Galle,,0.0,"Galle has some charm, but honestly, I expected..."
108,Haputale,6.765414,80.952565,Haputale,,0.0,Haputale is a beautiful location with stunning...


##### Notes:
* As these places are more like main cities there isn't ratings foe these places in google maps. We need to find a other way to fill them.

In [66]:
sorted_bucket_places

['Ahangama',
 'Ahungalla',
 'Ambalangoda',
 'Ambalangoda Mask Workshop',
 'Ambuluwawa Tower',
 'Anawilundawa Wetlands',
 'Anuradapura',
 'Anuradhapura',
 'Arankelle Forest Monastery',
 'Arugam Bay Beach',
 'Bakers Falls',
 'Bambarakanda Falls',
 'Bambarakiri Ella',
 'Batadombalena Craft Centre',
 'Batatotalena (Batadombalena) Cave',
 'Belihuloya',
 'Belilena Caves',
 'Bentota',
 'Bentota Beach',
 'Bentota River',
 'Bolgoda Lake',
 'Bomburu Ella Waterfall',
 'Bopath Falls',
 'Bundala National Park',
 'Colombo',
 'Colombo City Tour',
 'Colombo National Museum',
 'Colombo Port City',
 'Dambulla',
 'Dambulla Royal Cave Temple and Golden Temple',
 'Devon Falls',
 'Diyaluma Falls',
 'Dry Zone Botanic Gardens',
 'Dunhinda Waterfall',
 'Dutch Museum',
 'Elephant Transit Home',
 'Ella',
 'Ella Gap',
 'Ella Rock',
 'Excel World',
 'Folk Museum',
 'Galle',
 'Galle City Tour',
 'Galle Dutch Fort',
 'Galle Fort',
 'Galle Lighthouse',
 'Gangaramaya Temple',
 'Hakgala Botanical Garden',
 'Hambantota'

In [67]:
places_df['name']

0             Aanda Ella Fall
1          Aberdeen Waterfall
2                    Ahangama
3                   Ahungalla
4            Alahana Pirivena
                ...          
390        Yala National Park
391    Yapahuwa Rock Fortress
392        jungle muru safari
393        riapla Mask Museum
394                 rumassala
Name: name, Length: 395, dtype: object

#### Identifying places on both datasets

In [68]:
# Convert places['name'] to a set for efficient comparison
places_set = set(places_df['name'].str.lower().str.strip())

# Convert sorted_bucket_places to a set of lowercase strings for comparison
bucket_places_set = set(place.lower().strip() for place in sorted_bucket_places)

# Find common places
common_places = places_set.intersection(bucket_places_set)

# Sort the common places for easier reading
common_places_sorted = sorted(common_places)

# Print the results
print(f"Number of places in both datasets: {len(common_places)}")
print("\nPlaces that appear in both 'places' dataset and bucket list:")
for place in common_places_sorted:
    print(f"- {place}")

# Optional: Find places only in 'places' dataset
places_only = places_set - bucket_places_set
print(f"\nNumber of places only in 'places' dataset: {len(places_only)}")

# Optional: Find places only in bucket list
bucket_only = bucket_places_set - places_set
print(f"Number of places only in bucket list: {len(bucket_only)}")

Number of places in both datasets: 104

Places that appear in both 'places' dataset and bucket list:
- ahangama
- ahungalla
- ambalangoda
- arugam bay beach
- bakers falls
- bambarakanda falls
- bambarakiri ella
- belihuloya
- bolgoda lake
- bomburu ella waterfall
- bopath falls
- bundala national park
- colombo city tour
- colombo national museum
- dambulla
- dambulla royal cave temple and golden temple
- devon falls
- diyaluma falls
- dunhinda waterfall
- dutch museum
- elephant transit home
- galle
- galle dutch fort
- galle lighthouse
- gangaramaya temple
- hakgala botanical garden
- haputale
- hikkaduwa
- hikkaduwa beach
- hiriketiya beach
- horton plains national park
- jaffna public library
- jaya sri maha bodhi
- jungle beach
- kalpitiya
- kalpitiya lagoon
- kandalama
- kandy
- kandy city centre
- kandy lake
- kandy national museum
- kanneliya national rain forest reserve
- kitulgala
- knuckles
- koggala beach
- kumana national park
- laxapana falls
- leisure world
- lionel wen

In [69]:
places_only

{'aanda ella fall',
 'aberdeen waterfall',
 'alahana pirivena',
 'alankuda casuarina beach',
 'all saints church, galle church of ceylon',
 'alupotha ella waterfall',
 'aluthnuwara rajamaha viharaya , aluthnuwara',
 'ambuluwawa biodiversity complex',
 'ambuluwawa temple',
 'anawilundawa wetland',
 'angammedilla national park',
 'anuradhapura new town',
 'arankale buddhist monastery',
 'archaeological museum mihintale',
 'archaeology museum kotte',
 'arthurs seat view point, kandy',
 'arukuveli beach',
 'athugala viharaya',
 'auslink hotel walapane sri lanka',
 'balana fort',
 'baobab tree pallimunai, mannar, sri lanka',
 'batadombalena',
 'bathalagoda tank',
 'batticaloa dutch fort',
 'batticaloa gate',
 'batticaloa lagoon',
 'batticaloa lighthouse',
 'belilena cave',
 'bentota ganga',
 'bird watching tower',
 'black galle fort zwart bastion',
 'blue moon camping',
 'boat trip kalpitiya dolphin watching',
 'bogoda raja maha viharaya',
 'bowatenna reservoir',
 'box stone pettigala',
 'b

In [70]:
bucket_only

{'ambalangoda mask workshop',
 'ambuluwawa tower',
 'anawilundawa wetlands',
 'anuradapura',
 'anuradhapura',
 'arankelle forest monastery',
 'batadombalena craft centre',
 'batatotalena (batadombalena) cave',
 'belilena caves',
 'bentota',
 'bentota beach',
 'bentota river',
 'colombo',
 'colombo port city',
 'dry zone botanic gardens',
 'ella',
 'ella gap',
 'ella rock',
 'excel world',
 'folk museum',
 'galle city tour',
 'galle fort',
 'hambantota',
 'hatton',
 'hikkaduwa coral sanctuary',
 'hiriketiya',
 'horton plains',
 'kandy temple',
 'kanniya hot springs',
 'kithulgala',
 'kitugala forest',
 'kosgoda turtle hatchery',
 'madu river',
 'mahalenama cave',
 'mahapelessa hot springs',
 'museum of modern and contemporary art',
 'nallur kandaswamy kovil',
 'negambo',
 'passikuda beach',
 'perl bay',
 'polonaruwa',
 'ratnapura gem museum',
 'riverstone gap',
 'royal botanical gardens',
 'sri pada / adams peak',
 'st clairs falls',
 'udawalawe',
 'unawatuna lagoon',
 'vaddha village c

In [71]:
def find_similar_places(source_places, target_places, threshold=90):
    similar_places = {}
    for place in source_places:
        matches = process.extractBests(place, target_places, score_cutoff=threshold, limit=3)
        if matches:
            similar_places[place] = matches
    return similar_places

# Convert sets to lists for fuzzywuzzy
places_only_list = list(places_only)
bucket_only_list = list(bucket_only)

# Find similar places with similarity > 90%
similar_places = find_similar_places(places_only_list, bucket_only_list, threshold=90)

# Print results
print("Similar places (not exactly same) between places_only and bucket_only with similarity > 90%:")
for place, matches in similar_places.items():
    print(f"\nSimilar to '{place}':")
    for match in matches:
        print(f"  - '{match[0]}' (similarity: {match[1]}%)")

# Print summary
print(f"\nTotal number of places in places_only with similar matches (>90% similarity): {len(similar_places)}")


Similar places (not exactly same) between places_only and bucket_only with similarity > 90%:

Similar to 'hidden water fall tiny ella':
  - 'ella' (similarity: 90%)

Similar to 'port city colombo':
  - 'colombo port city' (similarity: 95%)
  - 'colombo' (similarity: 90%)

Similar to 'duwili ella waterfall':
  - 'ella' (similarity: 90%)

Similar to 'madola ella':
  - 'ella' (similarity: 90%)

Similar to 'maha viharaya, anuradhapura':
  - 'anuradhapura' (similarity: 90%)

Similar to 'spa ceylon heritage spa boutique ii galle fort':
  - 'galle fort' (similarity: 90%)

Similar to 'kirindi ella waterfall':
  - 'ella' (similarity: 90%)

Similar to 'riverston':
  - 'riverstone gap' (similarity: 90%)

Similar to 'pasikuda beach':
  - 'passikuda beach' (similarity: 97%)

Similar to 'neluwa doovili ella':
  - 'ella' (similarity: 90%)

Similar to 'dry zone botanic gardens, hambantota':
  - 'dry zone botanic gardens' (similarity: 90%)
  - 'hambantota' (similarity: 90%)

Similar to 'huluganga ella 

##### Notes:
* Looks like there is a notable number of differences in places in two datasets. We need to look into this in the later parts.

In [72]:
sorted_activities = [activity.replace('safaris', 'wild life safaris').replace('hot air ballooning', 'air ballooning') for activity in sorted_activities]
sorted_activities

['amusement parks',
 'animal encounters',
 'archaeological sites',
 'architecture photography',
 'architecture tours',
 'art classes',
 'arts and culture',
 'ayurvedic spa treatments',
 'beach visits',
 'beachfront dining',
 'bird watching',
 'boat wild life safaris',
 'botanical gardens',
 'butterfly watching',
 'camping',
 'caving',
 'city tours',
 'craft workshops',
 'cultural experiences',
 'cultural festivals',
 'cycling',
 'elephant rides',
 'fishing',
 'golfing',
 'hiking',
 'historic sites',
 'historic walks',
 'historical monuments',
 'history tours',
 'horse shows',
 'horseback riding',
 'air ballooning',
 'hot springs',
 'kayaking',
 'landscape photography',
 'literary tours',
 'local crafts',
 'mountain biking',
 'museum visits',
 'outdoor adventures',
 'paddleboarding',
 'photography',
 'planetarium visits',
 'public art installations',
 'river cruises',
 'rock climbing',
 'wild life safaris',
 'sailing',
 'sailing lessons',
 'scuba diving',
 'sea cruises',
 'sightseeing',

### Let's extract relevant activities and their corresponding satisfaction scores for each location

In [73]:
places_df['extracted_activities'] = None
places_df['activity_scores'] = None

In [74]:
key1 = "AIzaSyCpSOdFzbl8XUgib6gyH9huHlekUiAc5cY"
key2 = "AIzaSyDwKelXBe5yGeoyXneX8a0dN3hWEhrLJss"

# Configure the Google Generative AI
def configure_genai(key):
    genai.configure(api_key=key)
    generation_config = {
        "temperature": 0.7,
        "top_p": 0.9,
        "max_output_tokens": 1000,
        "response_mime_type": "text/plain",
    }
    return genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=generation_config,
    )

def extract_activities_and_scores(review, candidate_activities, model):
    prompt = f"Extract all the activities and their satisfaction scores from the following reviews: {review}. The activities should be from the following list: {candidate_activities}. Give the response in this format: Activity name(in lowercase as in the given list) - satisfaction score(0-5). If there are many activities, add them line by line in the same format. remove markdown syntaxs and don't add details that are not required."
    chat_session = model.start_chat(history=[])
    response = chat_session.send_message(prompt)
    return response.text

def parse_and_average_extracted_data(extracted_data):
    activity_scores = defaultdict(list)
    lines = extracted_data.strip().split('\n')
    
    for line in lines:
        if ' - ' in line:
            activity, score = line.split(' - ')
            activity = activity.strip()
            score = float(score.strip())
            activity_scores[activity].append(score)

    averaged_activities = []
    averaged_scores = []
    
    for activity, scores in activity_scores.items():
        averaged_activities.append(activity)
        averaged_scores.append(sum(scores) / len(scores))

    return averaged_activities, averaged_scores

# Process rows in batches of 10
batch_size = 10
total_rows = len(places_df)

for start_idx in range(0, total_rows, batch_size):
    end_idx = min(start_idx + batch_size, total_rows)
    
    # Alternate between keys
    current_key = key1 if (start_idx // batch_size) % 2 == 0 else key2
    model = configure_genai(current_key)
    
    for i in range(start_idx, end_idx):
        review = places_df.iloc[i]['latest_reviews']
        extracted_data = extract_activities_and_scores(review, sorted_activities, model)
        
        activities, scores = parse_and_average_extracted_data(extracted_data)
        
        places_df.at[i, 'extracted_activities'] = activities
        places_df.at[i, 'activity_scores'] = scores
        
        del activities, scores, review, extracted_data
    
    print(f"Processed rows {start_idx} to {end_idx-1}")
    
    # Add a delay between batches to avoid hitting rate limits
    if end_idx < total_rows:
        time.sleep(30)  # Wait for 60 seconds between batches

Processed rows 0 to 9
Processed rows 10 to 19
Processed rows 20 to 29
Processed rows 30 to 39
Processed rows 40 to 49
Processed rows 50 to 59
Processed rows 60 to 69
Processed rows 70 to 79
Processed rows 80 to 89
Processed rows 90 to 99
Processed rows 100 to 109
Processed rows 110 to 119
Processed rows 120 to 129
Processed rows 130 to 139
Processed rows 140 to 149
Processed rows 150 to 159
Processed rows 160 to 169
Processed rows 170 to 179
Processed rows 180 to 189
Processed rows 190 to 199
Processed rows 200 to 209
Processed rows 210 to 219
Processed rows 220 to 229
Processed rows 230 to 239
Processed rows 240 to 249
Processed rows 250 to 259
Processed rows 260 to 269
Processed rows 270 to 279
Processed rows 280 to 289
Processed rows 290 to 299
Processed rows 300 to 309
Processed rows 310 to 319
Processed rows 320 to 329
Processed rows 330 to 339
Processed rows 340 to 349
Processed rows 350 to 359
Processed rows 360 to 369
Processed rows 370 to 379
Processed rows 380 to 389
Processe

##### Notes:
* We processed the places_df in batches of 10 rows to extract activities and their scores from the reviews.
* We alternated between two gemini API keys (key1 and key2) to avoid hitting rate limits.
* The extracted activities and their scores were stored in the 'extracted_activities' and 'activity_scores' columns of places_df.

* We also experimented with an alternative method for extracting activities and scores using vector cosine similarity. However, this approach didn't yield sufficiently accurate results compared to the Gemini-based method described above. While the vector similarity method is computationally efficient, it lacks the contextual understanding provided by the language model. We ultimately decided to use the more accurate Gemini-based approach, despite its higher computational cost.

In [75]:
# # Set up logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# # Sample candidate activities
# candidate_activities = sorted_activities

# # Function to perform sentiment analysis and return a satisfaction score
# def sentiment_to_score(sentiment):
#     return round(((sentiment + 1) / 2) * 4 + 1, 2)

# # Function to extract specific part of review related to the activity
# def get_activity_context(review, activity):
#     sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', review)
#     activity_mentions = [sent for sent in sentences if fuzz.partial_ratio(activity.lower(), sent.lower()) > 80]
    
#     if activity_mentions:
#         return " ".join(activity_mentions)
#     else:
#         return review

# # Function to preprocess text
# def preprocess_text(text):
#     if not isinstance(text, str):
#         return ""
#     text = text.lower()
#     text = re.sub(r'[^\w\s]', '', text)
#     words = text.split()
#     stop_words = set(['the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'is', 'are'])
#     return " ".join([word for word in words if word not in stop_words])

# # Function to extract activities and calculate satisfaction score
# def extract_activities_with_scores(review):
#     try:
#         if not isinstance(review, str) or len(review.strip()) == 0:
#             logging.warning(f"Invalid review: {review}")
#             return [candidate_activities[0]], [3.0]  # Default to first activity with neutral score

#         # Preprocess the review and candidate activities
#         preprocessed_review = preprocess_text(review)
#         preprocessed_activities = [preprocess_text(activity) for activity in candidate_activities]

#         # Combine the preprocessed review with the preprocessed candidate activities
#         documents = [preprocessed_review] + preprocessed_activities

#         # Convert the documents to TF-IDF vectors
#         vectorizer = TfidfVectorizer().fit_transform(documents)
#         vectors = vectorizer.toarray()

#         # Calculate cosine similarity between the review and each candidate activity
#         cosine_similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

#         # Get the indices of the top 3 activities
#         top_indices = cosine_similarities.argsort()[-3:][::-1]
#         top_activities = [candidate_activities[i] for i in top_indices]
        
#         # Extract satisfaction score for each activity based on review sentiment
#         activity_scores = []
#         for activity in top_activities:
#             # Get the specific part of the review that mentions the activity
#             activity_context = get_activity_context(review, activity)
            
#             # Analyze the sentiment for the extracted context
#             blob = TextBlob(activity_context)
#             sentiment = blob.sentiment.polarity
#             score = sentiment_to_score(sentiment)
#             activity_scores.append(score)

#         # Ensure at least one activity is returned
#         if not top_activities:
#             logging.warning(f"No activities found for review: {review[:100]}...")
#             return [candidate_activities[0]], [3.0]  # Default to first activity with neutral score

#         return top_activities, activity_scores
#     except Exception as e:
#         logging.error(f"Error processing review: {e}\nReview: {review[:100]}...")
#         return [candidate_activities[0]], [3.0]  # Default to first activity with neutral score

# # Apply the function to the latest_reviews column with a progress bar
# tqdm.pandas(desc="Extracting activities and calculating scores for places")
# places_df['extracted_activities'], places_df['activity_scores'] = zip(*places_df['latest_reviews'].progress_apply(extract_activities_with_scores))

# # Ensure all places have at least one activity
# places_df['extracted_activities'] = places_df['extracted_activities'].apply(lambda x: x if len(x) > 0 else [candidate_activities[0]])
# places_df['activity_scores'] = places_df['activity_scores'].apply(lambda x: x if len(x) > 0 else [3.0])

# print(f"Places with no activities: {(places_df['extracted_activities'].apply(len) == 0).sum()}")
# print(f"Places with activities: {(places_df['extracted_activities'].apply(len) > 0).sum()}")

In [76]:
def weighted_rating(row, m=50, C=places_df['rating'].mean()):
    v = row['user_ratings_total']
    R = row['rating']
    return (v / (v + m) * R) + (m / (v + m) * C)

places_df['rating'] = places_df.apply(weighted_rating, axis=1)
places_df.drop(columns=['user_ratings_total'], inplace=True)

places_df.head(10)

Unnamed: 0,name,lat,lng,formatted_address,rating,latest_reviews,extracted_activities,activity_scores
0,Aanda Ella Fall,6.712021,80.460996,Ratnapura,4.280646,Aanda Ella Fall is a hidden gem The hike to th...,"[hiking, caving, waterfalls]","[3.5, 3.6666666666666665, 3.0]"
1,Aberdeen Waterfall,6.949149,80.501514,Ginigathhena,4.790742,Aberdeen Waterfall is a stunning natural wonde...,"[hiking, waterfalls]","[3.8, 4.6]"
2,Ahangama,5.973975,80.362159,Ahangama,,Ahangama was a bit disappointing for me as a s...,"[surfing, beach visits]","[2.2, 3.2]"
3,Ahungalla,6.313278,80.040918,Ahungalla,,"Ahungalla seemed promising, but I found it a b...","[beach visits, beachfront dining, arts and cul...","[2.5, 2.4, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0]"
4,Alahana Pirivena,7.961924,81.003995,Polonnaruwa,4.700491,Visiting Alahana Pirivena was a spiritual jour...,"[archaeological sites, historic sites, arts an...","[5.0, 5.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0, 5.0, ..."
5,Alankuda Casuarina Beach,8.048135,79.709916,Kalpitiya,4.353584,Alankuda Casuarina Beach is a hidden gem The p...,"[beach visits, beachfront dining, hiking, snor...","[4.428571428571429, 3.0, 4.0, 4.0, 4.0]"
6,"All Saints Church, Galle Church of Ceylon",6.027534,80.217448,Galle,4.412032,All Saints Church is a hidden gem in Galle The...,"[architecture photography, arts and culture, h...","[4.2, 4.166666666666667, 4.166666666666667, 4...."
7,Alupotha Ella Waterfall,6.618278,80.409508,Magurugoda Alupotha Rd,4.417349,Alupotha Ella Waterfall is truly a gem The sou...,"[waterfalls, photography, hiking]","[4.0, 4.0, 4.0]"
8,"Aluthnuwara Rajamaha Viharaya , Aluthnuwara",7.227319,80.485243,Mawanella Aluthnuwara Rd,4.826857,Visiting Aluthnuwara Rajamaha Viharaya was a s...,"[architecture photography, architecture tours,...","[3.6666666666666665, 3.6666666666666665, 3.5, ..."
9,Ambalangoda,6.244152,80.05908,Ambalangoda,,"Ambalangoda had potential, but it fell short o...","[turtle watching, local crafts, fishing]","[2.8333333333333335, 2.1666666666666665, 1.333..."


##### Notes:

* We implemented a weighted rating system to provide a more balanced and reliable rating for each place.
* This approach combines the average rating with the number of ratings, giving more weight to places with a higher number of reviews. The formula used is known as the Bayesian average, which helps to mitigate the impact of places with very few ratings that might skew the results.

In [77]:
# Remove [, ], ', " signs in Preferred Activities and Bucket list destinations Sri Lanka
users_df['Preferred Activities'] = users_df['Preferred Activities'].str.replace(r"[\[\]'\"']", "", regex=True)
users_df['Bucket list destinations Sri Lanka'] = users_df['Bucket list destinations Sri Lanka'].str.replace(r"[\[\]'\"']", "", regex=True)

##### Eliminating unnecessary features for future modeling

In [78]:
places_df.drop(columns=['latest_reviews'], inplace=True)
users_df.drop(columns=['Name'], inplace=True)
users_df.drop(columns=['User ID'], inplace=True)
users_df.drop(columns=['Email'], inplace=True)

##### Saving final preprocessed datasets

In [110]:
users_df.to_csv('users_preprocessed.csv', index=False)

In [111]:
places_df.to_csv('places_preprocessed.csv', index=False)