In [1]:
import ast
from math import sin, cos, sqrt, atan2, radians

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the two source workbooks. The paths are relative, so they resolve
# against the directory the kernel was started in.
# users_df  : visitor records (ID, name, e-mail, preferred activities, bucket list).
# places_df : place records (name, coordinates, address, rating, latest reviews).
users_df = pd.read_excel("../data/Visitors Preference Dataset.xlsx")
places_df = pd.read_excel("../data/Places Dataset.xlsx")

In [3]:
# Peek at the first rows of the visitor table to confirm the load worked.
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."


In [4]:
# Peek at the first rows of the places table to confirm the load worked.
places_df.head()

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...


In [5]:
# Report which Python types occur in each list-like column; if everything is
# `str`, the lists were stored as text and still need to be parsed.
for column_name in ('Preferred Activities', 'Bucket list destinations Sri Lanka'):
    print(users_df[column_name].apply(type).value_counts())

Preferred Activities
<class 'str'>    10000
Name: count, dtype: int64
Bucket list destinations Sri Lanka
<class 'str'>    10000
Name: count, dtype: int64


In [6]:
def _parse_list_cell(cell):
    """Turn the text form of a list (e.g. "['a', 'b']") into a real list.

    * A value that is already a list is returned unchanged, so this cell can
      be re-run without turning previously parsed rows into NaN.
    * A non-blank string is parsed with ``ast.literal_eval``, which only
      accepts Python literals and therefore cannot execute code that happens
      to be stored in the spreadsheet (unlike ``eval``).
    * Anything else (blank string, missing value) becomes ``np.nan``.
    """
    if isinstance(cell, list):
        return cell
    if isinstance(cell, str) and cell.strip() != '':
        return ast.literal_eval(cell)
    return np.nan


# Both columns are stored as stringified lists in the workbook.
for _list_column in ('Preferred Activities', 'Bucket list destinations Sri Lanka'):
    users_df[_list_column] = users_df[_list_column].apply(_parse_list_cell)

In [7]:
# Preview again: the two list-like columns should now display as lists
# (no quotes around the items) instead of as their string form.
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"[cycling, historical monuments, village homest...","[Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu..."
1,2,Emily Perry,emily.perry@example.com,"[butterfly watching, hot springs, wildlife vie...","[Madunagala Hot Water Spring, Wilpattu Nationa..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"[sea cruises, themed parks, craft workshops]","[Mirissa Beach, Negombo Lagoon, Batadombalena ..."
3,4,Angelica Wilson,angelica.wilson@example.com,"[fishing, hot springs, sailing]","[Maha Oya Hot Water Springs, Colombo Port City..."
4,5,Laurie Powers,laurie.powers@example.com,"[history tours, sailing, literary tours]","[Negombo Lagoon, Colombo Port City, Galle Dutc..."


In [8]:
# Flatten each user's activity list into one space-separated string so it can
# be passed to the TF-IDF vectorizer as a single document.
# Rows without a usable list (e.g. NaN left by the parsing step for blank
# cells) become an empty string instead of raising TypeError in str.join.
users_df['Preferred Activities String'] = users_df['Preferred Activities'].apply(
    lambda activities: " ".join(activities) if isinstance(activities, (list, tuple)) else ""
)

In [9]:
# Keep only alphanumeric and whitespace characters of each review text; this
# drops brackets, quotes and punctuation.
# Non-string cells (e.g. NaN for a place without reviews) are mapped to "" here,
# because iterating over them character by character would raise TypeError.
# NOTE(review): str.isalnum() is Unicode-aware, so non-ASCII letters - including
# mis-encoded characters already present in the data - are kept.
places_df['latest_reviews'] = places_df['latest_reviews'].apply(
    lambda review: (
        ''.join(ch for ch in review if ch.isalnum() or ch.isspace())
        if isinstance(review, str)
        else ""
    )
)

In [10]:
# Replace any non-string review cell with an empty string.
places_df['latest_reviews'] = places_df['latest_reviews'].apply(
    lambda cell: "" if not isinstance(cell, str) else cell
)
# Collapse every run of whitespace into a single space (also trims both ends);
# this is the text the TF-IDF model is fitted on.
places_df['Reviews Combined'] = places_df['latest_reviews'].apply(
    lambda text: " ".join(text.split())
)

In [11]:
# Preview the cleaned review text and the new 'Reviews Combined' column.
places_df.head()

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,Reviews Combined
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,Arugam Bay Beach is a surfers paradise I spent...,Arugam Bay Beach is a surfers paradise I spent...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,Mirissa Beach is truly a gem on Sri LankaÃÂÂs ...,Mirissa Beach is truly a gem on Sri LankaÃÂÂs ...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,Weligama Beach is a fantastic spot for both be...,Weligama Beach is a fantastic spot for both be...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,Ahangama was a bit disappointing for me as a s...,Ahangama was a bit disappointing for me as a s...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,Hikkaduwa Beach is a delightful escape for sol...,Hikkaduwa Beach is a delightful escape for sol...


In [12]:
# Fit a TF-IDF model on the place reviews, ignoring scikit-learn's built-in
# English stop-word list. `tfidf` is kept so user text can later be projected
# into the same vocabulary; `tfidf_matrix_places` holds one row per place.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_places = tfidf.fit_transform(places_df['Reviews Combined'])

In [13]:
# (number of review documents, number of vocabulary terms) of the fitted matrix.
tfidf_matrix_places.shape

(411, 4329)

In [14]:
def recommend_places_by_activity(user_id, top_n=5):
    """Recommend the places whose reviews best match a user's preferred activities.

    The user's 'Preferred Activities String' is projected into the TF-IDF
    vocabulary fitted on the place reviews (``tfidf``) and compared with every
    row of ``tfidf_matrix_places`` by cosine similarity.

    Parameters
    ----------
    user_id
        Value matched against the 'User ID' column of ``users_df``. If several
        rows match, the first one is used.
    top_n : int, default 5
        Maximum number of places to return. The default reproduces the
        previously hard-coded top five.

    Returns
    -------
    pandas.DataFrame
        The 'name', 'formatted_address' and 'rating' columns of the selected
        ``places_df`` rows, most similar first. No minimum similarity is
        applied, so places with a similarity of 0 can appear when fewer than
        ``top_n`` places share any term with the user's activities.

    Raises
    ------
    IndexError
        If no row of ``users_df`` has the given user ID. The exception type is
        the one the original ``.values[0]`` lookup raised; it now carries an
        explanatory message.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    matches = users_df.loc[users_df['User ID'] == user_id, 'Preferred Activities String']
    if matches.empty:
        raise IndexError(f"No user with User ID {user_id!r} in users_df")
    user_activities = matches.iloc[0]

    user_tfidf = tfidf.transform([user_activities])
    # cosine_similarity returns a (1, n_places) array; keep the single row.
    scores = cosine_similarity(user_tfidf, tfidf_matrix_places)[0]
    # Ascending argsort reversed -> best match first; slicing after the
    # reversal also behaves correctly for top_n == 0.
    ranked = scores.argsort()[::-1][:top_n]
    return places_df.iloc[ranked][['name', 'formatted_address', 'rating']]

In [15]:
# Example: recommendations for the user whose User ID is 1.
recommend_places_by_activity(1)

Unnamed: 0,name,formatted_address,rating
95,Polonnaruwa,"Polonnaruwa, Sri Lanka",
23,Anuradhapura,"Anuradhapura, Sri Lanka",
100,Viharamahadevi Park,"Colombo, Sri Lanka",4.1
340,Mandathivu Beach,"Mandaitivu South, Sri Lanka",4.8
38,Colombo National Museum,"Colombo , Sri Lanka",4.6


In [23]:
# Show user 1's bucket-list destinations (the first matching row), e.g. to
# eyeball them against the places recommended for the same user.
users_df.loc[users_df['User ID'] == 1, 'Bucket list destinations Sri Lanka'].values[0]

['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ella', 'Haputale']

In [28]:
# Preview the user table including the derived 'Preferred Activities String' column.
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka,Preferred Activities String
0,1,Jennifer Quinn,jennifer.quinn@example.com,"[cycling, historical monuments, village homest...","[Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu...",cycling historical monuments village homestays
1,2,Emily Perry,emily.perry@example.com,"[butterfly watching, hot springs, wildlife vie...","[Madunagala Hot Water Spring, Wilpattu Nationa...",butterfly watching hot springs wildlife viewing
2,3,Danielle Mcbride,danielle.mcbride@example.com,"[sea cruises, themed parks, craft workshops]","[Mirissa Beach, Negombo Lagoon, Batadombalena ...",sea cruises themed parks craft workshops
3,4,Angelica Wilson,angelica.wilson@example.com,"[fishing, hot springs, sailing]","[Maha Oya Hot Water Springs, Colombo Port City...",fishing hot springs sailing
4,5,Laurie Powers,laurie.powers@example.com,"[history tours, sailing, literary tours]","[Negombo Lagoon, Colombo Port City, Galle Dutc...",history tours sailing literary tours


In [25]:
def calculate_cosine_similarity(user_id):
    """Cosine similarity between one user's activities and every place's reviews.

    The user's 'Preferred Activities String' is transformed with the fitted
    ``tfidf`` vectorizer and compared against ``tfidf_matrix_places``.

    Parameters
    ----------
    user_id
        Value matched against the 'User ID' column of ``users_df``. If several
        rows match, the first one is used.

    Returns
    -------
    numpy.ndarray
        2-D array with a single row; element ``[0][i]`` is the similarity
        between the user and the place at positional index ``i`` of
        ``places_df``.

    Raises
    ------
    IndexError
        If no row of ``users_df`` has the given user ID. The exception type is
        the one the original ``.values[0]`` lookup raised; it now carries an
        explanatory message.
    """
    matches = users_df.loc[users_df['User ID'] == user_id, 'Preferred Activities String']
    if matches.empty:
        raise IndexError(f"No user with User ID {user_id!r} in users_df")

    user_tfidf = tfidf.transform([matches.iloc[0]])
    return cosine_similarity(user_tfidf, tfidf_matrix_places)

In [26]:
# Similarity of user 1's activity text to every place's review text.
similarities = calculate_cosine_similarity(1)

In [27]:
# Positions of the five highest similarity scores, best first
# (ascending argsort -> take the last five -> reverse).
place_indices = similarities.argsort()[0][-5:][::-1]
# Print each selected place's name next to its similarity score.
for place_name, score in zip(places_df['name'].iloc[place_indices], similarities[0][place_indices]):
    print(place_name, score)
    

Polonnaruwa 0.14764631524205662
Anuradhapura 0.13526658586909307
Viharamahadevi Park 0.12470632547034428
Mandathivu Beach 0.056545772984757395
Colombo National Museum 0.05243563022105097


In [17]:
def calculate_distance(lat1, lon1, lat2, lon2, radius_km=6373.0):
    """Great-circle distance between two points, using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, default 6373.0
        Radius of the sphere. The result is expressed in the same unit; the
        default is the Earth-radius approximation (in km) this notebook has
        always used, so existing calls return the same values.

    Returns
    -------
    float
        Distance along the sphere's surface between the two points.
    """
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine of the central angle.
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] (typically for
    # near-antipodal points), which would make sqrt(1 - a) raise a math domain
    # error. Clamping is a no-op for every in-range value.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius_km * c

In [18]:
# Sanity check: the distance from a point to itself should be 0.
calculate_distance(6.9271, 79.8612, 6.9271, 79.8612)

0.0