In [1]:
#imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re

In [3]:
#file input

In [4]:
# Read both workbooks from the notebook's working directory.
visitors_df = pd.read_excel('Visitors Preference Dataset.xlsx')
places_df = pd.read_excel('Places Dataset.xlsx')

# Preview the first rows of each frame (visitors first, then places).
visitors_preview = visitors_df.head()
places_preview = places_df.head()
print(visitors_preview)
print(places_preview)

   User ID              Name                         Email  \
0        1    Jennifer Quinn    jennifer.quinn@example.com   
1        2       Emily Perry       emily.perry@example.com   
2        3  Danielle Mcbride  danielle.mcbride@example.com   
3        4   Angelica Wilson   angelica.wilson@example.com   
4        5     Laurie Powers     laurie.powers@example.com   

                                Preferred Activities  \
0  ['cycling', 'historical monuments', 'village h...   
1  ['butterfly watching', 'hot springs', 'wildlif...   
2  ['sea cruises', 'themed parks', 'craft worksho...   
3              ['fishing', 'hot springs', 'sailing']   
4     ['history tours', 'sailing', 'literary tours']   

                  Bucket list destinations Sri Lanka  
0  ['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell...  
1  ['Madunagala Hot Water Spring', 'Wilpattu Nati...  
2  ['Mirissa Beach', 'Negombo Lagoon', 'Batadomba...  
3  ['Maha Oya Hot Water Springs', 'Colombo Port C...  
4  ['Negombo La

In [6]:
# Data Preprocessing

In [5]:
# Count null cells per column in each dataset before any imputation.
visitor_null_counts = visitors_df.isnull().sum()
place_null_counts = places_df.isnull().sum()
print("Missing values in visitors dataset:\n", visitor_null_counts)
print("Missing values in places dataset:\n", place_null_counts)

Missing values in visitors dataset:
 User ID                               0
Name                                  0
Email                                 0
Preferred Activities                  0
Bucket list destinations Sri Lanka    0
dtype: int64
Missing values in places dataset:
 name                   0
lat                    1
lng                    1
formatted_address      0
rating                56
user_ratings_total    56
latest_reviews         0
dtype: int64


In [9]:
# Impute missing ratings with the column mean

In [8]:
# Fill missing ratings with the mean of the observed ratings.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection `places_df['rating']`: an inplace call
# on a column selection is chained assignment, which warns on recent pandas
# and does not modify `places_df` when Copy-on-Write is enabled.
places_df['rating'] = places_df['rating'].fillna(places_df['rating'].mean())

# Only 'rating' is imputed here; every other column is left as loaded.
print("Missing values after handling:\n", places_df.isnull().sum())

Missing values after handling:
 name                   0
lat                    1
lng                    1
formatted_address      0
rating                 0
user_ratings_total    56
latest_reviews         0
dtype: int64


In [6]:
# Convert list-like string columns back to list

In [9]:
# Both columns hold Python list literals as text (e.g. "['fishing', 'sailing']");
# turn each cell back into a real list.
# Cells that are already lists are passed through unchanged, so re-running this
# cell does not fail with literal_eval being handed a list.
for list_column in ('Preferred Activities', 'Bucket list destinations Sri Lanka'):
    visitors_df[list_column] = visitors_df[list_column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

In [7]:
# Lowercase the preferred activities and bucket list destinations

In [10]:
# Lowercase every entry of the two list-valued columns, one column at a time.
for column_name in ('Preferred Activities', 'Bucket list destinations Sri Lanka'):
    visitors_df[column_name] = visitors_df[column_name].apply(
        lambda entries: [entry.lower() for entry in entries]
    )

In [9]:
# Vectorization using TF-IDF for 'Preferred Activities' column

In [14]:
# Each row of 'Preferred Activities' is already a list of activity strings, so
# the tokenizer is the identity function and lowercasing is switched off.
# token_pattern=None tells scikit-learn the regex tokenizer is intentionally
# unused; without it, fitting emits a "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
tfidf_matrix = tfidf.fit_transform(visitors_df['Preferred Activities'])

In [15]:
# Sanity check: dimensions of the fitted TF-IDF matrix.
matrix_shape = tfidf_matrix.shape
print(matrix_shape)

(10000, 68)


In [20]:
#recommendation model

In [12]:
def get_recommendations(place_name, cosine_sim=cosine_sim, top_n=5):
    """Return the names of the places most similar to ``place_name``.

    Parameters
    ----------
    place_name : str
        Exact value to look up in ``places_df['name']``. If several rows
        share the name, the first one is used.
    cosine_sim : indexable
        Pairwise similarity scores; ``cosine_sim[i]`` is iterated to obtain
        the similarity of row ``i`` to every row. Presumably a square matrix
        aligned row-for-row with ``places_df`` -- TODO confirm.
    top_n : int, default 5
        Number of neighbours to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``places_df['name']``, most similar first.

    Raises
    ------
    IndexError
        If ``place_name`` does not occur in ``places_df['name']``.
    """
    # NOTE(review): the default for `cosine_sim` is evaluated when this `def`
    # runs, so a global `cosine_sim` must already exist at that point. No
    # visible cell defines it -- confirm which feature matrix it is built from.

    # Work with the row *position*, not the index label: both `cosine_sim[...]`
    # and `.iloc` below are positional, so labels would break on any frame that
    # does not carry a default 0..n-1 index.
    name_matches = (places_df['name'] == place_name).to_numpy()
    if not name_matches.any():
        raise IndexError(f"place {place_name!r} not found in places_df['name']")
    idx = int(name_matches.argmax())  # position of the first matching row

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried place explicitly rather than assuming it sorts first:
    # another row tied at the top score could otherwise push it into the result.
    neighbour_positions = [position for position, _ in ranked if position != idx][:top_n]
    return places_df['name'].iloc[neighbour_positions]

In [22]:
# Example query: places most similar to one named beach.
similar_places = get_recommendations("Arugam Bay Beach")
print(similar_places)

308                   Dutch Bay Beach
36                   Hiriketiya Beach
2      Weligama Beach (surf and stay)
410                   Unawatuna Beach
1                       Mirissa Beach
Name: name, dtype: object


In [23]:
#user-specific recommendations

In [13]:
def get_user_specific_recommendations(user_id):
    """Rank places by how many of their reviews mention the user's activities.

    Parameters
    ----------
    user_id
        Value to match against ``visitors_df['User ID']``. The first matching
        row is used.

    Returns
    -------
    pandas.Series
        The ``name`` of the five highest-scoring places, best first.

    Raises
    ------
    IndexError
        If no visitor has the given ``user_id``.

    Notes
    -----
    As a side effect, ``places_df['score']`` is overwritten with the scores
    computed for this user.
    """
    # The dataset column is 'User ID' (with a space).
    user_rows = visitors_df[visitors_df['User ID'] == user_id]
    if user_rows.empty:
        raise IndexError(f"user_id {user_id!r} not found in visitors_df['User ID']")

    # Accept either a real list or its string literal, so the function works
    # whether or not the list columns have already been parsed.
    preferred_activities = user_rows.iloc[0]['Preferred Activities']
    if isinstance(preferred_activities, str):
        preferred_activities = ast.literal_eval(preferred_activities)
    activities = [str(activity).lower() for activity in preferred_activities]

    def _as_reviews(cell):
        """Return one cell of 'latest_reviews' as a list of reviews."""
        # Presumably cells are stringified lists, since the column is loaded
        # from Excel and never parsed in a visible cell -- TODO confirm.
        # Iterating such a string directly would walk single characters, and
        # no activity could ever match.
        if isinstance(cell, str):
            try:
                parsed = ast.literal_eval(cell)
            except (ValueError, SyntaxError):
                return [cell]  # free text: treat the whole cell as one review
            return list(parsed) if isinstance(parsed, (list, tuple, set)) else [cell]
        if isinstance(cell, (list, tuple, set)):
            return list(cell)
        return [cell]

    def _score(cell):
        """Count reviews that mention at least one preferred activity."""
        return sum(
            1
            for review in _as_reviews(cell)
            if any(activity in str(review).lower() for activity in activities)
        )

    # Create a simple scoring system
    places_df['score'] = places_df['latest_reviews'].apply(_score)

    # Sort places by score and return top 5; a stable sort keeps tied places
    # in their original row order, so the result is deterministic.
    return places_df.sort_values('score', ascending=False, kind='stable')['name'].head()

In [27]:
#further