In [13]:
#imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
import ast
import re
from scipy import stats

In [14]:
#file input
visitors_df = pd.read_excel('Visitors Preference Dataset.xlsx')
places_df = pd.read_excel('Places Dataset.xlsx')

# Check the first few rows of the data
print(visitors_df.head())
print(places_df.head())

   User ID              Name                         Email  \
0        1    Jennifer Quinn    jennifer.quinn@example.com   
1        2       Emily Perry       emily.perry@example.com   
2        3  Danielle Mcbride  danielle.mcbride@example.com   
3        4   Angelica Wilson   angelica.wilson@example.com   
4        5     Laurie Powers     laurie.powers@example.com   

                                Preferred Activities  \
0  ['cycling', 'historical monuments', 'village h...   
1  ['butterfly watching', 'hot springs', 'wildlif...   
2  ['sea cruises', 'themed parks', 'craft worksho...   
3              ['fishing', 'hot springs', 'sailing']   
4     ['history tours', 'sailing', 'literary tours']   

                  Bucket list destinations Sri Lanka  
0  ['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell...  
1  ['Madunagala Hot Water Spring', 'Wilpattu Nati...  
2  ['Mirissa Beach', 'Negombo Lagoon', 'Batadomba...  
3  ['Maha Oya Hot Water Springs', 'Colombo Port C...  
4  ['Negombo La

In [15]:
# Pre-processing Steps
# 1. Handle missing values
# 2. Data normalization (if applicable)
#pre-processing

In [16]:
# Check for missing values in both datasets
print(visitors_df.isnull().sum())
print(places_df.isnull().sum())

# Filling missing values for 'lat' and 'lng' with 0,
# and using mean values for 'rating' and 'user_ratings_total'
places_df['lat'].fillna(0, inplace=True)
places_df['lng'].fillna(0, inplace=True)
places_df['rating'].fillna(places_df['rating'].mean(), inplace=True)
places_df['user_ratings_total'].fillna(places_df['user_ratings_total'].mean(), inplace=True)

# Verify there are no more missing values
places_df.isnull().sum()


User ID                               0
Name                                  0
Email                                 0
Preferred Activities                  0
Bucket list destinations Sri Lanka    0
dtype: int64
name                   0
lat                    1
lng                    1
formatted_address      0
rating                56
user_ratings_total    56
latest_reviews         0
dtype: int64


name                  0
lat                   0
lng                   0
formatted_address     0
rating                0
user_ratings_total    0
latest_reviews        0
dtype: int64

In [17]:
# Check for missing values in the Visitors Preference Dataset
visitors_missing_values = visitors_df.isnull().sum()
print("\nMissing values in Visitors Preference Dataset:")
print(visitors_missing_values)

# Filling missing values for 'lat' and 'lng' with 0,
# and using mean values for 'rating' and 'user_ratings_total'
places_df['lat'].fillna(0, inplace=True)
places_df['lng'].fillna(0, inplace=True)
places_df['rating'].fillna(places_df['rating'].mean(), inplace=True)
places_df['user_ratings_total'].fillna(places_df['user_ratings_total'].mean(), inplace=True)

# Verify there are no more missing values
places_df.isnull().sum()



Missing values in Visitors Preference Dataset:
User ID                               0
Name                                  0
Email                                 0
Preferred Activities                  0
Bucket list destinations Sri Lanka    0
dtype: int64


name                  0
lat                   0
lng                   0
formatted_address     0
rating                0
user_ratings_total    0
latest_reviews        0
dtype: int64

In [18]:
#  Convert stringified lists to actual lists in the 'Preferred Activities' 
# and 'Bucket list destinations Sri Lanka' columns
visitors_df['Preferred Activities'] = visitors_df['Preferred Activities'].apply(ast.literal_eval)
visitors_df['Bucket list destinations Sri Lanka'] = visitors_df['Bucket list destinations Sri Lanka'].apply(ast.literal_eval)

In [19]:
# Fill missing values in the 'rating' and 'user_ratings_total' columns in the Places Dataset
places_df['rating'].fillna(places_df['rating'].mean(), inplace=True)
places_df['user_ratings_total'].fillna(places_df['user_ratings_total'].mean(), inplace=True)

In [20]:
#  Correct formatting issues in the 'latest_reviews' column
places_df['latest_reviews'] = places_df['latest_reviews'].str.replace('Ã¢Â', '')

In [21]:
def clean_text(text):
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove unwanted encoded characters (e.g., xa0, _x008f_)
    text = re.sub(r'\\xa0|\\u[0-9A-Fa-f]+|\\x[0-9A-Fa-f]+|_x[0-9A-Fa-f]+_', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to relevant columns in places_df
places_df['name'] = places_df['name'].apply(clean_text)
places_df['latest_reviews'] = places_df['latest_reviews'].apply(clean_text)

# Check the cleaned data
print(places_df.head())



                             name       lat        lng  \
0                Arugam Bay Beach  6.840408  81.836848   
1                   Mirissa Beach  5.944703  80.459161   
2  Weligama Beach (surf and stay)  5.972486  80.435714   
3                        Ahangama  5.973975  80.362159   
4                 Hikkaduwa Beach  6.137727  80.099060   

             formatted_address    rating  user_ratings_total  \
0  Arugam Bay Beach, Sri Lanka  4.800000         1591.000000   
1           Mirissa, Sri Lanka  4.600000         1748.000000   
2          Weligama, Sri Lanka  4.400000          325.000000   
3          Ahangama, Sri Lanka  4.459437         1608.639437   
4   Hikkaduwa Beach, Sri Lanka  4.700000         1438.000000   

                                      latest_reviews  
0  ['Arugam Bay Beach is a surfer's paradise! I s...  
1  ['Mirissa Beach is truly a gem on Sri Lankas s...  
2  ['Weligama Beach is a fantastic spot for both ...  
3  ['Ahangama was a bit disappointing for me a

In [22]:
# Check for missing values in the Places Dataset
places_missing_values = places_df.isnull().sum()
print("\nMissing values in Places Dataset:")
print(places_missing_values)

# Filling missing values for 'lat' and 'lng' with 0,
# and using mean values for 'rating' and 'user_ratings_total'
places_df['lat'].fillna(0, inplace=True)
places_df['lng'].fillna(0, inplace=True)
places_df['rating'].fillna(places_df['rating'].mean(), inplace=True)
places_df['user_ratings_total'].fillna(places_df['user_ratings_total'].mean(), inplace=True)

# Verify there are no more missing values
places_df.isnull().sum()



Missing values in Places Dataset:
name                  0
lat                   0
lng                   0
formatted_address     0
rating                0
user_ratings_total    0
latest_reviews        0
dtype: int64


name                  0
lat                   0
lng                   0
formatted_address     0
rating                0
user_ratings_total    0
latest_reviews        0
dtype: int64

In [23]:
# Remove duplicates in Visitors Preference Dataset
visitors_df.drop_duplicates(subset=['User ID', 'Email'], inplace=True)

# Remove duplicates in Places Dataset
places_df.drop_duplicates(subset=['name', 'formatted_address'], inplace=True)

In [24]:
# Normalize text columns in Visitors Preference Dataset
visitors_df['Name'] = visitors_df['Name'].str.lower()
visitors_df['Preferred Activities'] = visitors_df['Preferred Activities'].apply(lambda x: [activity.lower() for activity in x])
visitors_df['Bucket list destinations Sri Lanka'] = visitors_df['Bucket list destinations Sri Lanka'].apply(lambda x: [destination.lower() for destination in x])

# Normalize text columns in Places Dataset
places_df['name'] = places_df['name'].str.lower()
places_df['formatted_address'] = places_df['formatted_address'].str.lower()
places_df['latest_reviews'] = places_df['latest_reviews'].str.lower()


In [25]:
# Check for outliers in Places Dataset
places_df[['rating', 'user_ratings_total']].describe()

# Optionally, remove outliers in user_ratings_total (e.g., values above the 99th percentile)
threshold = places_df['user_ratings_total'].quantile(0.99)
places_df = places_df[places_df['user_ratings_total'] <= threshold]


In [26]:
# Tokenize Preferred Activities into separate columns
preferred_activities_df = pd.DataFrame(visitors_df['Preferred Activities'].tolist(), index=visitors_df['User ID'])

# Tokenize Bucket List Destinations into separate columns
bucket_list_df = pd.DataFrame(visitors_df['Bucket list destinations Sri Lanka'].tolist(), index=visitors_df['User ID'])


In [27]:
# Binning ratings into categories
places_df['rating_category'] = pd.cut(places_df['rating'], bins=[0, 3.5, 4.5, 5], labels=['low', 'medium', 'high'])

In [28]:
# Remove very short reviews
places_df = places_df[places_df['latest_reviews'].str.len() > 10]

In [29]:
# Use one-hot encoding for the rating categories
places_df = pd.get_dummies(places_df, columns=['rating_category'], drop_first=True)

In [30]:
def place_recommendation(user_id, visitors_df, places_df):
    # Retrieve user's preferred activities
    user_preferences = visitors_df.loc[visitors_df['User ID'] == user_id, 'Preferred Activities']
    if user_preferences.empty:
        return "User not found"
    
    # Directly use the preferred activities list
    user_activities = user_preferences.values[0]
    
    # Collect place recommendations based on activity keywords in both names and descriptions
    recommendations = []
    for activity in user_activities:
        escaped_activity = re.escape(activity.lower())
        for index, row in places_df.iterrows():
            place_name = row['name'].lower()
            place_description = row.get('latest_reviews', '').lower()
            if re.search(escaped_activity, place_name) or re.search(escaped_activity, place_description):
                recommendations.append(row['name'])
    
    return recommendations if recommendations else "No recommendations found"

In [31]:
#testing

In [32]:
# Test case for place_recommendation
def test_place_recommendation():
    # Test with an existing user
    result_existing_user = place_recommendation(1, visitors_df, places_df)
    print(f"Recommendations for user 1: {result_existing_user}")
    
    # Test with a non-existent user
    result_non_existent_user = place_recommendation(999, visitors_df, places_df)
    print(f"Recommendations for non-existent user: {result_non_existent_user}")
    
    # Test with a user having no matching activities
    # (Let's assume user with ID 4 has preferences not matching any place)
    result_no_match = place_recommendation(4, visitors_df, places_df)
    print(f"Recommendations for user with no matching activities: {result_no_match}")

# Run the test cases
test_place_recommendation()


Recommendations for user 1: ['viharamahadevi park', 'anuradhapura', 'polonnaruwa']
Recommendations for non-existent user: ['sinharaja forest reserve', 'belihuloya', 'batadombalena', 'seetha eliya', 'kanneliya national rain forest reserve |', 'knuckles forest reserve', 'piduruthalagala conservation forest', 'dhushan ella', 'hali-ela , sri lanka', 'neluwa doovili ella', 'pareiyan ella falls |', 'riverston', 'viharamahadevi park', 'kandy', 'polonnaruwa']
Recommendations for user with no matching activities: ['maritime museum', 'kalpitiya', 'negombo', 'trincomalee', 'bolgoda lake', 'kalpitiya lagoon', 'ambalangoda', 'jaffna lagoon area', 'panakala lagoon', 'chundikulam bird sanctuary', 'batticaloa lagoon', 'mandathivu beach', 'kiranchi beach', '| | okanda beach', 'mahakarambewa wewa', 'katukeliyawa wewa', 'kadawatha wewa', 'maha oya hot water springs', 'madunagala hot water spring', 'uppuveli', 'port city colombo', 'negombo', 'trincomalee', 'elephant point beach', 'uppuveli beach']
