In [28]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from itertools import combinations, permutations
import ast
import dill

In [29]:
# Read the two CSV inputs used throughout the notebook: the places table and the users table.
places_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in ('places_preprocessed.csv', 'users_preprocessed.csv')
)

In [30]:
def recommend_locations(user_activities, places, user_bucket_list, top_n_per_activity=5, max_results=15):
    """Recommend places whose activities best match a user's preferred activities.

    Each place's 'extracted_activities' cell is turned into one text document,
    the documents are TF-IDF vectorised, and places are ranked by cosine
    similarity to the user's activities. Places named in the user's bucket
    list are then appended.

    Args:
        user_activities: list of activity strings preferred by the user.
        places: DataFrame with at least 'name', 'formatted_address' and
            'extracted_activities' columns. It is not modified.
        user_bucket_list: list of strings compared case-insensitively against
            each place's 'name' and 'formatted_address'.
        top_n_per_activity: number of best-matching places collected for each
            individual activity.
        max_results: cap on the number of similarity-based recommendations,
            and on the total once bucket-list places have been appended.

    Returns:
        DataFrame of recommended places (a copy; it additionally carries the
        'combined_activities' helper column). Because the result is truncated
        to its first `max_results` rows, appended bucket-list places only
        survive when fewer than `max_results` rows precede them.
    """
    # Work on a copy so the caller's DataFrame does not silently gain a column.
    places = places.copy()

    # One text document per place: join real lists, otherwise use the cell's string form.
    places['combined_activities'] = places['extracted_activities'].apply(
        lambda x: ' '.join(x) if isinstance(x, list) else str(x)
    )

    # Keep terms that occur in at least 2 places and in at most 95% of them.
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

    try:
        tfidf_matrix = vectorizer.fit_transform(places['combined_activities'])

        # Similarity of every place to the user's activities taken together.
        user_tfidf = vectorizer.transform([' '.join(user_activities)])
        cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

        # Collect the best matches for each activity on its own, so that every
        # activity contributes candidates.
        top_indices = set()
        for activity in user_activities:
            activity_tfidf = vectorizer.transform([activity])
            activity_similarities = cosine_similarity(activity_tfidf, tfidf_matrix).flatten()
            top_indices.update(activity_similarities.argsort()[-top_n_per_activity:][::-1])

        # Rank the pooled candidates by overall similarity and cap the list.
        top_indices = sorted(top_indices, key=lambda idx: cosine_similarities[idx], reverse=True)[:max_results]
        recommended_locations = places.iloc[top_indices].copy()
    except ValueError:
        # If vectorization fails, return all places without duplicates
        print("Vectorization failed. Returning all unique places.")
        recommended_locations = places.drop_duplicates(subset=['name'])

    # Lower-cased lookup columns are loop-invariant, so build them once.
    names_lower = places['name'].str.lower()
    addresses_lower = places['formatted_address'].str.lower()

    # Row labels (from `places`) that are already recommended. They are tracked
    # separately because the concat below renumbers the result's index, which
    # would make a label comparison against that index meaningless.
    included_labels = set(recommended_locations.index)

    for bucket_item in user_bucket_list:
        wanted = bucket_item.lower()
        matched_places = places[(names_lower == wanted) | (addresses_lower == wanted)]
        matched_places = matched_places[~matched_places.index.isin(list(included_labels))]
        if not matched_places.empty:
            included_labels.update(matched_places.index)
            recommended_locations = pd.concat([recommended_locations, matched_places], ignore_index=True)
            # Ensure no duplicates after adding bucket list places
            recommended_locations = recommended_locations.drop_duplicates(subset=['name'], keep='first')
            if len(recommended_locations) >= max_results:
                recommended_locations = recommended_locations.head(max_results)
                break

    return recommended_locations

# Demo run for the user stored at position 1 of users_df.
user_row = users_df.iloc[1]

# The list-valued columns are read as plain strings here: drop the brackets and
# quotes, then split on ", ".
raw_activities = user_row['Preferred Activities'].strip("[]").replace("'", "").split(", ")

# Lower-case each activity and rewrite two labels (presumably to match the
# wording used in the places data -- TODO confirm).
user_activities = [
    name.lower().strip().replace('safaris', 'wild life safaris').replace('hot air ballooning', 'air ballooning')
    for name in raw_activities
]

user_bucket_list = user_row['Bucket list destinations Sri Lanka'].strip("[]").replace("'", "").split(", ")

recommended_locations = recommend_locations(user_activities, places_df, user_bucket_list)
print(recommended_locations[['name']])

                                          name
0                   Maha Oya Hot Water Springs
1                     Jayanthi Wewa Hot Spring
2                    Kanniya Hot Water Springs
3                     Sinharaja Forest Reserve
4                        Pitawala Patana Trail
5                  Madunagala Hot Water Spring
6                  Horton Plains National Park
7                                     Knuckles
8                        Crocodile Lake Panama
9                         Anawilundawa Wetland
10  Sinharaja Rain forest, Waddagala, Kalawana
11                                 Panama Wewa
12         Piduruthalagala Conservation Forest
13                         Bird Watching Tower
14                                   rumassala


In [31]:
def get_places_for_each_activity(user_activities, recommended_locations):
    """Group recommended places by the user activity they offer.

    Args:
        user_activities: list of activity strings. Repeated entries are
            treated as a single activity.
        recommended_locations: DataFrame with 'name', 'extracted_activities'
            and 'activity_scores' columns. The last two may hold lists or the
            string representation of lists; position i of 'activity_scores'
            is used as the score of activity i.

    Returns:
        dict mapping each activity to a list of (place name, score) tuples in
        the row order of `recommended_locations`. Rows whose cells cannot be
        parsed, are not lists, or have no score at the activity's position
        are skipped instead of raising.
    """
    activity_places = {activity: [] for activity in user_activities}

    for _, row in recommended_locations.iterrows():
        place_name = row['name']
        extracted_activities = row['extracted_activities']
        activity_scores = row['activity_scores']

        # Parse string cells once per row rather than once per activity.
        try:
            if isinstance(extracted_activities, str):
                extracted_activities = ast.literal_eval(extracted_activities)
            if isinstance(activity_scores, str):
                activity_scores = ast.literal_eval(activity_scores)
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing activities for place {place_name}: {e}")
            continue

        # Missing cells (e.g. NaN) cannot be searched or indexed; skip the row.
        if not isinstance(extracted_activities, (list, tuple)) or not isinstance(activity_scores, (list, tuple)):
            continue

        # Iterating the dict keys visits each distinct activity exactly once,
        # so a repeated activity cannot append the same place twice.
        for activity in activity_places:
            if activity in extracted_activities:
                activity_index = extracted_activities.index(activity)
                if activity_index < len(activity_scores):
                    activity_places[activity].append((place_name, activity_scores[activity_index]))
                else:
                    print(f"Activity index out of range for place {place_name}.")

    return activity_places

# For every preferred activity, list the recommended places offering it together with their scores.
activity_places = get_places_for_each_activity(user_activities, recommended_locations)
for activity in activity_places:
    print(f"Activity: {activity}")
    for place, score in activity_places[activity]:
        print(f"Place: {place}, Score: {score}")
    print()

Activity: butterfly watching
Place: Sinharaja Forest Reserve, Score: 3.0
Place: Pitawala Patana Trail, Score: 5.0
Place: Horton Plains National Park, Score: 5.0
Place: Knuckles, Score: 5.0

Activity: hot springs
Place: Maha Oya Hot Water Springs, Score: 4.0
Place: Jayanthi Wewa Hot Spring, Score: 3.75
Place: Kanniya Hot Water Springs, Score: 3.8
Place: Madunagala Hot Water Spring, Score: 4.785714285714286

Activity: wildlife viewing
Place: Sinharaja Forest Reserve, Score: 3.0
Place: Pitawala Patana Trail, Score: 4.25
Place: Horton Plains National Park, Score: 5.0
Place: Knuckles, Score: 4.0
Place: Crocodile Lake Panama, Score: 4.5
Place: Anawilundawa Wetland, Score: 2.2
Place: Sinharaja Rain forest, Waddagala, Kalawana, Score: 3.6
Place: Panama Wewa, Score: 4.2
Place: Piduruthalagala Conservation Forest, Score: 4.0



In [32]:
# Add bucket list destinations to the score calculation
def get_top_location_sets_with_bucket_list(activity_places, user_bucket_list, top_n=10, combo_size=5, places_data=None):
    """Rank fixed-size sets of places by activity score, bucket-list bonus and rating.

    Args:
        activity_places: dict mapping activity -> list of (place name, score).
        user_bucket_list: list of strings; a place earns +1 when its name is in
            this list or is a case-insensitive substring of one of its entries.
        top_n: number of sets to return.
        combo_size: number of places in each set (default 5).
        places_data: DataFrame with 'name' and 'rating' columns used for the
            rating lookup. Defaults to the notebook-level `places_df`.

    Returns:
        List of (tuple of place names, score) sorted by descending score, or
        an empty list when no combination satisfies the coverage rule. A set
        is kept only if every activity is offered by at least one of its
        places and fewer than two activities are offered by exactly one.
        The `top_n` sets with the highest activity + bucket-list score are
        selected first; the mean rating of each set's rated places is then
        added and the selection re-sorted.
    """
    if places_data is None:
        places_data = places_df

    # De-duplicate in first-seen order. Iterating a set of strings would make
    # the combination order -- and therefore tie-breaking between equally
    # scored sets -- vary from one Python process to the next.
    unique_places = list(dict.fromkeys(
        place for places in activity_places.values() for place, _ in places
    ))

    # Filter combinations to ensure each activity is covered
    valid_combinations = []
    for combo in combinations(unique_places, combo_size):
        members = set(combo)
        activity_counts = [
            sum(1 for place, _ in places if place in members)
            for places in activity_places.values()
        ]
        if 0 not in activity_counts and activity_counts.count(1) < 2:
            valid_combinations.append(combo)

    if not valid_combinations:
        print("No valid combinations found. Consider adjusting the combination size or activity constraints.")
        return []

    bucket_entries_lower = [entry.lower() for entry in user_bucket_list]

    # Calculate the sum of activity scores for each valid combination
    combo_scores = []
    for combo in valid_combinations:
        members = set(combo)
        score_sum = 0
        for places in activity_places.values():
            for place, score in places:
                if place in members:
                    score_sum += score
        # Bucket-list bonus: +1 per place that matches a bucket-list entry.
        for place in combo:
            if place in user_bucket_list or any(place.lower() in entry for entry in bucket_entries_lower):
                score_sum += 1
        combo_scores.append((combo, score_sum))

    # The sort is stable, so equally scored sets keep their generation order.
    combo_scores.sort(key=lambda x: x[1], reverse=True)
    top_combinations = combo_scores[:top_n]

    # Add the average rating of the set's rated places to each shortlisted set.
    final_combinations = []
    for combo, score_sum in top_combinations:
        rating_sum = 0
        rating_count = 0
        for place in combo:
            # First matching row wins; places without a rating are left out of the average.
            place_rating = places_data.loc[places_data['name'] == place, 'rating'].values
            if len(place_rating) > 0 and not pd.isna(place_rating[0]):
                rating_sum += place_rating[0]
                rating_count += 1
        if rating_count > 0:
            score_sum += rating_sum / rating_count
        final_combinations.append((combo, score_sum))

    final_combinations.sort(key=lambda x: x[1], reverse=True)

    return final_combinations[:top_n]

# Reduce the user's bucket list to the names of places whose name or address matches an entry exactly.
is_bucket_place = (
    places_df['name'].isin(user_bucket_list)
    | places_df['formatted_address'].isin(user_bucket_list)
)
bucket_list = places_df.loc[is_bucket_place, 'name'].tolist()

top_location_sets = get_top_location_sets_with_bucket_list(activity_places, bucket_list, top_n=5)
if not top_location_sets:
    print("No top location sets could be generated based on the current criteria.")
else:
    for set_number, (combo, score) in enumerate(top_location_sets, start=1):
        print(f"Set {set_number}:")
        for place in combo:
            print(f"Place: {place}")
        print(f"Total Score: {score}")
        print()

Set 1:
Place: Pitawala Patana Trail
Place: Horton Plains National Park
Place: Sinharaja Forest Reserve
Place: Knuckles
Place: Madunagala Hot Water Spring
Total Score: 45.643500546986125

Set 2:
Place: Pitawala Patana Trail
Place: Horton Plains National Park
Place: Knuckles
Place: Madunagala Hot Water Spring
Place: Crocodile Lake Panama
Total Score: 44.12208210392103

Set 3:
Place: Pitawala Patana Trail
Place: Horton Plains National Park
Place: Knuckles
Place: Madunagala Hot Water Spring
Place: Panama Wewa
Total Score: 43.942999181101555

Set 4:
Place: Pitawala Patana Trail
Place: Horton Plains National Park
Place: Sinharaja Forest Reserve
Place: Knuckles
Place: Maha Oya Hot Water Springs
Total Score: 43.815265180911176

Set 5:
Place: Pitawala Patana Trail
Place: Horton Plains National Park
Place: Kanniya Hot Water Springs
Place: Sinharaja Forest Reserve
Place: Knuckles
Total Score: 43.60893958738988



In [33]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Latitudes and longitudes are given in degrees; a spherical Earth of
    radius 6371 km is assumed.
    """
    earth_radius_km = 6371
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    cos_product = np.cos(np.radians(lat1)) * np.cos(np.radians(lat2))
    # Haversine term, then the central angle via the atan2 form.
    a = np.sin(half_dlat) ** 2 + cos_product * np.sin(half_dlon) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius_km * central_angle

def calculate_min_travel_distance(places):
    """Find the shortest visiting order for a small collection of places.

    Every permutation of `places` is tried (brute force, so only suitable for
    a handful of places); the length of a route is the sum of the haversine
    distances between consecutive places. Coordinates come from the 'lat' and
    'lng' columns of the notebook-level `places_df`.

    Args:
        places: iterable of place names as found in places_df['name'].

    Returns:
        (min_distance, best_route): the shortest length in kilometres and the
        corresponding tuple of place names. The first shortest permutation
        encountered wins ties.

    Raises:
        IndexError: if a place needed for a route leg is not in places_df.
    """
    coords_by_name = {}
    leg_by_pair = {}

    def coords_of(name):
        # Each place is looked up in the DataFrame once instead of once per
        # route leg of every permutation.
        if name not in coords_by_name:
            rows = places_df.loc[places_df['name'] == name, ['lat', 'lng']].values
            if len(rows) == 0:
                raise IndexError(f"Place not found in places_df: {name}")
            coords_by_name[name] = tuple(rows[0])
        return coords_by_name[name]

    def leg_km(start, end):
        # Memoise leg lengths; the same ordered pair recurs in many permutations.
        pair = (start, end)
        if pair not in leg_by_pair:
            lat1, lon1 = coords_of(start)
            lat2, lon2 = coords_of(end)
            leg_by_pair[pair] = haversine(lat1, lon1, lat2, lon2)
        return leg_by_pair[pair]

    min_distance = float('inf')
    best_route = None
    for perm in permutations(places):
        distance = 0
        for i in range(len(perm) - 1):
            distance += leg_km(perm[i], perm[i + 1])
        if distance < min_distance:
            min_distance = distance
            best_route = perm
    return min_distance, best_route

# Rank the candidate sets by a weighted blend of normalised total score
# (higher is better) and normalised shortest-route length (shorter is better).

def _min_max_normalize(values, invert=False):
    """Min-max scale `values` to [0, 1]; with invert=True the smallest value maps to 1.

    Returns a list of ones when all values are equal (the range would be zero
    and the division undefined) and an empty list for empty input.
    """
    if not values:
        return []
    low, high = min(values), max(values)
    if high == low:
        return [1 for _ in values]
    if invert:
        return [(high - value) / (high - low) for value in values]
    return [(value - low) / (high - low) for value in values]


# Normalize the Total Score and Minimum Travel Distance
total_scores = [score for _, score in top_location_sets]

# Solve each set's route once and reuse it for both scoring and printing;
# the brute-force search is the expensive step.
set_routes = [calculate_min_travel_distance(combo) for combo, _ in top_location_sets]
distances = [distance for distance, _ in set_routes]

normalized_scores = _min_max_normalize(total_scores)
normalized_distances = _min_max_normalize(distances, invert=True)

# Combine the Objectives: alpha is the weight of the total score,
# (1 - alpha) the weight of the travel distance.
alpha = 0.7
final_scores = [
    alpha * norm_score + (1 - alpha) * norm_distance
    for norm_score, norm_distance in zip(normalized_scores, normalized_distances)
]

# Report each set in its shortest visiting order together with its scores.
for i, ((combo, score), (min_distance, best_route), final_score) in enumerate(
    zip(top_location_sets, set_routes, final_scores)
):
    print(f"Set {i+1}:")
    for place in best_route:
        print(f"Place: {place}")
    print(f"Total Score: {score}")
    print(f"Minimum Travel Distance: {min_distance} km")
    print(f"Final Score: {final_score}")
    print()

Set 1:
Place: Pitawala Patana Trail
Place: Knuckles
Place: Horton Plains National Park
Place: Sinharaja Forest Reserve
Place: Madunagala Hot Water Spring
Total Score: 45.643500546986125
Minimum Travel Distance: 201.30200083388704 km
Final Score: 1.0

Set 2:
Place: Pitawala Patana Trail
Place: Knuckles
Place: Horton Plains National Park
Place: Madunagala Hot Water Spring
Place: Crocodile Lake Panama
Total Score: 44.12208210392103
Minimum Travel Distance: 252.63555735298897 km
Final Score: 0.24482679350616485

Set 3:
Place: Pitawala Patana Trail
Place: Knuckles
Place: Horton Plains National Park
Place: Madunagala Hot Water Spring
Place: Panama Wewa
Total Score: 43.942999181101555
Minimum Travel Distance: 251.18540344209 km
Final Score: 0.1897585622647221

Set 4:
Place: Sinharaja Forest Reserve
Place: Horton Plains National Park
Place: Knuckles
Place: Pitawala Patana Trail
Place: Maha Oya Hot Water Springs
Total Score: 43.815265180911176
Minimum Travel Distance: 207.99953712795278 km
Fina

##### Notes:
* We normalized the total scores and minimum travel distances for the top location sets.
* We combined the objectives of maximizing the total score and minimizing the travel distance using a weighted approach.
* We computed the final score for each set of locations by balancing both objectives.
* Our final decision was to weight the total score more heavily than the travel distance (alpha = 0.7, i.e. 70% total score and 30% travel distance).
* We successfully identified the best route for each set of locations based on the minimum travel distance.
* The final model is now ready to recommend locations based on user activities and preferences.

# Final Model building and Saving

In [34]:
class LocationRecommender:
    """Recommend a short, ordered route of places for a user.

    The pipeline (see `recommend_top_places`) is:
      1. TF-IDF match places to the user's activities and add bucket-list places.
      2. Look up the per-activity score of every recommended place.
      3. Enumerate fixed-size sets of places that cover every activity and
         score them (activity scores, bucket-list bonus, average rating).
      4. Find the shortest visiting order of each candidate set and pick the
         set with the best weighted blend of score and travel distance.
    """

    def __init__(self, places_df):
        """Store the places table.

        Args:
            places_df: pandas DataFrame, or a list (e.g. of dictionaries) that
                is converted to one. Must contain the columns 'name', 'lat',
                'lng', 'formatted_address', 'extracted_activities',
                'activity_scores' and 'rating'.

        Raises:
            TypeError: if `places_df` is neither a DataFrame nor a list.
            ValueError: if the list cannot be converted or a required column
                is missing.
        """
        if isinstance(places_df, list):
            try:
                self.places_df = pd.DataFrame(places_df)
                print("Converted input list to DataFrame.")
            except Exception as e:
                raise ValueError(f"Failed to convert list to DataFrame: {e}") from e
        elif isinstance(places_df, pd.DataFrame):
            self.places_df = places_df
        else:
            raise TypeError("places_df must be a pandas DataFrame or a list of dictionaries.")

        # Ensure essential columns exist
        required_columns = {'name', 'lat', 'lng', 'formatted_address', 'extracted_activities', 'activity_scores', 'rating'}
        missing_columns = required_columns - set(self.places_df.columns)
        if missing_columns:
            raise ValueError(f"The following required columns are missing from places_df: {missing_columns}")

    @staticmethod
    def _combine_activities(value):
        """Turn one 'extracted_activities' cell into a space-separated text document."""
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: use the raw text instead of aborting the whole run.
                return value
        if isinstance(value, (list, tuple)):
            return ' '.join(map(str, value))
        return str(value)

    @staticmethod
    def _parse_list_cell(value):
        """Return a list-like cell as-is, or parse its string representation (may raise)."""
        if isinstance(value, str):
            return ast.literal_eval(value)
        return value

    @staticmethod
    def _min_max_normalize(values, invert=False):
        """Min-max scale to [0, 1]; all ones when every value is equal. invert maps the minimum to 1."""
        low, high = min(values), max(values)
        if high - low == 0:
            return [1 for _ in values]
        if invert:
            return [(high - value) / (high - low) for value in values]
        return [(value - low) / (high - low) for value in values]

    def recommend_locations(self, user_activities, user_bucket_list, top_n_per_activity=5, max_results=15):
        """Recommend places matching the user's activities, plus bucket-list places.

        Args:
            user_activities: list of activity strings.
            user_bucket_list: list of strings compared case-insensitively with
                each place's 'name' and 'formatted_address'.
            top_n_per_activity: best matches collected per individual activity.
            max_results: cap on the similarity-based recommendations, and on
                the total once bucket-list places have been appended.

        Returns:
            DataFrame of recommended places (a copy of rows of the places
            table with an extra 'combined_activities' column). The result is
            truncated to its first `max_results` rows after bucket-list places
            are appended, so those only survive when there is room left.

        Raises:
            AttributeError: if the stored table is not a DataFrame or lacks
                the 'extracted_activities' column.
        """
        if not isinstance(self.places_df, pd.DataFrame):
            raise AttributeError("places_df is not a pandas DataFrame.")

        places = self.places_df.copy()

        if 'extracted_activities' not in places.columns:
            raise AttributeError("'extracted_activities' column is missing from places_df.")
        places['combined_activities'] = places['extracted_activities'].apply(self._combine_activities)

        # Keep terms that occur in at least 2 places and in at most 95% of them.
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

        try:
            tfidf_matrix = vectorizer.fit_transform(places['combined_activities'])
            user_tfidf = vectorizer.transform([' '.join(user_activities)])
            cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

            # Pool the best matches of each activity so every activity contributes candidates.
            top_indices = set()
            for activity in user_activities:
                activity_tfidf = vectorizer.transform([activity])
                activity_similarities = cosine_similarity(activity_tfidf, tfidf_matrix).flatten()
                top_indices.update(activity_similarities.argsort()[-top_n_per_activity:][::-1])

            top_indices = sorted(top_indices, key=lambda idx: cosine_similarities[idx], reverse=True)[:max_results]
            recommended_locations = places.iloc[top_indices].copy()
        except ValueError as ve:
            print(f"Vectorization failed: {ve}. Returning all unique places.")
            recommended_locations = places.drop_duplicates(subset=['name'])

        # Loop-invariant lower-cased lookup columns.
        names_lower = places['name'].str.lower()
        addresses_lower = places['formatted_address'].str.lower()

        # Row labels already recommended, tracked separately because the
        # concat below renumbers the result's index and would otherwise make
        # the "already included" comparison meaningless.
        included_labels = set(recommended_locations.index)

        for bucket_item in user_bucket_list:
            wanted = bucket_item.lower()
            matched_places = places[(names_lower == wanted) | (addresses_lower == wanted)]
            matched_places = matched_places[~matched_places.index.isin(list(included_labels))]
            if not matched_places.empty:
                included_labels.update(matched_places.index)
                recommended_locations = pd.concat([recommended_locations, matched_places], ignore_index=True)
                recommended_locations = recommended_locations.drop_duplicates(subset=['name'], keep='first')
                if len(recommended_locations) >= max_results:
                    recommended_locations = recommended_locations.head(max_results)
                    break

        return recommended_locations

    def get_places_for_each_activity(self, user_activities, recommended_locations):
        """Group recommended places by the user activity they offer.

        Args:
            user_activities: list of activity strings; repeated entries are
                treated as a single activity.
            recommended_locations: DataFrame with 'name',
                'extracted_activities' and 'activity_scores' columns (lists or
                their string representation).

        Returns:
            dict mapping each activity to a list of (place name, score) tuples
            in row order. Rows that cannot be parsed or are not lists are
            skipped; so are activities whose position has no score.
        """
        activity_places = {activity: [] for activity in user_activities}

        for _, row in recommended_locations.iterrows():
            place_name = row['name']

            # Parse each row once rather than once per activity.
            try:
                extracted_activities = self._parse_list_cell(row['extracted_activities'])
            except Exception as e:
                print(f"Error parsing extracted_activities for place {place_name}: {e}")
                continue
            try:
                activity_scores = self._parse_list_cell(row['activity_scores'])
            except Exception as e:
                print(f"Error parsing activity_scores for place {place_name}: {e}")
                continue

            # Missing cells (e.g. NaN) cannot be searched or indexed; skip the row.
            if not isinstance(extracted_activities, (list, tuple)) or not isinstance(activity_scores, (list, tuple)):
                continue

            # Iterating the dict keys visits each distinct activity once, so a
            # repeated activity cannot append the same place twice.
            for activity in activity_places:
                if activity in extracted_activities:
                    activity_index = extracted_activities.index(activity)
                    if activity_index < len(activity_scores):
                        activity_places[activity].append((place_name, activity_scores[activity_index]))
                    else:
                        print(f"Activity index out of range for place {place_name}.")

        return activity_places

    def get_top_location_sets_with_bucket_list(self, activity_places, user_bucket_list, top_n=10, combo_size=5):
        """Rank fixed-size sets of places by activity score, bucket-list bonus and rating.

        Args:
            activity_places: dict mapping activity -> list of (place name, score).
            user_bucket_list: list of strings; a place earns +1 when its name
                is in the list or is a case-insensitive substring of an entry.
            top_n: number of sets to return.
            combo_size: number of places per set (default 5).

        Returns:
            List of (tuple of place names, score), best first, or [] when no
            set satisfies the coverage rule: every activity offered by at
            least one place of the set, and fewer than two activities offered
            by exactly one. The `top_n` sets by activity + bucket-list score
            are shortlisted, then the mean rating of each set's rated places
            is added and the shortlist re-sorted.
        """
        # First-seen order instead of a set: set iteration order for strings
        # changes between Python processes, which made the combination order
        # and tie-breaking non-reproducible.
        unique_places = list(dict.fromkeys(
            place for places in activity_places.values() for place, _ in places
        ))

        valid_combinations = []
        for combo in combinations(unique_places, combo_size):
            members = set(combo)
            activity_counts = [
                sum(1 for place, _ in places if place in members)
                for places in activity_places.values()
            ]
            if 0 not in activity_counts and activity_counts.count(1) < 2:
                valid_combinations.append(combo)

        if not valid_combinations:
            print("No valid combinations found. Consider adjusting the combination size or activity constraints.")
            return []

        bucket_entries_lower = [entry.lower() for entry in user_bucket_list]

        combo_scores = []
        for combo in valid_combinations:
            members = set(combo)
            score_sum = 0
            for places in activity_places.values():
                for place, score in places:
                    if place in members:
                        score_sum += score
            for place in combo:
                if place in user_bucket_list or any(place.lower() in entry for entry in bucket_entries_lower):
                    score_sum += 1
            combo_scores.append((combo, score_sum))

        # Stable sort: equally scored sets keep their generation order.
        combo_scores.sort(key=lambda x: x[1], reverse=True)
        top_combinations = combo_scores[:top_n]

        final_combinations = []
        for combo, score_sum in top_combinations:
            rating_sum = 0
            rating_count = 0
            for place in combo:
                # First matching row wins; unrated places are left out of the average.
                place_rating = self.places_df.loc[self.places_df['name'] == place, 'rating'].values
                if len(place_rating) > 0 and not pd.isna(place_rating[0]):
                    rating_sum += place_rating[0]
                    rating_count += 1
            if rating_count > 0:
                score_sum += rating_sum / rating_count
            final_combinations.append((combo, score_sum))

        final_combinations.sort(key=lambda x: x[1], reverse=True)

        return final_combinations[:top_n]

    @staticmethod
    def haversine(lat1, lon1, lat2, lon2):
        """Great-circle distance in kilometres between two points given in degrees (Earth radius 6371 km)."""
        R = 6371
        dlat = np.radians(lat2 - lat1)
        dlon = np.radians(lon2 - lon1)
        a = np.sin(dlat / 2) ** 2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        return R * c

    def calculate_min_travel_distance(self, places):
        """Find the shortest visiting order of `places` by trying every permutation.

        Args:
            places: iterable of place names found in the 'name' column.

        Returns:
            (min_distance, best_route): shortest length in kilometres and the
            matching tuple of names (first shortest permutation wins ties).
            Returns (inf, None) when a place needed for a route leg is not in
            the places table.
        """
        coords_by_name = {}
        leg_by_pair = {}

        def coords_of(name):
            # One DataFrame scan per place instead of one per leg of every permutation.
            if name not in coords_by_name:
                rows = self.places_df.loc[self.places_df['name'] == name, ['lat', 'lng']].values
                coords_by_name[name] = tuple(rows[0]) if len(rows) else None
            return coords_by_name[name]

        min_distance = float('inf')
        best_route = None
        for perm in permutations(places):
            distance = 0
            for start, end in zip(perm, perm[1:]):
                pair = (start, end)
                if pair not in leg_by_pair:
                    start_coords, end_coords = coords_of(start), coords_of(end)
                    if start_coords is None or end_coords is None:
                        # Every permutation contains the missing place, so no
                        # route can be measured: report once and stop.
                        print(f"Place not found: {start} or {end}")
                        return float('inf'), None
                    leg_by_pair[pair] = self.haversine(*start_coords, *end_coords)
                distance += leg_by_pair[pair]
            if distance < min_distance:
                min_distance = distance
                best_route = perm
        return min_distance, best_route

    def recommend_top_places(self, user_activities, user_bucket_list, alpha=0.7):
        """Return the best set of places for a user, in shortest visiting order.

        Args:
            user_activities: list of activity strings; they are lower-cased,
                stripped, and two labels are rewritten ('safaris' ->
                'wild life safaris', 'hot air ballooning' -> 'air ballooning').
            user_bucket_list: list of bucket-list strings.
            alpha: weight of the normalised total score in the final score;
                the normalised travel distance gets weight (1 - alpha).

        Returns:
            Tuple of place names in visiting order, or [] when no candidate
            set (or no measurable route) exists.
        """
        user_activities = [
            activity.lower().strip()
            .replace('safaris', 'wild life safaris')
            .replace('hot air ballooning', 'air ballooning')
            for activity in user_activities
        ]

        recommended_locations = self.recommend_locations(user_activities, user_bucket_list)
        activity_places = self.get_places_for_each_activity(user_activities, recommended_locations)
        bucket_list = self.places_df[
            self.places_df['name'].isin(user_bucket_list) |
            self.places_df['formatted_address'].isin(user_bucket_list)
        ]['name'].tolist()
        top_location_sets = self.get_top_location_sets_with_bucket_list(activity_places, bucket_list, top_n=10)

        if not top_location_sets:
            print("No top location sets found.")
            return []

        # Solve each candidate's route once and keep it, instead of solving
        # the winner a second time. Sets without a measurable route are dropped
        # so that an infinite distance cannot poison the normalisation.
        candidates = []
        for combo, score in top_location_sets:
            distance, route = self.calculate_min_travel_distance(combo)
            if route is not None:
                candidates.append((score, distance, route))

        if not candidates:
            print("No final scores calculated.")
            return []

        normalized_scores = self._min_max_normalize([score for score, _, _ in candidates])
        normalized_distances = self._min_max_normalize([distance for _, distance, _ in candidates], invert=True)

        final_scores = [
            alpha * norm_score + (1 - alpha) * norm_distance
            for norm_score, norm_distance in zip(normalized_scores, normalized_distances)
        ]

        best_index = final_scores.index(max(final_scores))
        return candidates[best_index][2]

In [35]:
# Wrap the places table in a recommender and serialise it with dill for later reuse.
recommender = LocationRecommender(places_df)

with open('Recommendation Model.pkl', mode='wb') as model_file:
    dill.dump(recommender, model_file)

# Example usage

In [39]:
# Restore the saved recommender.
# NOTE(review): dill.load can execute arbitrary code contained in the file --
# only load pickle files you created yourself.
with open('Recommendation Model.pkl', mode='rb') as model_file:
    loaded_recommender = dill.load(model_file)

# Position (0-based) of the user in users_df to run the example for.
user_number = 0
selected_user = users_df.iloc[user_number]

# Example input: the list-valued columns are parsed from their string form by
# dropping brackets and quotes, then splitting on ", ".
user_activities = selected_user['Preferred Activities'].strip("[]").replace("'", "").split(", ")
user_bucket_list = selected_user['Bucket list destinations Sri Lanka'].strip("[]").replace("'", "").split(", ")

print("User Preferred Activities:")
for preferred_activity in user_activities:
    print(f"- {preferred_activity}")

print("\nUser Bucket List Destinations in Sri Lanka:")
for bucket_destination in user_bucket_list:
    print(f"- {bucket_destination}")

best_route = loaded_recommender.recommend_top_places(user_activities, user_bucket_list)
print(f"\nFinal recommended places:")
for stop in best_route:
    print(f"Place: {stop}")

User Preferred Activities:
- cycling
- historical monuments
- village homestays

User Bucket List Destinations in Sri Lanka:
- Polonnaruwa
- Hatton
- Anuradhapura
- Ella
- Haputale

Final recommended places:
Place: Velgam Vehera Buddhist Temple
Place: Polonnaruwa Ancient City
Place: Deegavapi Maha Stupa
Place: Udawatta Kele Sanctuary
Place: Viharamahadevi Park
