In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import ast
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from IPython.display import display
from IPython.display import Image

/kaggle/input/travel/users.csv
/kaggle/input/travel/places.csv
/kaggle/input/travel/user_interactions.csv


In [23]:
def read_data():
    places_df = pd.read_csv("/kaggle/input/travel/places.csv")
    user_interactions_df = pd.read_csv("/kaggle/input/travel/user_interactions.csv")
    users_df = pd.read_csv("/kaggle/input/travel/users.csv")
    return places_df, user_interactions_df, users_df

In [24]:
original_places_df, original_user_interactions_df, original_users_df = read_data()

In [2]:
!pip install lightfm

  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808329 sha256=a6a756a573cfa68de9c55f295226fa62bbd705e5c6ece6cd80d61faaa5aaca10
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [3]:
def preprocess_places_data(places_df):
    tags = list(set([i.strip() for i in ','.join(places_df['tags']).split(',')]))
    for tag in tags:
        places_df[tag] = places_df['tags'].apply(lambda x: 1 if tag in x else 0)
    category_features = pd.get_dummies(places_df['category'], prefix='category', dtype=int)
    places_df = pd.concat([places_df, category_features], axis=1)
    return places_df


def preprocess_user_interaction_data(user_interactions_df):
    interest_strength = {"like": 1,
                         "visit" : 3,
                         "add_to_list": 2
                         }
    
    user_interactions_df['weighted_interaction'] = user_interactions_df['interaction_type'].apply(lambda x: interest_strength[x])
    # display(user_interactions_df)

    # Create interaction matrix

    return user_interactions_df

def preprocess_users_data(user_df):
    user_df['list_of_places'] = user_df['list_of_places'].apply(lambda x: ast.literal_eval(x))
    return user_df

places_df = preprocess_places_data(original_places_df.copy())
user_interaction_df = preprocess_user_interaction_data(original_user_interactions_df.copy())
user_df = preprocess_users_data(original_users_df.copy())


NameError: name 'original_places_df' is not defined

There are no data that has list_of_places lower than 5, so we dont need to worry about no recommendation/zero-start
I am assuming there are no redundant data as in a person that has marked one place as liked, visited and add_to_list

NO NA in all 3 datasets so no need to handle Null values

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from collections import defaultdict
import time
import ast
from typing import List, Dict, Set, Tuple
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from lightfm import LightFM
from lightfm.data import Dataset

from IPython.display import display
from IPython.display import Image

class PlaceRecommender:
    def __init__(self):
        self.users_df = None
        self.places_df = None
        self.interactions_df = None
        self.place_features = None
        self.user_profiles = None
        # self.mlb = MultiLabelBinarizer()
        self.train_interactions = pd.DataFrame()
        self.test_interactions = pd.DataFrame()
        self.svd_model = None
        self.interaction_matrix = None
        self.user_factors = None
        self.place_factors = None
        # self.popularity = None ???? We can add this too for recommending highly popular places
        self.lightfm_model = None
        self.lightfm_dataset = None
        self.user_features_matrix = None
        self.item_features_matrix = None
        self.user_id_map = {}
        self.place_id_map = {}
    
    
    def preprocess_places_data(self, places_df):
        tags = list(set([i.strip() for i in ','.join(places_df['tags']).split(',')]))
        for tag in tags:
            places_df[tag] = places_df['tags'].apply(lambda x: 1 if tag in x else 0)
        category_features = pd.get_dummies(places_df['category'], prefix='category', dtype=int)
        tags.extend(list(category_features.columns.values))
        places_df = pd.concat([places_df, category_features], axis=1)
        scaler = StandardScaler()
        place_features = places_df[tags]
        place_features = pd.DataFrame(
            scaler.fit_transform(place_features),
            index = place_features.index,
            columns=place_features.columns
        )
        
        return places_df, place_features


    def preprocess_user_interaction_data(self, user_interactions_df):
        interest_strength = {"like": 1,
                             "visit" : 3,
                             "add_to_list": 2
                             }
        
        user_interactions_df['weighted_interaction'] = user_interactions_df['interaction_type'].apply(lambda x: interest_strength[x])
        # Maybe interaction matrix?
        return user_interactions_df
    
    def preprocess_users_data(self, user_df):
        user_df['list_of_places'] = user_df['list_of_places'].apply(lambda x: ast.literal_eval(x))
        return user_df

    def preprocess_lightfm_data(self):
        self.lightfm_dataset = Dataset()
    
        # Get all possible user feature values
        user_features = []
        for column in self.users_df.columns[1:]:  # Exclude user_id
            user_features.extend(self.users_df[column].astype(str).unique())
        
        # Get all possible item feature values
        item_features = list(self.places_df['category'].unique())
        # Add all unique tags
        all_tags = set()
        for tags in self.places_df['tags']:
            if isinstance(tags, list):
                all_tags.update(tags)
            else:  # If tags is a string
                all_tags.update(str(tags).split(','))
        item_features.extend(list(all_tags))
        
        # Fit the dataset with all possible feature values
        self.lightfm_dataset.fit(
            users=self.interactions_df['user_id'].unique(),
            items=self.interactions_df['place_id'].unique(),
            user_features=user_features,
            item_features=item_features
        )
        
        # Build interaction matrix
        (interactions, weights) = self.lightfm_dataset.build_interactions(
            ((row['user_id'], row['place_id'], row['weighted_interaction']) 
             for _, row in self.train_interactions.iterrows())
        )
        
        # Build user features
        user_features_list = []
        for user_id in self.interactions_df['user_id'].unique():
            if user_id in self.users_df['user_id'].values:
                user_row = self.users_df[self.users_df['user_id'] == user_id].iloc[0]
                # Convert all values to strings
                features = [str(val) for val in user_row.values[1:]]
                user_features_list.append((user_id, features))
        
        user_features = self.lightfm_dataset.build_user_features(user_features_list)
        
        # Build item features
        item_features_list = []
        for place_id in self.interactions_df['place_id'].unique():
            if place_id in self.places_df.index:
                place_row = self.places_df.loc[place_id]
                features = []
                features.append(str(place_row['category']))
                if isinstance(place_row['tags'], list):
                    features.extend([str(tag) for tag in place_row['tags']])
                else:
                    features.extend([str(tag.strip()) for tag in place_row['tags'].split(',')]) #this is not needed
                item_features_list.append((place_id, features))
        
        item_features = self.lightfm_dataset.build_item_features(item_features_list)
        
        return interactions, weights, user_features, item_features


    def train_lightfm(self):
        """Train LightFM model."""
        print("Training LightFM model...")
        self.lightfm_model = LightFM(
            learning_rate=0.005,
            loss='warp',
            no_components=50,
            random_state=42
        )
        interactions, weights, user_features, item_features = self.preprocess_lightfm_data()
        self.lightfm_model.fit(
            interactions=interactions,
            sample_weight=weights,
            user_features=user_features,
            item_features=item_features,
            epochs=30,
            verbose=True
        )
        
        self.user_features_matrix = user_features
        self.item_features_matrix = item_features
        
    def get_lightfm_scores(self, user_id: int) -> np.ndarray:
        """Get recommendation scores from LightFM model."""
        if user_id not in self.user_id_map:
            return np.zeros(len(self.places_df))
            
        # Get internal user ID
        user_idx = self.lightfm_dataset.mapping()[0][user_id]
        
        # Predict scores for all items
        scores = self.lightfm_model.predict(
            user_ids=[user_idx],
            item_ids=np.arange(len(self.places_df)),
            user_features=self.user_features_matrix,
            item_features=self.item_features_matrix
        )
        
        return scores
        
    def load_and_preprocess_data(self, users_data, places_data, interactions_data, test_size=0.2):
        """Load and preprocess all necessary data."""
        # Load dataframes
        self.users_df = self.preprocess_users_data(users_data)
        self.places_df, self.place_features = self.preprocess_places_data(places_data)
        self.interactions_df = self.preprocess_user_interaction_data(interactions_data)
        grouped_user_interactions_df = self.interactions_df.groupby('user_id')
        for _, group in grouped_user_interactions_df:
            if len(group) > 2: #We would need to worry about zero start users.
                train, test = train_test_split(group, test_size=test_size, random_state=42)
                # display(train, test)
                self.train_interactions = pd.concat([self.train_interactions, train])
                self.test_interactions = pd.concat([self.test_interactions, test])
                
        user_indices = self.interactions_df['user_id'].astype('category').cat.codes #we might need this for data that is not sequential, right now is fine
        place_indices = self.interactions_df['place_id'].astype('category').cat.codes
        # display(place_indices, place_indices.unique().shape)

        self.interaction_matrix = csr_matrix((self.interactions_df['weighted_interaction'], (user_indices, place_indices)), shape=(len(user_indices.unique()), len(place_indices)))
        # display(self.interaction_matrix.shape)
        self.svd_model = TruncatedSVD(n_components=50, random_state=42)
        self.user_factors = self.svd_model.fit_transform(self.interaction_matrix)
        self.place_factors = self.svd_model.components_.T
        ### We can also add how popular a place is?
        self.train_lightfm()
    
        
    
    def build_user_profiles(self):
        
        self.user_profiles = defaultdict(lambda: np.zeros(self.place_features.shape[1]))
        
        # Process explicit interactions (likes and add_to_list)
        for _, row in self.interactions_df.iterrows():
            user_id = row['user_id']
            place_id = row['place_id']
            interaction_type = row['interaction_type']
            
            # Weight different interaction types
            weight = row['weighted_interaction']
            
            # Update user profile
            if place_id in self.places_df.index:
                place_feature_values = self.place_features.iloc[place_id-1].values.astype(np.float64)
                self.user_profiles[user_id] += weight * place_feature_values
        
        # Normalize
        for user_id in self.user_profiles:
            profile = self.user_profiles[user_id]
            if np.any(profile):
                self.user_profiles[user_id] = profile / np.linalg.norm(profile)
    
    def get_recommendations(self, user_id, n_recommendations= 1):
        """Weight to cf and similarities is 0.4 and 0.6 respectively, these hyperparameters can be tuned to find a better answer"""
        start_time = time.time()
        
        if user_id not in self.user_profiles:
            return []
        
        # Calculate similarity between user profile and all places
        cf_scores = np.zeros(len(self.places_df))
        user_profile = self.user_profiles[user_id]
        similarities = cosine_similarity([user_profile], self.place_features)[0]
        place_indices = self.interactions_df['place_id'].astype('category').cat.codes.unique()
        user_idx = self.interactions_df['user_id'].astype('category').cat.codes[self.interactions_df['user_id'] == user_id].iloc[0]
        cf_predictions = np.dot(self.user_factors[user_idx], self.place_factors.T)
        for idx, score in zip(place_indices, cf_predictions):
            cf_scores[idx] = score
        lightfm_scores = self.get_lightfm_scores(user_id)
        # display(similarities.shape, cf_scores.shape)
        combined_scores = 0.3 * similarities + 0.4 * cf_scores + 0.3 * lightfm_scores
        # print(combined_scores)
        
        # Get places the user hasn't interacted with
        user_interactions = set(self.train_interactions[self.train_interactions['user_id'] == user_id]['place_id'])
        # display(user_interactions)
        # Creating list of (place_id, similarity) tuples for places user hasn't interacted with
        place_similarities = [
            (place_id, sim) for place_id, sim in enumerate(combined_scores, 1)
            if place_id not in user_interactions
        ]
        # display(place_similarities)
        #####HMMM UNSURE WHY Chaning learning rate has no effect or changinf weight of each 
        # Sorting by similarity
        recommendations = sorted(place_similarities, key=lambda x: x[1], reverse=True)[:n_recommendations]
        # print(recommendations)
        
        detailed_recommendations = []
        for place_id, similarity in recommendations:
            place = self.places_df.iloc[place_id-1]
            detailed_recommendations.append({
                'place_id': place_id,
                'place_name': place['place_name'],
                'category': place['category'],
                'tags': place['tags'],
                'location': place['location'],
                'similarity_score': similarity,
                'response_time': time.time() - start_time
            })
        # display(detailed_recommendations)
        return detailed_recommendations

    def evaluate_precision_at_k(self, test_users, k):
        precisions = []
        
        for user_id in self.test_interactions['user_id'].unique():
            recommendations = self.get_recommendations(user_id, k)
            if not recommendations:
                continue
                
            # Get test set places for this user
            test_places = set(self.test_interactions[self.test_interactions['user_id'] == user_id]['place_id'])
            
            # Calculate precision
            recommended_places = {rec['place_id'] for rec in recommendations}
            if recommended_places:
                precision = len(test_places.intersection(recommended_places)) / len(recommended_places)
                precisions.append(precision)
        return np.mean(precisions) if precisions else 0.0

    def evaluate_recall_at_k(self, test_users, k):
        recalls = []
        
        for user_id in self.test_interactions['user_id'].unique():
            recommendations = self.get_recommendations(user_id, k)
            if not recommendations:
                continue
                
            # Get test set places for this user
            test_places = set(self.test_interactions[self.test_interactions['user_id'] == user_id]['place_id'])
            
            if test_places:
                recommended_places = {rec['place_id'] for rec in recommendations}
                recall = len(test_places.intersection(recommended_places)) / len(test_places)
                recalls.append(recall)
        
        return np.mean(recalls) if recalls else 0.0

    def evaluate_map(self, test_users, k):
        ap_scores = []
        
        for user_id in self.test_interactions['user_id'].unique():
            recommendations = self.get_recommendations(user_id, k)
            if not recommendations:
                continue
                
            test_places = set(self.test_interactions[self.test_interactions['user_id'] == user_id]['place_id'])
            
            if not test_places:
                continue
            relevant_count = 0
            precisions = []
            
            for i, rec in enumerate(recommendations, 1):
                if rec['place_id'] in test_places:
                    relevant_count += 1
                    precisions.append(relevant_count / i)
            
            if precisions:
                ap_scores.append(sum(precisions) / len(test_places))
        
        return np.mean(ap_scores) if ap_scores else 0.0


    def evaluate_response_time(self, test_users, k):
        response_times = []
        
        for user_id in test_users:
            start_time = time.time()
            self.get_recommendations(user_id, n_recommendations=k)
            response_time = time.time() - start_time
            response_times.append(response_time)
        
        return {
            'mean_response_time': np.mean(response_times),
            'max_response_time': np.max(response_times),
            'min_response_time': np.min(response_times),
            'std_response_time': np.std(response_times)
        }

    def run_comprehensive_evaluation(self, test_users, k=1):
        results = {
            'precision_at_k': self.evaluate_precision_at_k(test_users, k),
            'recall_at_k': self.evaluate_recall_at_k(test_users, k),
            'mean_average_precision': self.evaluate_map(test_users, k),
            'response_time_metrics': self.evaluate_response_time(test_users, k)
        }
        return results

def main():
    # Initialize recommender
    k = 10
    recommender = PlaceRecommender()
    
    # Load and preprocess data
    places_df = pd.read_csv("/kaggle/input/travel/places.csv")
    user_interactions_df = pd.read_csv("/kaggle/input/travel/user_interactions.csv")
    users_df = pd.read_csv("/kaggle/input/travel/users.csv")
    
    recommender.load_and_preprocess_data(users_df, places_df, user_interactions_df)
    recommender.build_user_profiles()
    test_users = list(range(1,10)) 
    
    evaluation_results = recommender.run_comprehensive_evaluation(test_users, k)
    # display(evaluation_results)
    # Print evaluation results
    print("\nEvaluation Results:")
    print(f"Precision@{k}: {evaluation_results['precision_at_k']:.3f}")
    print(f"Recall@{k} {evaluation_results['recall_at_k']:.3f}")
    print(f"Mean Average Precision: {evaluation_results['mean_average_precision']:.3f}")
    print("\nResponse Time Metrics:")
    for metric, value in evaluation_results['response_time_metrics'].items():
        print(f"{metric}: {value*1000:.2f}ms")

if __name__ == "__main__":
    main()    

Training LightFM model...


Epoch: 100%|██████████| 30/30 [00:00<00:00, 629.97it/s]



Evaluation Results:
Precision@10: 0.032
Recall@10 0.216
Mean Average Precision: 0.148

Response Time Metrics:
mean_response_time: 3.22ms
max_response_time: 3.78ms
min_response_time: 3.06ms
std_response_time: 0.21ms
