In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Configure logging
logging.basicConfig(
    filename='recommendation_system.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Function to load and prepare data
def load_data(events_path, categories_path, genres_path, user_preferences_path):
    """
    Load and prepare data from CSV files
    """
    logging.info("Loading data from CSV files")
    
    # Load CSV files
    events_df = pd.read_csv(events_path)
    categories_df = pd.read_csv(categories_path)
    genres_df = pd.read_csv(genres_path)
    user_preferences_df = pd.read_csv(user_preferences_path)
    
    # Convert date columns to datetime format if needed
    if 'date' in events_df.columns:
        events_df['date'] = pd.to_datetime(events_df['date'])
    
    logging.info(f"Loaded {len(events_df)} events, {len(categories_df)} categories, {len(genres_df)} genres, and {len(user_preferences_df)} user preferences")
    
    return events_df, categories_df, genres_df, user_preferences_df

# Create user-genre matrix for categories with genres
def create_user_genre_matrix(user_preferences_df, genres_df):
    """
    Create a matrix where rows are users and columns are genres
    """
    logging.info("Creating user-genre matrix")
    
    # Filter for preferences with actual genres (not 999)
    genre_preferences = user_preferences_df[user_preferences_df['genre'] != 999]
    
    # Create a matrix with users as rows and genres as columns
    user_genre_matrix = pd.pivot_table(
        genre_preferences,
        values='preference_id',  # Just need a value to indicate preference exists
        index='user_id',
        columns='genre',
        aggfunc='count',
        fill_value=0
    )
    
    # Convert to binary (1 if user likes genre, 0 otherwise)
    user_genre_matrix = (user_genre_matrix > 0).astype(int)
    
    logging.info(f"Created user-genre matrix with shape {user_genre_matrix.shape}")
    
    return user_genre_matrix

# Create user-category matrix for categories without genres
def create_user_category_matrix(user_preferences_df, categories_df):
    """
    Create a matrix where rows are users and columns are categories without genres
    """
    logging.info("Creating user-category matrix")
    
    # Get categories without genres
    categories_without_genres = categories_df[categories_df['has_genre'] == 0]
    
    # Filter preferences for those categories
    category_preferences = user_preferences_df[
        (user_preferences_df['genre'] == 999) & 
        (user_preferences_df['category_id'].isin(categories_without_genres['id']))
    ]
    
    # Create matrix
    user_category_matrix = pd.pivot_table(
        category_preferences,
        values='preference_id',
        index='user_id',
        columns='category_id',
        aggfunc='count',
        fill_value=0
    )
    
    # Convert to binary
    user_category_matrix = (user_category_matrix > 0).astype(int)
    
    logging.info(f"Created user-category matrix with shape {user_category_matrix.shape}")
    
    return user_category_matrix

# Compute user similarity
def compute_user_similarity(user_genre_matrix, user_category_matrix):
    """
    Compute similarity between users based on their preferences
    """
    logging.info("Computing user similarity")
    
    # Get all unique users - convert to list to avoid set error
    all_users = list(set(user_genre_matrix.index).union(set(user_category_matrix.index)))
    
    # Create a combined feature matrix
    combined_features = pd.DataFrame(index=all_users)
    
    # Add genre features (if user exists in user_genre_matrix)
    for user in all_users:
        if user in user_genre_matrix.index:
            for col in user_genre_matrix.columns:
                combined_features.loc[user, f'genre_{col}'] = user_genre_matrix.loc[user, col]
        else:
            for col in user_genre_matrix.columns:
                combined_features.loc[user, f'genre_{col}'] = 0
    
    # Add category features (if user exists in user_category_matrix)
    for user in all_users:
        if user in user_category_matrix.index:
            for col in user_category_matrix.columns:
                combined_features.loc[user, f'category_{col}'] = user_category_matrix.loc[user, col]
        else:
            for col in user_category_matrix.columns:
                combined_features.loc[user, f'category_{col}'] = 0
    
    # Fill NaN values with 0 (for users missing in one of the matrices)
    combined_features = combined_features.fillna(0)
    
    # Compute cosine similarity
    similarity_matrix = pd.DataFrame(
        cosine_similarity(combined_features),
        index=combined_features.index,
        columns=combined_features.index
    )
    
    logging.info(f"Computed similarity matrix with shape {similarity_matrix.shape}")
    
    return similarity_matrix, combined_features

# Get recommendations for a user
def get_recommendations(user_id, similarity_matrix, events_df, categories_df, genres_df, user_preferences_df, top_n=5):
    """
    Get event recommendations for a specific user
    """
    logging.info(f"Generating recommendations for user {user_id}")
    
    # Check if user exists in similarity matrix
    if user_id not in similarity_matrix.index:
        logging.warning(f"User {user_id} not found in similarity matrix")
        return pd.DataFrame()
    
    # Get similar users (excluding the target user)
    similar_users = similarity_matrix.loc[user_id].drop(user_id).sort_values(ascending=False)
    
    # Get top similar users
    top_similar_users = similar_users.head(10).index.tolist()
    
    # Get preferences of similar users
    similar_users_prefs = user_preferences_df[user_preferences_df['user_id'].isin(top_similar_users)]
    
    # Get existing preferences of the target user
    user_existing_prefs = user_preferences_df[user_preferences_df['user_id'] == user_id]
    
    # Categories liked by the user
    user_liked_categories = user_existing_prefs['category_id'].unique()
    
    # Genres liked by the user (exclude 999 which means no genre)
    user_liked_genres = user_existing_prefs[user_existing_prefs['genre'] != 999]['genre'].unique()
    
    # Count preferences by category and genre among similar users
    category_counts = similar_users_prefs['category_id'].value_counts()
    genre_counts = similar_users_prefs[similar_users_prefs['genre'] != 999]['genre'].value_counts()
    
    # Create a score for each event
    event_scores = {}
    
    for _, event in events_df.iterrows():
        event_id = event['id']
        category_id = event['category_id']
        
        # Start with base score
        score = 0
        
        # Add score based on category popularity among similar users
        if category_id in category_counts:
            score += category_counts[category_id] * 2  # Higher weight for category
        
        # Check if this category exists in categories_df and has genres
        category_match = categories_df[categories_df['id'] == category_id]
        if not category_match.empty:
            category_has_genre = category_match['has_genre'].values[0]
            
            if category_has_genre:
                # If event has genre (like concerts), find the genre info
                # For simplicity, let's assume genre info is in the event description
                event_description = event['description'].lower() if isinstance(event['description'], str) else ""
                event_title = event['title'].lower() if isinstance(event['title'], str) else ""
                
                # Look for each genre in the description
                for _, genre_row in genres_df[genres_df['category_id'] == category_id].iterrows():
                    genre_id = genre_row['id']
                    genre_name = genre_row['name'].lower()
                    
                    # If genre is mentioned in description or title
                    if genre_name in event_description or genre_name in event_title:
                        # Add score based on genre popularity
                        if genre_id in genre_counts:
                            score += genre_counts[genre_id]
                        
                        # Boost score if user already likes this genre
                        if genre_id in user_liked_genres:
                            score += 5
        else:
            # Category not found in categories_df, log a warning
            logging.warning(f"Category ID {category_id} not found in categories dataframe for event {event_id}")
        
        # Boost score if user already likes this category
        if category_id in user_liked_categories:
            score += 3
        
        # Store the score
        event_scores[event_id] = score
    
    # Sort events by score
    sorted_events = sorted(event_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Get top N events
    top_event_ids = [event_id for event_id, _ in sorted_events[:top_n]]
    
    # Filter events dataframe for top events
    recommended_events = events_df[events_df['id'].isin(top_event_ids)].copy()
    
    # If no recommendations match the criteria, return empty dataframe
    if recommended_events.empty:
        logging.warning(f"No recommendations generated for user {user_id}")
        return pd.DataFrame()
    
    # Add score to the dataframe
    recommended_events['recommendation_score'] = recommended_events['id'].map(dict(sorted_events))
    
    # Sort by score
    recommended_events = recommended_events.sort_values('recommendation_score', ascending=False)
    
    # Log recommendations
    logging.info(f"Generated {len(recommended_events)} recommendations for user {user_id}")
    
    # Save to showcase file
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    recommended_events.to_csv(f'recommendations_user_{user_id}_{timestamp}.csv', index=False)
    
    return recommended_events

# Evaluate the recommendation system
def evaluate_system(user_preferences_df, recommendations_df):
    """
    Simple evaluation of the recommendation system
    """
    logging.info("Evaluating recommendation system")
    
    # Calculate basic metrics
    unique_users = len(user_preferences_df['user_id'].unique())
    total_recommendations = len(recommendations_df)
    
    # Plot distribution of recommendation scores
    plt.figure(figsize=(10, 6))
    sns.histplot(recommendations_df['recommendation_score'], kde=True)
    plt.title('Distribution of Recommendation Scores')
    plt.xlabel('Score')
    plt.ylabel('Count')
    plt.savefig('recommendation_score_distribution.png')
    
    logging.info(f"Generated recommendations for {unique_users} users")
    logging.info(f"Total recommendations: {total_recommendations}")
    
    return {
        'unique_users': unique_users,
        'total_recommendations': total_recommendations,
        'avg_recommendations_per_user': total_recommendations / unique_users if unique_users > 0 else 0
    }

# Add a simple testing function for small test datasets
def test_with_sample_data():
    """
    Test the recommendation system with a small sample dataset
    """
    # Sample data
    events_data = {
        'id': [1, 2, 3, 4, 5],
        'title': ['Rock Concert', 'Jazz Night', 'Comedy Show', 'Food Festival', 'Art Exhibition'],
        'description': ['Amazing rock concert', 'Smooth jazz evening', 'Hilarious comedy', 'Delicious food', 'Beautiful art'],
        'category_id': [1, 1, 2, 9, 7]
    }
    
    categories_data = {
        'id': [1, 2, 7, 9],
        'name': ['Concert', 'Teatru', 'Targuri si expozitii', 'Evenimente culinare'],
        'has_genre': [1, 1, 0, 0],
        'description': ['Music events', 'Theater events', 'Exhibitions', 'Food events']
    }
    
    genres_data = {
        'id': [1, 2, 3, 13, 14],
        'name': ['Rock', 'Pop', 'Jazz', 'Drama', 'Comedie'],
        'category_id': [1, 1, 1, 2, 2]
    }
    
    user_preferences_data = {
        'preference_id': [1, 2, 3, 4, 5, 6],
        'user_id': [1001, 1001, 1001, 1002, 1002, 1002],
        'category_id': [1, 2, 7, 1, 2, 9],
        'genre': [1, 14, 999, 3, 13, 999]
    }
    
    # Create DataFrames
    events_df = pd.DataFrame(events_data)
    categories_df = pd.DataFrame(categories_data)
    genres_df = pd.DataFrame(genres_data)
    user_preferences_df = pd.DataFrame(user_preferences_data)
    
    # Create matrices
    user_genre_matrix = create_user_genre_matrix(user_preferences_df, genres_df)
    user_category_matrix = create_user_category_matrix(user_preferences_df, categories_df)
    
    # Compute similarity
    similarity_matrix, _ = compute_user_similarity(user_genre_matrix, user_category_matrix)
    
    # Get recommendations for a user
    user_id = 1001
    recommendations = get_recommendations(
        user_id,
        similarity_matrix,
        events_df,
        categories_df,
        genres_df,
        user_preferences_df,
        top_n=3
    )
    
    print(f"Test recommendations for user {user_id}:")
    if not recommendations.empty:
        print(recommendations[['id', 'title', 'description', 'category_id', 'recommendation_score']])
    else:
        print("No recommendations generated")
    
    return recommendations

# Main function to run the recommendation system
def main():
    """
    Main function to run the recommendation system
    """
    logging.info("Starting recommendation system")
    
    # Load data
    events_df, categories_df, genres_df, user_preferences_df = load_data(
        'events.csv',
        'categories.csv',
        'genres.csv',
        'user_preferences.csv'
    )
    
    # Create matrices
    user_genre_matrix = create_user_genre_matrix(user_preferences_df, genres_df)
    user_category_matrix = create_user_category_matrix(user_preferences_df, categories_df)
    
    # Compute similarity
    similarity_matrix, combined_features = compute_user_similarity(user_genre_matrix, user_category_matrix)
    
    # Generate recommendations for all users
    all_recommendations = []
    
    for user_id in user_preferences_df['user_id'].unique():
        recommendations = get_recommendations(
            user_id, 
            similarity_matrix, 
            events_df, 
            categories_df, 
            genres_df, 
            user_preferences_df
        )
        
        if not recommendations.empty:
            all_recommendations.append(recommendations)
    
    # Combine all recommendations
    if all_recommendations:
        all_recommendations_df = pd.concat(all_recommendations)
        
        # Save all recommendations to a showcase file
        all_recommendations_df.to_csv('all_recommendations.csv', index=False)
        
        # Evaluate the system
        evaluation_metrics = evaluate_system(user_preferences_df, all_recommendations_df)
        
        # Save evaluation metrics
        with open('evaluation_metrics.txt', 'w') as f:
            for key, value in evaluation_metrics.items():
                f.write(f"{key}: {value}\n")
        
        logging.info(f"Evaluation metrics: {evaluation_metrics}")
    else:
        logging.warning("No recommendations generated")
    
    logging.info("Recommendation system completed")

# Function to demonstrate sample usage
def sample_usage():
    """
    Demonstrate sample usage of the recommendation system
    """
    try:
        # Attempt to load data
        try:
            events_df = pd.read_csv('events.csv')
            categories_df = pd.read_csv('categories.csv')
            genres_df = pd.read_csv('genres.csv')
            user_preferences_df = pd.read_csv('user_preferences.csv')
            
            print("CSV files loaded successfully!")
        except Exception as e:
            print(f"Error loading CSV files: {e}")
            print("Testing with sample data instead...")
            return test_with_sample_data()
        
        # Create matrices
        user_genre_matrix = create_user_genre_matrix(user_preferences_df, genres_df)
        user_category_matrix = create_user_category_matrix(user_preferences_df, categories_df)
        
        # Compute similarity
        similarity_matrix, _ = compute_user_similarity(user_genre_matrix, user_category_matrix)
        
        # Get recommendations for a specific user
        user_id = 1001  # Example user
        recommendations = get_recommendations(
            user_id,
            similarity_matrix,
            events_df,
            categories_df,
            genres_df,
            user_preferences_df,
            top_n=3
        )
        
        print(f"Top recommendations for user {user_id}:")
        if not recommendations.empty:
            print(recommendations[['id', 'title', 'description', 'category_id', 'recommendation_score']])
        else:
            print("No recommendations were generated.")
            
        return recommendations
    
    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the system if this script is executed directly
if __name__ == "__main__":
    # Uncomment to run the full system
    # main()
    
    # For demonstration, show sample usage
    sample_usage()

CSV files loaded successfully!
Top recommendations for user 1001:
                                                  id  title  description  \
0  1,"123 Main St, Cityville",75.00,"2025-03-07 1...    NaN          NaN   

   category_id  recommendation_score  
0          NaN                     0  
