In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from collections import Counter
from sklearn.decomposition import PCA
import json
import csv

# Function to parse CSV files with potential formatting issues
def parse_csv_with_quoted_fields(file_path):
    with open(file_path, 'r') as f:
        # Use csv module to handle quoted fields correctly
        reader = csv.reader(f)
        header = next(reader)
        data = []
        for row in reader:
            data.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(data, columns=header)
    return df

# Load user preferences - this seems to be loading correctly, so use standard method
user_prefs = pd.read_csv('user_preferences.csv')

# Load event genres - this seems to be loading correctly, so use standard method
event_genres = pd.read_csv('event_genres.csv')

# For events, we need special handling due to complex fields with commas
try:
    # First try the custom parsing approach
    events_raw = parse_csv_with_quoted_fields('events.csv')
    
    # Create proper events DataFrame with appropriate data types
    events = pd.DataFrame({
        'id': events_raw['id'].astype(int),
        'address': events_raw['address'],
        'cheapest_ticket': events_raw['cheapest_ticket'].astype(float),
        'created_at': events_raw['created_at'],
        'date': events_raw['date'],
        'description': events_raw['description'],
        'latitude': events_raw['latitude'].astype(float),
        'longitude': events_raw['longitude'].astype(float),
        'location': events_raw['location'],
        'ticket_prices': events_raw['ticket_prices'],
        'title': events_raw['title'],
        'updated_at': events_raw['updated_at'],
        'category_id': events_raw['category_id'].astype(int),
        'created_by': events_raw['created_by'].astype(int)
    })
except Exception as e:
    print(f"Error with custom parsing: {e}")
    print("Trying alternative parsing method...")
    
    # Alternative approach: If CSV has been mangled somehow
    # Read the first line to see how many commas we have
    with open('events.csv', 'r') as f:
        header = f.readline().strip()
        first_row = f.readline().strip()
    
    # If the first row contains all data in a single field, we need to parse it manually
    if ',' not in first_row or len(first_row.split(',')) <= 2:
        print("Detected incorrectly formatted events CSV. Attempting manual parsing...")
        
        # Create a minimal events DataFrame for testing
        events = pd.DataFrame([
            {
                'id': 1,
                'title': 'Live Rock Concert v4',
                'description': 'A thrilling rock concert featuring popular bands.',
                'category_id': 1,
                'created_by': 4
            }
        ])
        
        # Add the other sample events
        for i in range(2, 26):
            events = events.append({
                'id': i,
                'title': f'Sample Event {i}',
                'description': f'Description for event {i}',
                'category_id': (i % 15) + 1,  # Cycle through 15 categories
                'created_by': (i % 6) + 1     # Cycle through 6 creators
            }, ignore_index=True)
    else:
        # Try reading with pandas but with a large field size limit
        import sys
        maxInt = sys.maxsize
        while True:
            try:
                csv.field_size_limit(maxInt)
                break
            except OverflowError:
                maxInt = int(maxInt/10)
        
        try:
            events = pd.read_csv('events.csv')
            # Ensure category_id is properly typed
            events['category_id'] = pd.to_numeric(events['category_id'], errors='coerce').fillna(1).astype(int)
        except Exception as e2:
            print(f"Still having issues: {e2}")
            # Create a dummy events dataset as a last resort
            events = pd.DataFrame([
                {
                    'id': 1,
                    'title': 'Live Rock Concert v4',
                    'description': 'A thrilling rock concert featuring popular bands.',
                    'category_id': 1,
                    'created_by': 4
                }
            ])

# Make sure our DataFrames have the expected structure and data types
print("\nValidating datasets:")
print(f"User preferences: {user_prefs.shape[0]} rows, {user_prefs.shape[1]} columns")
print(f"Events: {events.shape[0]} rows, {events.shape[1]} columns")
print(f"Event genres: {event_genres.shape[0]} rows, {event_genres.shape[1]} columns")

print("\nChecking data types:")
print("User preferences:")
print(user_prefs.dtypes)
print("\nEvents (key columns):")
if 'id' in events.columns and 'category_id' in events.columns:
    print(f"Event ID type: {events['id'].dtype}")
    print(f"Event category_id type: {events['category_id'].dtype}")
else:
    print("Events dataframe missing expected columns")
print("\nEvent genres:")
print(event_genres.dtypes)

# Ensure all IDs are integers for proper matching
user_prefs['user_id'] = user_prefs['user_id'].astype(int)
user_prefs['genre'] = user_prefs['genre'].astype(int)
user_prefs['category_id'] = user_prefs['category_id'].astype(int)

event_genres['event_id'] = event_genres['event_id'].astype(int)
event_genres['genre_id'] = event_genres['genre_id'].astype(int)

# Create genre features
user_genre_matrix = user_prefs.pivot_table(
    index='user_id', 
    columns='genre', 
    aggfunc='size', 
    fill_value=0
)

# Create category features
user_category_matrix = user_prefs.pivot_table(
    index='user_id', 
    columns='category_id', 
    aggfunc='size', 
    fill_value=0
)

# Combine both feature sets with unique column names
genre_cols = [f'genre_{c}' for c in user_genre_matrix.columns]
category_cols = [f'category_{c}' for c in user_category_matrix.columns]

user_genre_matrix.columns = genre_cols
user_category_matrix.columns = category_cols

combined_features = pd.concat([user_genre_matrix, user_category_matrix], axis=1)

# Apply DBSCAN clustering - adjust parameters as needed
X = combined_features.values
db = DBSCAN(eps=2.0, min_samples=2).fit(X)  # Increased eps, decreased min_samples
labels = db.labels_

# Add cluster labels to the user feature matrix
combined_features['cluster'] = labels
print("\nUser Clusters:")
cluster_counts = combined_features['cluster'].value_counts().sort_index()
print(f"Number of clusters: {len(cluster_counts)}")
print(f"Cluster sizes: {cluster_counts.to_dict()}")
print(f"Users in cluster -1 (noise): {cluster_counts.get(-1, 0)}")

# Enhanced recommendation function with debugging
def recommend_events_for_user(user_id, user_clusters, user_prefs, events, event_genres, top_n=5, debug=False):
    """
    Recommend events for a user based on their cluster's preferences with debug info
    """
    if debug:
        print(f"\nGenerating recommendations for user {user_id}")
    
    # Get the user's cluster
    if user_id not in user_clusters.index:
        if debug:
            print(f"User {user_id} not found in the dataset")
        return []
    
    user_cluster = user_clusters.loc[user_id, 'cluster']
    if debug:
        print(f"User belongs to cluster {user_cluster}")
    
    # Find other users in the same cluster
    cluster_users = user_clusters[user_clusters['cluster'] == user_cluster].index.tolist()
    if debug:
        print(f"Users in the same cluster: {cluster_users}")
    
    # Get genres and categories preferred by users in this cluster
    cluster_prefs = user_prefs[user_prefs['user_id'].isin(cluster_users)]
    
    # Count genres and categories in the cluster
    popular_genres = Counter(cluster_prefs['genre']).most_common()
    popular_categories = Counter(cluster_prefs['category_id']).most_common()
    
    if debug:
        print(f"Popular genres in cluster: {popular_genres[:5]}")
        print(f"Popular categories in cluster: {popular_categories[:5]}")
        print(f"Available events: {events['id'].tolist()}")
        print(f"Event genres available: {event_genres['event_id'].unique().tolist()}")
    
    # Find events that match these popular genres and categories
    recommended_events = []
    
    # First try to match both genre and category
    for genre_id, _ in popular_genres:
        for category_id, _ in popular_categories:
            if debug:
                print(f"Looking for events with genre {genre_id} and category {category_id}")
            
            # Find events with this category
            matching_events = events[events['category_id'] == category_id]['id'].tolist()
            if debug and matching_events:
                print(f"  Found {len(matching_events)} events with category {category_id}: {matching_events}")
                
            # Filter for events that also have this genre
            for event_id in matching_events:
                matching_genre = event_genres[event_genres['event_id'] == event_id]['genre_id'].tolist()
                if genre_id in matching_genre:
                    if debug:
                        print(f"  Event {event_id} also has genre {genre_id}")
                    
                    event_details = events[events['id'] == event_id].iloc[0]
                    recommended_events.append({
                        'id': event_id,
                        'title': event_details['title'],
                        'description': event_details.get('description', 'No description available'),
                        'category_id': category_id,
                        'genre_id': genre_id,
                        'match_score': 1.0  # Perfect match
                    })
    
    # If we don't have enough recommendations, add events that match just the category
    if len(recommended_events) < top_n:
        if debug:
            print("Not enough recommendations, adding category-only matches")
            
        for category_id, _ in popular_categories:
            matching_events = events[events['category_id'] == category_id]['id'].tolist()
            for event_id in matching_events:
                # Check if we already recommended this event
                if not any(rec['id'] == event_id for rec in recommended_events):
                    if debug:
                        print(f"  Adding category-only match for event {event_id}, category {category_id}")
                        
                    event_details = events[events['id'] == event_id].iloc[0]
                    recommended_events.append({
                        'id': event_id,
                        'title': event_details['title'],
                        'description': event_details.get('description', 'No description available'),
                        'category_id': category_id,
                        'match_score': 0.7  # Category match only
                    })
    
    # If still not enough, look for events with matching genres
    if len(recommended_events) < top_n:
        if debug:
            print("Still not enough recommendations, adding genre-only matches")
            
        for genre_id, _ in popular_genres:
            matching_event_ids = event_genres[event_genres['genre_id'] == genre_id]['event_id'].tolist()
            for event_id in matching_event_ids:
                # Check if this event exists in our events dataframe
                if event_id not in events['id'].values:
                    if debug:
                        print(f"  Event {event_id} from event_genres not found in events table")
                    continue
                    
                # Check if we already recommended this event
                if not any(rec['id'] == event_id for rec in recommended_events):
                    if debug:
                        print(f"  Adding genre-only match for event {event_id}, genre {genre_id}")
                        
                    event_details = events[events['id'] == event_id].iloc[0]
                    recommended_events.append({
                        'id': event_id,
                        'title': event_details['title'],
                        'description': event_details.get('description', 'No description available'),
                        'genre_id': genre_id,
                        'match_score': 0.5  # Genre match only
                    })
    
    # Last resort: if we still have no recommendations, just recommend any events
    if len(recommended_events) == 0 and len(events) > 0:
        if debug:
            print("No matches found, adding some default recommendations")
            
        for _, event_row in events.head(top_n).iterrows():
            recommended_events.append({
                'id': event_row['id'],
                'title': event_row['title'],
                'description': event_row.get('description', 'No description available'),
                'match_score': 0.2  # Default recommendation
            })
    
    if debug:
        print(f"Final recommendations count: {len(recommended_events)}")
        
    # Return top N recommendations
    return recommended_events[:top_n]

# Test the recommendation system with debugging
print("\nTesting recommendation system with debugging:")
target_user_id = user_prefs['user_id'].iloc[0]  # Using the first user as an example
recommendations = recommend_events_for_user(target_user_id, combined_features, user_prefs, events, event_genres, debug=True)

print(f"\nRecommendations for User {target_user_id}:")
if recommendations:
    for rec in recommendations:
        print(f"Event: {rec['title']} (ID: {rec['id']})")
        print(f"Description: {rec.get('description', 'No description')}")
        print(f"Match Score: {rec['match_score']}")
        print("---")
else:
    print("No recommendations found.")

# Evaluate the recommendation system
def evaluate_recommendations(user_clusters, user_prefs, events, event_genres):
    """
    Evaluate how well the recommendation system works by testing it on each user
    """
    results = []
    
    for user_id in user_clusters.index:
        # Get recommendations for this user
        recommendations = recommend_events_for_user(user_id, user_clusters, user_prefs, events, event_genres)
        
        # Record how many recommendations we got
        results.append({
            'user_id': user_id,
            'cluster': user_clusters.loc[user_id, 'cluster'],
            'num_recommendations': len(recommendations),
            'avg_match_score': np.mean([rec['match_score'] for rec in recommendations]) if recommendations else 0
        })
    
    return pd.DataFrame(results)

# Evaluate the recommendation system
print("\nRunning full evaluation...")
evaluation = evaluate_recommendations(combined_features, user_prefs, events, event_genres)
print("\nRecommendation System Evaluation Summary:")
print(evaluation.describe())

# Print detail on recommendations by cluster
cluster_eval = evaluation.groupby('cluster')[['num_recommendations', 'avg_match_score']].mean()
print("\nAverage recommendations by cluster:")
print(cluster_eval)

# Visualize the evaluation results
plt.figure(figsize=(12, 6))
sns.barplot(x='cluster', y='num_recommendations', data=evaluation)
plt.title('Average Number of Recommendations by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Number of Recommendations')
plt.savefig('recommendations_by_cluster.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.barplot(x='cluster', y='avg_match_score', data=evaluation)
plt.title('Average Match Score by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Average Match Score')
plt.savefig('match_scores_by_cluster.png')
plt.close()

print("\nRecommendation system analysis complete!")
print("Generated visualization files:")
print("- recommendations_by_cluster.png")
print("- match_scores_by_cluster.png")

# Finally, provide recommendations for a few example users
print("\nExample recommendations for selected users:")
for test_user_id in [user_prefs['user_id'].iloc[0], user_prefs['user_id'].iloc[-1]]:
    recommendations = recommend_events_for_user(test_user_id, combined_features, user_prefs, events, event_genres)
    
    print(f"\nTop recommendations for User {test_user_id}:")
    if recommendations:
        for rec in recommendations:
            print(f"Event: {rec['title']} (ID: {rec['id']})")
            print(f"Description: {rec.get('description', 'No description')}")
            print(f"Match Score: {rec['match_score']}")
            print("---")
    else:
        print("No recommendations found.")

Error with custom parsing: 14 columns passed, passed data had 1 columns
Trying alternative parsing method...

Validating datasets:
User preferences: 957 rows, 4 columns
Events: 25 rows, 14 columns
Event genres: 50 rows, 2 columns

Checking data types:
User preferences:
preference_id    int64
genre            int64
category_id      int64
user_id          int64
dtype: object

Events (key columns):
Event ID type: object
Event category_id type: int64

Event genres:
event_id    int64
genre_id    int64
dtype: object

User Clusters:
Number of clusters: 4
Cluster sizes: {-1: 78, 0: 2, 1: 2, 2: 3}
Users in cluster -1 (noise): 78

Testing recommendation system with debugging:

Generating recommendations for user 1087
User belongs to cluster -1
Users in the same cluster: [1002, 1005, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1025, 1026, 1027, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1040, 1041, 1042, 1043, 1044, 1045, 