# Movie Recommendation System Analysis

This notebook explores the movie ratings dataset and compares the performance of our collaborative filtering and hybrid recommendation models.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from data_processing import load_data, prepare_data, get_movie_features
from recommendation_models import CollaborativeFiltering, ContentBasedFiltering, HybridRecommender
from evaluation import evaluate_recommendations, precision_at_k, recall_at_k
from visualization import plot_rating_distribution, plot_model_comparison, plot_user_activity

## 1. Load and Explore Data

In [None]:
# Load the dataset: load_data() hands back the ratings table and the movies
# table as a pair, which every later cell in this notebook reads from.
ratings, movies = load_data()

# Display basic information: the number of rating rows, plus how many distinct
# userId and movieId values occur in the ratings table.
print(f"Dataset contains {len(ratings)} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies")
# Last expression of the cell, so the notebook renders the first rows of ratings.
ratings.head()

In [None]:
# Explore movie data: preview the first rows of the movies table.
# Later cells rely on its movieId and title columns.
movies.head()

In [None]:
# Bar chart of how often each rating value occurs in `ratings`
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='rating', data=ratings, ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

## 2. Analyze User Activity

In [None]:
# Tally the ratings contributed by each user: one row per userId, with the
# tally stored in the 'counts' column
user_counts = (
    ratings
    .groupby('userId')
    .size()
    .reset_index(name='counts')
)

# Histogram (30 bins) of the per-user tallies with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(user_counts['counts'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Ratings per User')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Count of Users')
plt.show()

# Central tendency of the same per-user tallies
print(f"Average ratings per user: {user_counts['counts'].mean():.2f}")
print(f"Median ratings per user: {user_counts['counts'].median():.2f}")

In [None]:
# Rating tally per movie: one row per movieId, tally in the 'counts' column
movie_counts = (
    ratings
    .groupby('movieId')
    .size()
    .reset_index(name='counts')
)

# Attach titles, then keep the 20 movies with the most ratings
top_movies = (
    movie_counts
    .merge(movies[['movieId', 'title']], on='movieId')
    .sort_values('counts', ascending=False)
    .head(20)
)

# Horizontal bars: one per title, bar length = number of ratings
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(x='counts', y='title', data=top_movies, ax=ax)
ax.set_title('Top 20 Most Rated Movies')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Movie Title')
fig.tight_layout()
plt.show()

## 3. Compare Recommendation Models

In [None]:
# Split data: prepare_data() yields the user-item matrix together with the
# train and test portions of the ratings; get_movie_features() derives the
# movie feature representation that the hybrid model is fitted on.
user_item_matrix, train_data, test_data = prepare_data(ratings)
movie_features = get_movie_features(movies)

# Load pre-trained models if available, otherwise train new ones.
#
# SECURITY NOTE: pickle.load() can execute arbitrary code embedded in the
# file. Only run this cell when models/*.pkl are trusted artifacts produced
# by this project; never point it at files obtained from elsewhere.
try:
    with open('models/cf_model.pkl', 'rb') as f:
        cf_model = pickle.load(f)
    with open('models/hybrid_model.pkl', 'rb') as f:
        hybrid_model = pickle.load(f)
    print("Loaded pre-trained models")
except (
    OSError,                  # file missing or unreadable
    pickle.UnpicklingError,   # corrupted or non-pickle content
    EOFError,                 # truncated file
    AttributeError,           # pickle references a class that no longer exists
    ImportError,              # pickle references a module that cannot be imported
    IndexError,               # malformed pickle stream
) as load_error:
    # Only "saved models are unusable" failures fall back to training. Anything
    # else (KeyboardInterrupt, MemoryError, real bugs) now propagates instead
    # of being silently swallowed as it was by the previous bare `except:`.
    print(f"Could not load pre-trained models ({type(load_error).__name__}: {load_error})")
    print("Training new models")

    # Both models are retrained together, even if only one file failed to
    # load, so the pair always comes from the same training run.
    cf_model = CollaborativeFiltering(k=20)
    cf_model.fit(user_item_matrix)

    hybrid_model = HybridRecommender(cf_weight=0.7)
    hybrid_model.fit(user_item_matrix, movie_features, movies, train_data)

In [None]:
# Compare models on a reproducible random sample of test users
N_SAMPLE_USERS = 5    # how many test users to inspect
TOP_N = 5             # recommendations requested per model / liked titles listed
LIKED_RATING = 4      # a test-set rating at or above this counts as "liked"
SAMPLE_SEED = 42      # fixed seed so Restart & Run All shows the same users


def _show_recommendations(label, model, user_id):
    """Print the titled recommendations `model` produces for `user_id`.

    Nothing is printed when the model returns an empty frame. The frame
    (with titles merged in when non-empty) is returned so the caller can
    keep it for inspection.
    """
    recs = model.recommend_items(user_id, n_recommendations=TOP_N)
    if not recs.empty:
        recs = recs.merge(movies[['movieId', 'title']], on='movieId')
        print(f"\n{label} recommendations:")
        for title, score in zip(recs['title'], recs['score']):
            print(f"- {title} (score: {score:.2f})")
    return recs


candidate_users = test_data['userId'].unique()
rng = np.random.default_rng(SAMPLE_SEED)
# Cap the sample size: sampling without replacement raises ValueError when
# more users are requested than the test split contains.
test_users = rng.choice(
    candidate_users,
    size=min(N_SAMPLE_USERS, len(candidate_users)),
    replace=False,
)

for user_id in test_users:
    print(f"\nRecommendations for User {user_id}:")

    # Same display logic for both models; only the label and model differ
    cf_recs = _show_recommendations("Collaborative filtering", cf_model, user_id)
    hybrid_recs = _show_recommendations("Hybrid", hybrid_model, user_id)

    # Get actual movies the user liked in test set
    user_test = test_data[test_data['userId'] == user_id]
    liked_movies = user_test.loc[user_test['rating'] >= LIKED_RATING, 'movieId'].tolist()
    if liked_movies:
        liked_titles = movies.loc[movies['movieId'].isin(liked_movies), 'title'].tolist()
        print("\nActually liked movies in test set:")
        for title in liked_titles[:TOP_N]:
            print(f"- {title}")

    print("---" * 20)

## 4. Evaluation and Visualization of Results

In [None]:
# Formal evaluation: each call unpacks into precision, recall and hit rate,
# computed with k=10 against the held-out test ratings
print("Evaluating collaborative filtering model...")
cf_precision, cf_recall, cf_hit_rate = evaluate_recommendations(
    cf_model,
    test_data,
    movies,
    k=10,
    verbose=False,
)

print("Evaluating hybrid model...")
hybrid_precision, hybrid_recall, hybrid_hit_rate = evaluate_recommendations(
    hybrid_model,
    test_data,
    movies,
    k=10,
    verbose=False,
)

# Collect the scores: one column per model, one row per metric
metrics = {
    'Collaborative': [cf_precision, cf_recall, cf_hit_rate],
    'Hybrid': [hybrid_precision, hybrid_recall, hybrid_hit_rate],
}
metrics_df = pd.DataFrame(
    metrics,
    index=['Precision@10', 'Recall@10', 'Hit Rate'],
)

# Grouped bar chart comparing the two models metric by metric;
# rot=0 keeps the metric names on the x axis horizontal
ax = metrics_df.plot.bar(figsize=(12, 6), rot=0)
ax.set_title('Comparison of Recommendation Models')
ax.set_ylabel('Score')
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend(title='Model')
plt.show()

# Same numbers as text, to four decimals
print(f"Collaborative Filtering: Precision@10={cf_precision:.4f}, Recall@10={cf_recall:.4f}, Hit Rate={cf_hit_rate:.4f}")
print(f"Hybrid Model: Precision@10={hybrid_precision:.4f}, Recall@10={hybrid_recall:.4f}, Hit Rate={hybrid_hit_rate:.4f}")