# 🎬 Movie Recommendation System - Data Exploration

**Project**: Movie Recommendation Engine  
**Author**: Ayush Jaggi  
**Date**: September 2024

---

## 🎯 Notebook Overview

Welcome to the first part of my movie recommendation system! In this notebook, I'll explore the MovieLens dataset to understand:

- 🎭 **Movie characteristics** - genres, release years, popularity
- 👥 **User behavior** - rating patterns, preferences, demographics
- ⭐ **Rating distributions** - how people rate movies
- 📊 **Data quality** - missing values, outliers, data consistency

This exploration will guide my recommendation algorithm design and help identify interesting business insights!

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configure the look shared by every matplotlib/seaborn chart below
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Greet the reader and record when this run began
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("🎬 Welcome to Movie Recommendation System Analysis!")
print(f"📅 Analysis started at: {started_at}")
print("🚀 Let's explore some movie data!")

## 📊 Data Loading & Initial Exploration

Let's load the MovieLens dataset and take a first look at the data structure.

In [None]:
# Read both MovieLens CSV files and print a short size summary
print("📥 Loading MovieLens dataset...")

# Movie catalog
movies_df = pd.read_csv('../data/movies.csv')
print(f"🎬 Movies dataset shape: {movies_df.shape}")

# User ratings
ratings_df = pd.read_csv('../data/ratings.csv')
print(f"⭐ Ratings dataset shape: {ratings_df.shape}")

# The timestamp column is interpreted as seconds since the epoch; convert it
# once and reuse the resulting years for both ends of the rating period.
rating_years = pd.to_datetime(ratings_df['timestamp'], unit='s').dt.year
n_movies = len(movies_df)
n_users = ratings_df['userId'].nunique()
n_ratings = len(ratings_df)

# Headline numbers
print("\n📊 Dataset Overview:")
print(f"   🎭 Total Movies: {n_movies:,}")
print(f"   👥 Total Users: {n_users:,}")
print(f"   ⭐ Total Ratings: {n_ratings:,}")
print(f"   📅 Rating Period: {rating_years.min()} - {rating_years.max()}")

In [None]:
# Show the first rows, column names and dtypes of each dataset in turn
dataset_views = [
    ("🔍 Movies Dataset Structure:", movies_df),
    ("⭐ Ratings Dataset Structure:", ratings_df),
]

for position, (heading, frame) in enumerate(dataset_views):
    # A ruler separates the second dataset's report from the first
    if position > 0:
        print("\n" + "=" * 50)
    print(heading)
    print(frame.head())
    print(f"\nColumns: {list(frame.columns)}")
    print(f"Data types:\n{frame.dtypes}")

## 🎭 Movie Analysis

Let's dive deep into our movie catalog: which genres are popular, when movies were released, and how they are distributed.

In [None]:
# Extract release year from movie titles.
# The pattern is anchored to the END of the title (trailing whitespace allowed)
# so that it targets exactly the "(YYYY)" group that the cleaning step below
# removes. An unanchored search would instead return the first parenthesised
# four-digit number, which can differ from the trailing one.
year_pattern = r'\((\d{4})\)\s*$'
extracted_year = movies_df['title'].str.extract(year_pattern, expand=False)

# Titles without a trailing "(YYYY)" produce NaN rather than raising
movies_df['year'] = pd.to_numeric(extracted_year, errors='coerce')

# Clean movie titles (remove the trailing year and surrounding whitespace)
movies_df['title_clean'] = movies_df['title'].str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)

print("🎬 Movie Release Year Analysis:")
print(f"   📅 Earliest movie: {movies_df['year'].min():.0f}")
print(f"   📅 Latest movie: {movies_df['year'].max():.0f}")
print(f"   📊 Average release year: {movies_df['year'].mean():.1f}")
print(f"   ❓ Movies with missing year: {movies_df['year'].isna().sum()}")

# Display some examples so the extraction can be checked by eye
print("\n🎭 Sample movies with extracted years:")
print(movies_df[['title', 'title_clean', 'year', 'genres']].head(10))

In [None]:
# Count how often each genre appears across the catalog and chart the top 15
print("🎭 Genre Analysis:")

# Each movie stores its genres as one '|'-separated string; flatten them into
# a single list, skipping the '(no genres listed)' placeholder and missing values.
all_genres = [
    genre
    for genres_str in movies_df['genres'].dropna()
    if genres_str != '(no genres listed)'
    for genre in genres_str.split('|')
]

genre_counts = pd.Series(all_genres).value_counts()
total_movies = len(movies_df)

print(f"📊 Total unique genres: {len(genre_counts)}")
print("\n🏆 Top 10 Most Popular Genres:")
for rank, (genre, count) in enumerate(genre_counts.head(10).items(), start=1):
    # Share of ALL movies carrying this genre (a movie can have several genres)
    percentage = (count / total_movies) * 100
    print(f"   {rank:2d}. {genre:15s}: {count:4d} movies ({percentage:5.1f}%)")

# Horizontal bar chart of the 15 most frequent genres
top_genres = genre_counts.head(15)
bar_positions = range(len(top_genres))

plt.figure(figsize=(12, 6))
plt.barh(bar_positions, top_genres.values, color='skyblue')
plt.yticks(bar_positions, top_genres.index)
plt.xlabel('Number of Movies')
plt.title('📊 Movie Distribution by Genre (Top 15)', fontsize=16, fontweight='bold')
# Put the most frequent genre at the top of the chart
plt.gca().invert_yaxis()

# Write each bar's value just past its right-hand end
for position, movie_total in enumerate(top_genres.values):
    plt.text(movie_total + 10, position, str(movie_total), va='center', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Count movies per release year and chart the trend
yearly_movies = movies_df.groupby('year').size().reset_index(name='movie_count')
# Keep only years inside the 1920-2020 window (both ends inclusive)
plausible_years = yearly_movies['year'].between(1920, 2020)
yearly_movies = yearly_movies[plausible_years]

plt.figure(figsize=(14, 6))
plt.plot(
    yearly_movies['year'],
    yearly_movies['movie_count'],
    marker='o',
    linewidth=2,
    markersize=4,
)
plt.xlabel('Release Year')
plt.ylabel('Number of Movies')
plt.title('📈 Movie Production Trends Over Time', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3)

# Shade the 1990-2000 span
plt.axvspan(1990, 2000, alpha=0.2, color='yellow', label='90s Movie Boom')
plt.legend()

plt.tight_layout()
plt.show()

# Year with the largest movie count inside the filtered window
busiest_row = yearly_movies['movie_count'].idxmax()
peak_year = yearly_movies.loc[busiest_row, 'year']
peak_count = yearly_movies['movie_count'].max()
print(f"🏆 Peak movie production year: {peak_year:.0f} with {peak_count} movies")

# Decade analysis: floor every year to the start of its decade
movies_df['decade'] = (movies_df['year'] // 10) * 10
decade_counts = movies_df.groupby('decade').size().sort_index()

print("\n📊 Movies by Decade:")
# Only decades from the 1920s onwards are listed
reported_decades = decade_counts[decade_counts.index >= 1920]
for decade, count in reported_decades.items():
    print(f"   {decade:.0f}s: {count:4d} movies")

## ⭐ Rating Behavior Analysis

Now let's understand how users rate movies. This will be crucial for building our recommendation system!

In [None]:
# Headline statistics for the rating column
ratings = ratings_df['rating']
n_ratings = len(ratings_df)
most_common_rating = ratings.mode().iloc[0]

print("⭐ Rating Distribution Analysis:")
print(f"   📊 Total ratings: {n_ratings:,}")
print(f"   📊 Rating range: {ratings.min()} - {ratings.max()}")
print(f"   📊 Average rating: {ratings.mean():.2f}")
print(f"   📊 Median rating: {ratings.median():.1f}")
print(f"   📊 Most common rating: {most_common_rating:.1f}")

# Count of ratings per rating value, ordered by rating value,
# plus each value's share of all ratings in percent
rating_dist = ratings.value_counts().sort_index()
rating_share = rating_dist / n_ratings * 100

print("\n🎯 Rating Distribution:")
for (rating, count), share in zip(rating_dist.items(), rating_share):
    print(f"   {rating}⭐: {count:7,} ratings ({share:5.1f}%)")

# Two side-by-side views of the same distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left: counts as bars
ax1.bar(rating_dist.index, rating_dist.values, color='orange', alpha=0.7, edgecolor='black')
ax1.set_xlabel('Rating')
ax1.set_ylabel('Number of Ratings')
ax1.set_title('📊 Rating Distribution (Count)', fontweight='bold')
ax1.grid(True, alpha=0.3)

# Percentage label placed slightly above each bar
for (rating, count), share in zip(rating_dist.items(), rating_share):
    ax1.text(rating, count + 1000, f'{share:.1f}%', ha='center', fontweight='bold')

# Right: shares as a pie, coloured along the red-yellow-green colormap
colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(rating_dist)))
slice_labels = [f'{r}⭐' for r in rating_dist.index]
ax2.pie(rating_dist.values, labels=slice_labels,
        autopct='%1.1f%%', colors=colors, startangle=90)
ax2.set_title('📊 Rating Distribution (Percentage)', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Per-user summary: how many ratings, their mean and spread, and how many
# distinct movies each user rated (all rounded to two decimals)
user_stats = (
    ratings_df.groupby('userId')
    .agg(
        num_ratings=('rating', 'count'),
        avg_rating=('rating', 'mean'),
        rating_std=('rating', 'std'),
        unique_movies=('movieId', 'nunique'),
    )
    .round(2)
    .reset_index()
)

ratings_per_user = user_stats['num_ratings']
mean_ratings_per_user = ratings_per_user.mean()
mean_user_avg = user_stats['avg_rating'].mean()

print("👥 User Behavior Analysis:")
print(f"   📊 Total users: {len(user_stats):,}")
print(f"   📊 Average ratings per user: {mean_ratings_per_user:.1f}")
print(f"   📊 Median ratings per user: {ratings_per_user.median():.0f}")
print(f"   📊 Most active user rated: {ratings_per_user.max()} movies")
print(f"   📊 Least active user rated: {ratings_per_user.min()} movies")

# 2x2 grid: activity, rating tendency, rating consistency, activity categories
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left: number of ratings per user, with the mean marked
activity_ax = axes[0,0]
activity_ax.hist(ratings_per_user, bins=50, color='skyblue', alpha=0.7, edgecolor='black')
activity_ax.set_xlabel('Number of Ratings per User')
activity_ax.set_ylabel('Number of Users')
activity_ax.set_title('📊 User Activity Distribution', fontweight='bold')
activity_ax.axvline(mean_ratings_per_user, color='red', linestyle='--',
                    label=f'Mean: {mean_ratings_per_user:.1f}')
activity_ax.legend()
activity_ax.grid(True, alpha=0.3)

# Top-right: each user's average rating, with the mean marked
tendency_ax = axes[0,1]
tendency_ax.hist(user_stats['avg_rating'], bins=30, color='lightgreen', alpha=0.7, edgecolor='black')
tendency_ax.set_xlabel('Average Rating per User')
tendency_ax.set_ylabel('Number of Users')
tendency_ax.set_title('📊 User Rating Tendency', fontweight='bold')
tendency_ax.axvline(mean_user_avg, color='red', linestyle='--',
                    label=f'Mean: {mean_user_avg:.2f}')
tendency_ax.legend()
tendency_ax.grid(True, alpha=0.3)

# Bottom-left: per-user standard deviation (missing values are dropped before plotting)
consistency_ax = axes[1,0]
consistency_ax.hist(user_stats['rating_std'].dropna(), bins=30, color='orange', alpha=0.7, edgecolor='black')
consistency_ax.set_xlabel('Rating Standard Deviation')
consistency_ax.set_ylabel('Number of Users')
consistency_ax.set_title('📊 User Rating Consistency', fontweight='bold')
consistency_ax.grid(True, alpha=0.3)

# Bucket users by rating count. np.select takes the FIRST matching condition,
# so the thresholds behave like an if/elif chain; anything at 100 or above
# falls through to the default label.
counts = ratings_per_user.to_numpy()
user_categories = np.select(
    [counts < 20, counts < 50, counts < 100],
    [
        'Light User (<20 ratings)',
        'Moderate User (20-50 ratings)',
        'Active User (50-100 ratings)',
    ],
    default='Heavy User (100+ ratings)',
).tolist()

user_category_counts = pd.Series(user_categories).value_counts()

# Bottom-right: share of users in each bucket
axes[1,1].pie(user_category_counts.values, labels=user_category_counts.index,
              autopct='%1.1f%%', startangle=90)
axes[1,1].set_title('👥 User Activity Categories', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n👥 User Categories:")
total_users = len(user_stats)
for category, count in user_category_counts.items():
    percentage = (count / total_users) * 100
    print(f"   {category}: {count:4d} users ({percentage:5.1f}%)")

## 🎬 Movie Popularity Analysis

Let's identify the most popular and highest-rated movies to understand what makes content successful.

In [None]:
# Per-movie summary: number of ratings, mean rating and spread (two decimals)
movie_stats = (
    ratings_df.groupby('movieId')
    .agg(
        num_ratings=('rating', 'count'),
        avg_rating=('rating', 'mean'),
        rating_std=('rating', 'std'),
    )
    .round(2)
    .reset_index()
)

# Attach title/genres/year; a left join keeps every rated movie even when
# the catalog has no row for it
catalog_columns = movies_df[['movieId', 'title', 'genres', 'year']]
movie_analysis = movie_stats.merge(catalog_columns, on='movieId', how='left')

ratings_per_movie = movie_analysis['num_ratings']

print("🎬 Movie Popularity Analysis:")
print(f"   📊 Total movies with ratings: {len(movie_analysis):,}")
print(f"   📊 Average ratings per movie: {ratings_per_movie.mean():.1f}")
print(f"   📊 Median ratings per movie: {ratings_per_movie.median():.0f}")
print(f"   📊 Most rated movie has: {ratings_per_movie.max()} ratings")

# Highest average rating among movies with at least 50 ratings
has_enough_ratings = ratings_per_movie >= 50
popular_movies = movie_analysis[has_enough_ratings].copy()
popular_movies = popular_movies.sort_values('avg_rating', ascending=False)

print("\n🏆 Top 10 Highest Rated Movies (min 50 ratings):")
best_rated = popular_movies.head(10)
best_rated_rows = zip(best_rated['title'], best_rated['avg_rating'], best_rated['num_ratings'])
for rank, (title, avg_rating, num_ratings) in enumerate(best_rated_rows, start=1):
    print(f"   {rank:2d}. {title:35s} | ⭐{avg_rating:.2f} | 👥{num_ratings:3.0f} ratings")

# Movies ordered by how many ratings they received
most_rated = movie_analysis.sort_values('num_ratings', ascending=False)

print("\n🔥 Top 10 Most Rated Movies:")
busiest = most_rated.head(10)
busiest_rows = zip(busiest['title'], busiest['num_ratings'], busiest['avg_rating'])
for rank, (title, num_ratings, avg_rating) in enumerate(busiest_rows, start=1):
    print(f"   {rank:2d}. {title:35s} | 👥{num_ratings:4.0f} ratings | ⭐{avg_rating:.2f}")

In [None]:
# Relationship between how often a movie is rated and how well it is rated
popularity = movie_analysis['num_ratings']
quality = movie_analysis['avg_rating']

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
scatter_ax, hist_ax = axes[0], axes[1]

# Left panel: one point per movie
scatter_ax.scatter(popularity, quality, alpha=0.6, s=30, color='purple')
scatter_ax.set_xlabel('Number of Ratings (Popularity)')
scatter_ax.set_ylabel('Average Rating')
scatter_ax.set_title('📊 Movie Popularity vs Quality', fontweight='bold')
scatter_ax.grid(True, alpha=0.3)

# Degree-1 least-squares fit drawn as a dashed red trend line
trend_coefficients = np.polyfit(popularity, quality, 1)
trend_line = np.poly1d(trend_coefficients)
scatter_ax.plot(popularity, trend_line(popularity), "r--", alpha=0.8, linewidth=2)

# Re-draw movies with at least 200 ratings as larger red markers
famous_movies = movie_analysis[popularity >= 200]
scatter_ax.scatter(famous_movies['num_ratings'], famous_movies['avg_rating'],
                   s=80, color='red', alpha=0.8, label='Popular Movies (200+ ratings)')
scatter_ax.legend()

# Right panel: distribution of ratings-per-movie
hist_ax.hist(popularity, bins=50, color='skyblue', alpha=0.7, edgecolor='black')
hist_ax.set_xlabel('Number of Ratings per Movie')
hist_ax.set_ylabel('Number of Movies')
hist_ax.set_title('📊 Movie Rating Count Distribution', fontweight='bold')
hist_ax.set_yscale('log')  # Log scale due to long tail
hist_ax.grid(True, alpha=0.3)

# Mark the three quartiles; q*4 turns 0.25/0.5/0.75 into the labels Q1/Q2/Q3
quartiles = popularity.quantile([0.25, 0.5, 0.75])
for q, value in quartiles.items():
    hist_ax.axvline(value, color='red', linestyle='--', alpha=0.7,
                    label=f'Q{int(q*4)}: {value:.0f}')
hist_ax.legend()

plt.tight_layout()
plt.show()

# Summarise the long tail in words
print("\n📊 Movie Rating Distribution Insights:")
print(f"   25% of movies have ≤ {quartiles[0.25]:.0f} ratings")
print(f"   50% of movies have ≤ {quartiles[0.5]:.0f} ratings")
print(f"   75% of movies have ≤ {quartiles[0.75]:.0f} ratings")

# Movies with fewer than 10 ratings, as a count and a share of rated movies
movies_with_few_ratings = int((popularity < 10).sum())
percentage_cold_start = (movies_with_few_ratings / len(movie_analysis)) * 100
print(f"   ❄️ Cold start movies (<10 ratings): {movies_with_few_ratings} ({percentage_cold_start:.1f}%)")

## 🕐 Temporal Analysis

Let's understand how rating behavior changes over time - are there seasonal patterns or trends we should know about?

In [None]:
# Replace the raw timestamp column (interpreted as epoch seconds) with
# datetimes, then derive the calendar parts used by the temporal charts
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
stamp_parts = ratings_df['timestamp'].dt
ratings_df['date'] = stamp_parts.date
ratings_df['year'] = stamp_parts.year
ratings_df['month'] = stamp_parts.month
ratings_df['weekday'] = stamp_parts.dayofweek
ratings_df['hour'] = stamp_parts.hour

# Overall span covered by the ratings
first_rating = ratings_df['timestamp'].min()
last_rating = ratings_df['timestamp'].max()
span_days = (last_rating - first_rating).days

print("🕐 Temporal Rating Patterns:")
print(f"   📅 Rating period: {first_rating.strftime('%Y-%m-%d')} to {last_rating.strftime('%Y-%m-%d')}")
print(f"   📊 Total days: {span_days} days")
print(f"   📊 Average ratings per day: {len(ratings_df) / span_days:.1f}")

# Ratings per calendar month; each (year, month) pair is dated to the 1st
# of that month so it can be placed on a time axis
monthly_ratings = ratings_df.groupby(['year', 'month']).size().reset_index(name='rating_count')
month_starts = monthly_ratings[['year', 'month']].assign(day=1)
monthly_ratings['date'] = pd.to_datetime(month_starts)

plt.figure(figsize=(15, 6))
plt.plot(
    monthly_ratings['date'],
    monthly_ratings['rating_count'],
    marker='o',
    linewidth=2,
    markersize=4,
    color='navy',
)
plt.xlabel('Date')
plt.ylabel('Number of Ratings')
plt.title('📈 Rating Activity Over Time', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)

# Annotate the month with the most ratings
busiest_index = monthly_ratings['rating_count'].idxmax()
peak_month = monthly_ratings.loc[busiest_index]
plt.annotate(f'Peak: {peak_month["rating_count"]} ratings\n{peak_month["date"].strftime("%B %Y")}',
             xy=(peak_month['date'], peak_month['rating_count']),
             xytext=(10, 10), textcoords='offset points',
             bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.7),
             arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

plt.tight_layout()
plt.show()

In [None]:
# Analyze rating patterns by day of week, hour of day and calendar month.
#
# Every grouped series is reindexed onto its FULL category range (7 weekdays,
# 24 hours, 12 months). The charts below pair values with fixed positions and
# fixed tick labels; without the reindex, a weekday or month with no ratings
# would make the value array shorter than the position range and the plotting
# call would fail with a shape mismatch.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Day of week patterns (weekday 0 is paired with 'Monday' below)
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_ratings = ratings_df.groupby('weekday').size().reindex(range(7), fill_value=0)

axes[0,0].bar(range(7), weekday_ratings.values, color='lightblue', alpha=0.7, edgecolor='black')
axes[0,0].set_xticks(range(7))
axes[0,0].set_xticklabels(weekday_names, rotation=45)
axes[0,0].set_ylabel('Number of Ratings')
axes[0,0].set_title('📊 Rating Activity by Day of Week', fontweight='bold')
axes[0,0].grid(True, alpha=0.3)

# Hour of day patterns; hours with no ratings are shown as 0 instead of
# being silently bridged by the line
hourly_ratings = ratings_df.groupby('hour').size().reindex(range(24), fill_value=0)

axes[0,1].plot(hourly_ratings.index, hourly_ratings.values, marker='o', linewidth=2, color='green')
axes[0,1].set_xlabel('Hour of Day')
axes[0,1].set_ylabel('Number of Ratings')
axes[0,1].set_title('📊 Rating Activity by Hour of Day', fontweight='bold')
axes[0,1].grid(True, alpha=0.3)
axes[0,1].set_xticks(range(0, 24, 2))

# Monthly patterns (seasonal); months are numbered 1-12
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_patterns = ratings_df.groupby('month').size().reindex(range(1, 13), fill_value=0)

axes[1,0].bar(range(1, 13), monthly_patterns.values, color='orange', alpha=0.7, edgecolor='black')
axes[1,0].set_xticks(range(1, 13))
axes[1,0].set_xticklabels(month_names, rotation=45)
axes[1,0].set_ylabel('Number of Ratings')
axes[1,0].set_title('📊 Rating Activity by Month', fontweight='bold')
axes[1,0].grid(True, alpha=0.3)

# Average rating by month. A month with no ratings becomes NaN (a gap in the
# line) rather than 0, because a zero average would be misleading.
monthly_avg_rating = ratings_df.groupby('month')['rating'].mean().reindex(range(1, 13))

axes[1,1].plot(range(1, 13), monthly_avg_rating.values, marker='s',
               linewidth=2, color='red', markersize=6)
axes[1,1].set_xticks(range(1, 13))
axes[1,1].set_xticklabels(month_names, rotation=45)
axes[1,1].set_ylabel('Average Rating')
axes[1,1].set_title('📊 Average Rating by Month', fontweight='bold')
axes[1,1].grid(True, alpha=0.3)
axes[1,1].set_ylim(3.0, 4.0)

plt.tight_layout()
plt.show()

# Print insights. idxmax() returns the index LABEL: weekday labels 0-6 index
# weekday_names directly, month labels 1-12 need the -1 shift.
print(f"\n🔍 Temporal Insights:")
print(f"   📅 Most active day: {weekday_names[weekday_ratings.idxmax()]} ({weekday_ratings.max():,} ratings)")
print(f"   🕐 Peak rating hour: {hourly_ratings.idxmax()}:00 ({hourly_ratings.max():,} ratings)")
print(f"   📆 Most active month: {month_names[monthly_patterns.idxmax()-1]} ({monthly_patterns.max():,} ratings)")
print(f"   ⭐ Highest rated month: {month_names[monthly_avg_rating.idxmax()-1]} (avg: {monthly_avg_rating.max():.3f})")

## 🔍 Data Quality Assessment

Before building our recommendation system, let's assess the quality of our data and identify any potential issues.

In [None]:
# Report duplicates, missing values, cross-table consistency and sparsity
print("🔍 Data Quality Assessment:")
print("=" * 50)

# --- Movies table ---
title_col = movies_df['title']
genre_col = movies_df['genres']
has_year = movies_df['year'].notna()

print("🎬 Movies Dataset:")
print(f"   📊 Total records: {len(movies_df):,}")
print(f"   📊 Duplicate titles: {title_col.duplicated().sum()}")
print(f"   📊 Missing titles: {title_col.isna().sum()}")
print(f"   📊 Missing genres: {genre_col.isna().sum()}")
print(f"   📊 Movies with '(no genres listed)': {(genre_col == '(no genres listed)').sum()}")
print(f"   📊 Movies with extractable year: {has_year.sum()} ({has_year.mean()*100:.1f}%)")

# --- Ratings table ---
rating_values = ratings_df['rating']
out_of_range = (rating_values < 0.5) | (rating_values > 5.0)
# A duplicate is a repeated (userId, movieId) pair
repeated_pairs = ratings_df[['userId', 'movieId']].duplicated()
n_users = ratings_df['userId'].nunique()
n_rated_movies = ratings_df['movieId'].nunique()

print("\n⭐ Ratings Dataset:")
print(f"   📊 Total records: {len(ratings_df):,}")
print(f"   📊 Duplicate ratings: {repeated_pairs.sum()}")
print(f"   📊 Missing values: {ratings_df.isna().sum().sum()}")
print(f"   📊 Invalid ratings (outside 0.5-5.0): {out_of_range.sum()}")
print(f"   📊 Unique users: {n_users:,}")
print(f"   📊 Unique movies rated: {n_rated_movies:,}")

# --- Cross-table consistency: compare the movie ids seen in each table ---
print("\n🔗 Data Consistency:")
movies_in_ratings = set(ratings_df['movieId'].unique())
movies_in_catalog = set(movies_df['movieId'].unique())

movies_rated_not_in_catalog = movies_in_ratings.difference(movies_in_catalog)
movies_in_catalog_not_rated = movies_in_catalog.difference(movies_in_ratings)
never_rated_share = len(movies_in_catalog_not_rated) / len(movies_df) * 100

print(f"   📊 Movies rated but not in catalog: {len(movies_rated_not_in_catalog)}")
print(f"   📊 Movies in catalog but never rated: {len(movies_in_catalog_not_rated)} ({never_rated_share:.1f}%)")

# --- Sparsity: share of the (user x rated-movie) grid that holds no rating ---
total_possible_ratings = n_users * n_rated_movies
actual_ratings = len(ratings_df)
sparsity = (1 - actual_ratings / total_possible_ratings) * 100

print("\n📊 Data Sparsity Analysis:")
print(f"   📊 Total possible user-movie combinations: {total_possible_ratings:,}")
print(f"   📊 Actual ratings: {actual_ratings:,}")
print(f"   📊 Data sparsity: {sparsity:.2f}%")
print(f"   📊 Data density: {100-sparsity:.2f}%")

# Pick the verdict that matches the sparsity band, then print it
if sparsity > 99:
    sparsity_verdict = "   ⚠️ High sparsity detected! This will challenge our recommendation algorithms."
elif sparsity > 95:
    sparsity_verdict = "   ✅ Moderate sparsity - typical for recommendation systems."
else:
    sparsity_verdict = "   ✅ Low sparsity - excellent for recommendation algorithms!"
print(sparsity_verdict)

## 💡 Key Findings & Recommendations

Based on my exploration, here are the key insights that will guide our recommendation system design:

In [None]:
# Generate key findings summary from the results computed in earlier cells
# (genre_counts, peak_year, user_stats, user_category_counts, rating_dist,
# weekday/hour/month series, movies_in_catalog_not_rated, sparsity).
print("💡 KEY FINDINGS & RECOMMENDATIONS")
print("=" * 60)

# Read the two leading genres from the data instead of hard-coding them.
# genre_counts is expected to be ordered by descending count (as produced by
# value_counts), so positions 0 and 1 are the two most frequent genres.
top_genre, runner_up_genre = genre_counts.index[0], genre_counts.index[1]
n_unrated = len(movies_in_catalog_not_rated)
unrated_share = n_unrated / len(movies_df) * 100

print("\n🎬 MOVIE INSIGHTS:")
print(f"   • {top_genre} ({genre_counts.iloc[0]:,} movies) and {runner_up_genre} are most popular genres")
# peak_year is a single year, not a decade, so it is reported without an "s" suffix
print(f"   • Peak movie production year was {peak_year:.0f}")
print(f"   • {n_unrated} movies ({unrated_share:.1f}%) have never been rated (cold start problem)")

# .get() avoids a KeyError when no user falls into the heavy bucket
heavy_users = user_category_counts.get('Heavy User (100+ ratings)', 0)

print("\n👥 USER BEHAVIOR:")
print(f"   • Average user rates {user_stats['num_ratings'].mean():.1f} movies")
print(f"   • Users tend to be generous: average rating is {ratings_df['rating'].mean():.2f}/5.0")
print(f"   • Rating distribution is skewed toward higher ratings")
print(f"   • {heavy_users} users ({heavy_users/len(user_stats)*100:.1f}%) are super active")

# Compute the mode once; it is used both as the label and as the lookup key
most_common_rating = ratings_df['rating'].mode().iloc[0]

print("\n⭐ RATING PATTERNS:")
print(f"   • {most_common_rating}⭐ is the most common rating ({rating_dist[most_common_rating]:,} times)")
print(f"   • Peak rating activity: {weekday_names[weekday_ratings.idxmax()]} at {hourly_ratings.idxmax()}:00")
# Month labels run 1-12, hence the -1 when indexing month_names
print(f"   • Most active month: {month_names[monthly_patterns.idxmax()-1]}")

print("\n🎯 RECOMMENDATION SYSTEM IMPLICATIONS:")
print(f"   ✅ Collaborative Filtering: Feasible with {ratings_df['userId'].nunique():,} users and {len(ratings_df):,} ratings")
print(f"   ⚠️ Cold Start: Need content-based approach for {n_unrated} unrated movies")
print(f"   📊 Data Sparsity: {sparsity:.2f}% sparsity requires robust algorithms")
print(f"   🎭 Content-Based: Genre information available for content-based filtering")
print(f"   🕐 Temporal: Consider time-based recommendations (seasonal preferences)")

print("\n🚀 RECOMMENDED APPROACH:")
print("   1. 🤝 User-Based Collaborative Filtering for personalization")
print("   2. 🎬 Item-Based Collaborative Filtering for stability")
print("   3. 🎭 Content-Based Filtering for cold start movies")
print("   4. 🔄 Hybrid approach combining all methods")
print("   5. ⭐ Focus on movies with sufficient ratings (>10) for reliable recommendations")

print(f"\n📊 DATASET SUMMARY:")
print(f"   📈 Dataset Quality: HIGH - clean, consistent, well-structured")
print(f"   📊 Size: MEDIUM - perfect for learning and prototyping")
print(f"   🎯 Suitability: EXCELLENT for recommendation system development")
print(f"   ⚡ Processing Speed: FAST - can iterate quickly on algorithms")

print("\n" + "=" * 60)
print("🎉 Data exploration complete! Ready to build recommendation algorithms.")
print(f"📅 Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")