# MovieLens 100K - Exploratory Data Analysis

Quick exploration of the MovieLens 100K dataset for the recommender system.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset

# Notebook-wide plot defaults: seaborn's "whitegrid" style and a 10x6 inch
# default figure size for matplotlib.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## Load Data

In [None]:
# Fetch the built-in 'ml-100k' dataset through Surprise and turn its
# `raw_ratings` entries into a DataFrame (one row per entry).
data = Dataset.load_builtin(name='ml-100k')
df = pd.DataFrame(
    data=data.raw_ratings,
    columns=['user_id', 'item_id', 'rating', 'timestamp'],
)

print(f"Dataset shape: {df.shape}")
# Last expression of the cell: shows the first rows as rich output.
df.head(n=5)

## Basic Statistics

In [None]:
# Headline counts followed by summary statistics of the rating column.
# Everything is emitted by one print call, one item per line.
print(
    "Dataset Statistics:",
    f"Number of ratings: {len(df):,}",
    f"Number of users: {df['user_id'].nunique():,}",
    f"Number of movies: {df['item_id'].nunique():,}",
    "\nRating statistics:",
    df['rating'].describe(),
    sep="\n",
)

## Rating Distribution

In [None]:
# How often each rating value occurs, ordered by rating value.
rating_counts = df['rating'].value_counts().sort_index()

# Bar chart of those counts, built through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(rating_counts.index, rating_counts.values, color='steelblue', alpha=0.7)
ax.set_xlabel('Rating', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Distribution of Ratings', fontsize=14, fontweight='bold')
ax.set_xticks(range(1, 6))
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Same counts as plain text below the figure.
print("\nRating distribution:", rating_counts, sep="\n")

## User Activity Analysis

In [None]:
# Number of ratings contributed by each user.
user_rating_counts = df.groupby('user_id').size()

# Two panels side by side: histogram (left) and boxplot (right).
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

hist_ax.hist(user_rating_counts, bins=50, color='coral', alpha=0.7, edgecolor='black')
hist_ax.set_xlabel('Number of Ratings per User', fontsize=11)
hist_ax.set_ylabel('Number of Users', fontsize=11)
hist_ax.set_title('User Activity Distribution', fontsize=12, fontweight='bold')
hist_ax.grid(axis='y', alpha=0.3)

# pandas draws the boxplot; the target axes is passed explicitly.
user_rating_counts.plot(kind='box', vert=True, color='coral', ax=box_ax)
box_ax.set_ylabel('Ratings per User', fontsize=11)
box_ax.set_title('User Activity Boxplot', fontsize=12, fontweight='bold')
box_ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
plt.show()

# Numeric summary of the same per-user counts.
print("\nUser activity statistics:", user_rating_counts.describe(), sep="\n")

## Movie Popularity Analysis

In [None]:
# Number of ratings received by each movie (item).
movie_rating_counts = df.groupby('item_id').size()

# Two panels side by side: histogram (left) and boxplot (right).
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

hist_ax.hist(movie_rating_counts, bins=50, color='mediumseagreen', alpha=0.7, edgecolor='black')
hist_ax.set_xlabel('Number of Ratings per Movie', fontsize=11)
hist_ax.set_ylabel('Number of Movies', fontsize=11)
hist_ax.set_title('Movie Popularity Distribution', fontsize=12, fontweight='bold')
hist_ax.grid(axis='y', alpha=0.3)

# pandas draws the boxplot; the target axes is passed explicitly.
movie_rating_counts.plot(kind='box', vert=True, color='mediumseagreen', ax=box_ax)
box_ax.set_ylabel('Ratings per Movie', fontsize=11)
box_ax.set_title('Movie Popularity Boxplot', fontsize=12, fontweight='bold')
box_ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
plt.show()

# Numeric summary of the same per-movie counts.
print("\nMovie popularity statistics:", movie_rating_counts.describe(), sep="\n")

## Sparsity Analysis

In [None]:
# Dimensions of the user-item matrix and the number of filled cells.
n_ratings = len(df)
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()
possible_ratings = n_users * n_items

# Sparsity: percentage of user-item pairs that have no rating.
sparsity = (1 - n_ratings / possible_ratings) * 100

# Report everything in one print call, one item per line.
print(
    "\nDataset Sparsity:",
    f"Total possible ratings: {possible_ratings:,}",
    f"Actual ratings: {n_ratings:,}",
    f"Sparsity: {sparsity:.2f}%",
    f"Density: {100 - sparsity:.2f}%",
    sep="\n",
)

## Average Rating by Movie (Top 20)

In [None]:
# Rank movies by mean rating, keeping only movies with at least MIN_RATINGS
# ratings so that a handful of votes cannot put a movie at the top.
MIN_RATINGS = 20  # minimum number of ratings for a movie to be ranked
TOP_N = 20        # number of movies shown in the ranking

# Per-movie mean rating and rating count, indexed by item_id and kept at full
# precision so the ranking below is not affected by rounding.
movie_stats = df.groupby('item_id')['rating'].agg(avg_rating='mean', num_ratings='count')
movie_stats = movie_stats[movie_stats['num_ratings'] >= MIN_RATINGS]

# Sort on the unrounded mean: rounding before sorting would turn near-equal
# means into ties whose relative order is arbitrary. Exact ties are broken by
# the number of ratings (more ratings first). Rounding to two decimals is
# applied last and only affects how the values are displayed.
top_rated = (
    movie_stats
    .sort_values(['avg_rating', 'num_ratings'], ascending=False)
    .head(TOP_N)
    .round({'avg_rating': 2})
)

print(f"\nTop {TOP_N} highest-rated movies (with {MIN_RATINGS}+ ratings):")
print(top_rated)

## Summary

Key insights from the MovieLens 100K dataset:
- The dataset is highly sparse: ~93.7% of the possible user–movie pairs have no rating
- Ratings range from 1 to 5, with most ratings being 3 or 4
- Some users are very active (hundreds of ratings) while others have few ratings
- Movie popularity follows a long-tail distribution
- This dataset is suitable for collaborative filtering approaches like SVD