In [None]:
# Data Exploration – MovieLens Dataset

# This notebook explores the MovieLens dataset to understand user behavior, item popularity, sparsity, and challenges relevant to recommendation systems.


In [7]:
import pandas as pd
import numpy as np


In [8]:
# Load the three MovieLens tables from CSV files under the relative "data/" directory.
ratings = pd.read_csv("data/ratings.csv")  # userId, movieId, rating, timestamp
movies = pd.read_csv("data/movies.csv")    # movieId, title, genres
tags = pd.read_csv("data/tags.csv")        # userId, movieId, tag, timestamp

# Preview the first five rows of every table; the cell result is a 3-tuple of DataFrames.
tuple(frame.head() for frame in (ratings, movies, tags))

## We analyze the size of each table and the number of unique users and movies.

(   userId  movieId  rating  timestamp
 0       1        1     4.0  964982703
 1       1        3     4.0  964981247
 2       1        6     4.0  964982224
 3       1       47     5.0  964983815
 4       1       50     5.0  964982931,
    movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
    userId  movieId              tag   timestamp
 0       2    60756            funny  1445714994
 1       2    60756  Highly quotable  1445714996
 2       

In [11]:
# (rows, columns) of each table, in the order ratings, movies, tags.
tuple(frame.shape for frame in (ratings, movies, tags))


((100836, 4), (9742, 3), (3683, 4))

In [12]:
# Distinct users and distinct movies that appear in the ratings table.
# Both counts are taken from `ratings`, so a movie that nobody rated is not counted.
num_users, num_movies = (
    ratings[column].nunique() for column in ('userId', 'movieId')
)

num_users, num_movies

## - The dataset contains ratings from several hundred users.
## - Each user has rated only a small subset of the available movies.
## - This indicates a highly sparse user–item interaction matrix.

(610, 9724)

In [15]:
# How often each rating value occurs, listed in ascending order of the rating value.
(
    ratings['rating']
    .value_counts()  # occurrences of each distinct rating value
    .sort_index()    # order by rating value rather than by frequency
)

## - Ratings range from 0.5 to 5.0 stars.
## - Most ratings are concentrated around 3.0–4.0.
## - The distribution is slightly skewed toward positive ratings.

rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64

In [17]:
# Number of rows in `ratings` for each userId, i.e. how many ratings each user gave.
ratings_per_user = (
    ratings
    .groupby('userId')
    .size()
)
# Summary statistics (count, mean, std, min, quartiles, max) of the per-user counts.
ratings_per_user.describe()


count     610.000000
mean      165.304918
std       269.480584
min        20.000000
25%        35.000000
50%        70.500000
75%       168.000000
max      2698.000000
dtype: float64

In [19]:
# Number of rows in `ratings` for each movieId, i.e. how many ratings each movie received.
# Only movies present in `ratings` get an entry, so the smallest possible count is 1.
ratings_per_movie = (
    ratings
    .groupby('movieId')
    .size()
)
# Summary statistics (count, mean, std, min, quartiles, max) of the per-movie counts.
ratings_per_movie.describe()

## Per-user activity (summary of ratings_per_user from the previous cell):
## - Most users have rated only a small number of movies.
## - A few users are highly active and rate many movies.
## - This imbalance contributes to the cold-start problem.


count    9724.000000
mean       10.369807
std        22.401005
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       329.000000
dtype: float64

In [21]:
# Sparsity = share of cells in the user x movie matrix that hold no rating.
# Relies on num_users and num_movies computed in an earlier cell.
num_ratings = ratings.shape[0]             # observed ratings (one per row)
possible_pairs = num_users * num_movies    # total cells in the matrix
density = num_ratings / possible_pairs     # share of cells that are filled
sparsity = 1 - density
sparsity

## Per-movie popularity (summary of ratings_per_movie from the previous cell):
## - Many movies receive very few ratings.
## - A small number of movies dominate user attention.
## - This long-tail distribution is common in recommender systems.

0.9830003169443864

In [24]:
# Attach title/genres metadata to every rating by joining on movieId.
# validate="many_to_one" makes pandas raise a MergeError if `movies` contains a
# repeated movieId; without the check such a duplicate would silently multiply
# rating rows and bias every aggregate computed from `ratings_movies`.
# The default inner join is kept, so ratings whose movieId is missing from
# `movies` are dropped exactly as before.
ratings_movies = ratings.merge(movies, on="movieId", validate="many_to_one")
ratings_movies.head()

## Sparsity (value computed in the previous cell):
## - The user–item matrix is highly sparse.
## - Sparsity motivates the use of collaborative filtering techniques, as traditional supervised learning methods are ineffective in such settings.


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [25]:
# Mean rating for each distinct value of the `genres` column, highest first (top 5).
# `genres` holds pipe-separated combinations (e.g. "Comedy|Romance"), so each group
# is an exact genre combination, not a single genre.
# NOTE(review): no minimum number of ratings is required, so rarely rated
# combinations can top this ranking — confirm by inspecting the group sizes.
(
    ratings_movies
    .groupby('genres')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)

## Key Insights

# - The dataset is highly sparse (~98%).
# - User activity is uneven, with a few highly active users.
# - Movie popularity follows a long-tail distribution.
# - These properties motivate collaborative and hybrid recommendation approaches.


genres
Comedy|Crime|Drama|Horror                5.0
Adventure|Comedy|Fantasy|Musical         5.0
Animation|Children|Mystery               5.0
Animation|Drama|Sci-Fi|IMAX              5.0
Adventure|Drama|Fantasy|Horror|Sci-Fi    5.0
Name: rating, dtype: float64