## Dataset Overview

In [3]:
from collections import Counter

import pandas as pd

In [7]:
# Load movies.csv from the ml-32m data directory into a DataFrame.
movies_csv = '../data/ml-32m/movies.csv'
movies = pd.read_csv(movies_csv)

In [12]:
# Preview the first five rows to see which columns the movies table carries.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
# (number of rows, number of columns) of the movies table.
movies.shape

(87585, 3)

In [9]:
# Load ratings.csv from the ml-32m data directory into a DataFrame.
ratings_csv = '../data/ml-32m/ratings.csv'
ratings = pd.read_csv(ratings_csv)

In [10]:
# Preview the first five rating rows.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [15]:
# (number of rows, number of columns) of the ratings table.
ratings.shape

(32000204, 4)

In [28]:
# Load tags.csv from the ml-32m data directory into a DataFrame.
tags_csv = '../data/ml-32m/tags.csv'
tags = pd.read_csv(tags_csv)

In [29]:
# Preview the first five tag rows.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


In [31]:
# (number of rows, number of columns) of the tags table.
tags.shape

(2000072, 4)

## Number of users

In [25]:
# Count the distinct userId values that appear in the ratings table
# (null ids, if any, are not counted).
ratings['userId'].nunique(dropna=True)

200948

## Missing or duplicated values

In [34]:
# Print the per-column null count of each table, in the order
# ratings -> movies -> tags.
for table_name, table in (('ratings', ratings), ('movies', movies), ('tags', tags)):
    print(f"Missing values in {table_name}:\n", table.isnull().sum())

Missing values in ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Missing values in movies:
 movieId    0
title      0
genres     0
dtype: int64
Missing values in tags:
 userId        0
movieId       0
tag          17
timestamp     0
dtype: int64


In [35]:
# Print how many rows of each table are exact copies of an earlier row
# (all columns compared), in the order ratings -> movies -> tags.
for table_name, table in (('ratings', ratings), ('movies', movies), ('tags', tags)):
    print(f"Duplicate {table_name}:", table.duplicated().sum())

Duplicate ratings: 0
Duplicate movies: 0
Duplicate tags: 0


## Most rated movies 

In [20]:
# Number of ratings per movieId, largest first; show the top five.
(
    ratings
    .groupby('movieId')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

movieId
318     102929
356     100296
296      98409
2571     93808
593      90330
Name: rating, dtype: int64

In [22]:
# Look up the catalogue row for movieId 318 (first entry in the count above).
movies.loc[movies['movieId'] == 318]

Unnamed: 0,movieId,title,genres
314,318,"Shawshank Redemption, The (1994)",Crime|Drama


In [23]:
# Look up the catalogue row for movieId 356 (second entry in the count above).
movies.loc[movies['movieId'] == 356]

Unnamed: 0,movieId,title,genres
351,356,Forrest Gump (1994),Comedy|Drama|Romance|War


## Most rated genres

In [37]:
# Attach title and genres to every rating row. The default inner join keeps
# only movieIds that occur in both frames.
combined = movies.merge(ratings, on='movieId')
combined.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,2.5,1169265231
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,3.0,850085076
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.0,1027305751
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,974704488
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,20,5.0,1553184230


In [39]:
# (rows, columns) of the merged frame. The row count should equal
# ratings.shape[0] when every rated movieId appears exactly once in movies.
combined.shape

(32000204, 6)

In [40]:
# Flatten the per-movie genre lists ("A|B|C" -> ["A", "B", "C"]) into one flat
# list of genre labels, keeping the original row order.
#
# A nested comprehension touches every label once. Calling .sum() on a Series
# of lists instead concatenates them pairwise and re-copies the growing list
# at every step, which becomes very slow on a catalogue of this size.
#
# NOTE: this iterates over `movies`, so each title contributes once per genre.
# The resulting counts describe how many titles carry a genre, not how many
# ratings that genre received.
# Assumes `genres` has no missing values (the null check above reports 0).
all_genres = [
    genre_label
    for genre_list in movies['genres'].str.split('|')
    for genre_label in genre_list
]

In [42]:
# Tally how often each genre label occurs in `all_genres` and present the
# result as a table sorted from the most to the least frequent genre.
# `Counter` is imported in the notebook's top import cell.
genre_counts = Counter(all_genres)
genre = (
    pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count'])
    .sort_values(by='Count', ascending=False)
)
genre.head()

Unnamed: 0,Genre,Count
6,Drama,34175
3,Comedy,23124
9,Thriller,11823
5,Romance,10369
7,Action,9668


## Average rating

In [43]:
# Number of ratings per movieId, smallest first; show the five least-rated.
(
    ratings
    .groupby('movieId')['rating']
    .count()
    .sort_values()
    .head()
)

movieId
292757    1
209631    1
209633    1
209635    1
91871     1
Name: rating, dtype: int64

### Filtering only movies that have been rated by at least 100 users

In [69]:
# Rank movies by mean rating, restricted to movies with enough ratings for the
# mean to be meaningful.
min_users = 100  # minimum number of ratings a movie needs to be ranked

# One row of `ratings` is one rating, so value_counts() yields ratings per
# movie. Presumably each user rates a movie at most once, which makes this the
# number of rating users -- verify if that matters.
movie_counts = ratings['movieId'].value_counts()

# ">=" (not ">") so a movie with exactly `min_users` ratings is kept,
# matching the "at least" wording of the section heading.
popular_movie_ids = movie_counts[movie_counts >= min_users].index

filtered_ratings = ratings[ratings['movieId'].isin(popular_movie_ids)]

# Mean rating per movie (2 decimals), joined with the title and sorted from
# best to worst. Rounding happens before sorting, so movies whose rounded
# means tie appear in no particular order.
avg_ratings = (
    filtered_ratings
    .groupby('movieId')['rating']
    .mean()
    .round(2)
    .rename('average_rating')
    .reset_index()
    .merge(movies[['movieId', 'title']], on='movieId')
    .sort_values(by='average_rating', ascending=False)
)

avg_ratings.head()

Unnamed: 0,movieId,average_rating,title
10964,171011,4.45,Planet Earth II (2016)
10706,159817,4.44,Planet Earth (2006)
10942,170705,4.43,Band of Brothers (2001)
293,318,4.4,"Shawshank Redemption, The (1994)"
10973,171495,4.33,Cosmos
