In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the movie catalogue (movieId, title, genres); the path is relative to the working directory.
movies = pd.read_csv(filepath_or_buffer="movies.csv")

In [3]:
## exploring the dataset

In [4]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Load the external-id lookup (movieId, imdbId, tmdbId); the path is relative to the working directory.
links = pd.read_csv(filepath_or_buffer="links.csv")

In [6]:
# Preview the first five rows of the links table.
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
# Load the per-user ratings (userId, movieId, rating, timestamp); the path is relative to the working directory.
ratings = pd.read_csv(filepath_or_buffer="ratings.csv")

In [8]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Load the per-user free-text tags (userId, movieId, tag, timestamp); the path is relative to the working directory.
tags = pd.read_csv(filepath_or_buffer="tags.csv")

In [10]:
# Preview the first five rows of the tags table.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [11]:
## data preprocessing

In [12]:
# Data-quality check for `movies`: missing values per column, then the number
# of fully duplicated rows.
# A cell only echoes its last expression, so the null counts are printed
# explicitly; otherwise they are computed and silently discarded.
print(movies.isnull().sum())
movies.duplicated().sum()

0

In [13]:
# Data-quality check for `links`: missing values per column, then the number
# of fully duplicated rows.
# A cell only echoes its last expression, so the null counts are printed
# explicitly; otherwise they are computed and silently discarded.
print(links.isnull().sum())
links.duplicated().sum()

0

In [14]:
# Data-quality check for `ratings`: missing values per column, then the number
# of fully duplicated rows.
# A cell only echoes its last expression, so the null counts are printed
# explicitly; otherwise they are computed and silently discarded.
print(ratings.isnull().sum())
ratings.duplicated().sum()

0

In [15]:
# Data-quality check for `tags`: missing values per column, then the number
# of fully duplicated rows.
# A cell only echoes its last expression, so the null counts are printed
# explicitly; otherwise they are computed and silently discarded.
print(tags.isnull().sum())
tags.duplicated().sum()

0

In [22]:
# Report (rows, columns) for each table, in the order movies, tags, links, ratings.
for frame in (movies, tags, links, ratings):
    print(frame.shape)

(9742, 3)
(3683, 4)
(9742, 3)
(100836, 4)


In [16]:
## creating a new dataset

In [17]:
# Columns of the combined per-movie dataset built in the cells below:
# 1. movieId
# 2. title
# 3. genres
# 4. tag
# 5. rating
# 6. imdbId
# 7. tmdbId

In [18]:
# Keep only the columns the combined dataset needs: the tag text and rating
# value, each alongside the movieId used as the join key.
tags_new = tags.loc[:, ['tag', 'movieId']]
ratings_new = ratings.loc[:, ['movieId', 'rating']]
tags_new

Unnamed: 0,tag,movieId
0,funny,60756
1,Highly quotable,60756
2,will ferrell,60756
3,Boxing story,89774
4,MMA,89774
...,...,...
3678,for katie,7382
3679,austere,7936
3680,gun fu,3265
3681,heroic bloodshed,3265


In [19]:
# Attach the external ids (imdbId, tmdbId) to each movie; the inner join keeps
# only movieIds present in both tables.
new_df = movies.merge(links, how='inner', on='movieId')
new_df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0
9739,193585,Flint (2017),Drama,6397426,479308.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0


In [20]:
# Different users give different ratings to the same movieId, so collapse them
# to a single mean rating per movieId.
average_ratings = ratings_new.groupby('movieId')['rating'].mean().reset_index()

# Left-merge the averages onto the movie table so movies without any rating are
# kept (their rating is NaN).
# Any existing 'rating' column is dropped first so that re-running this cell
# replaces it instead of producing 'rating_x' / 'rating_y' duplicates.
new_df = pd.merge(
    new_df.drop(columns='rating', errors='ignore'),
    average_ratings,
    on='movieId',
    how='left',
)
new_df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,3.920930
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,2.357143
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,3.071429
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0,4.000000
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0,3.500000
9739,193585,Flint (2017),Drama,6397426,479308.0,3.500000
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0,3.500000


In [21]:
# Concatenate the tags that different users attached to the same movie into one
# comma-separated string per movieId (repeated tags are kept as-is).
# Missing tags are dropped first because ','.join raises TypeError on NaN.
tags_concatenated = (
    tags_new.dropna(subset=['tag'])
    .groupby('movieId')['tag']
    .agg(','.join)
    .reset_index()
)

# Left-merge so movies without any tag are kept (their tag is NaN).
# Any existing 'tag' column is dropped first so that re-running this cell
# replaces it instead of producing 'tag_x' / 'tag_y' duplicates.
new_df = pd.merge(
    new_df.drop(columns='tag', errors='ignore'),
    tags_concatenated,
    on='movieId',
    how='left',
)
new_df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,3.920930,"pixar,pixar,fun"
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,3.431818,"fantasy,magic board game,Robin Williams,game"
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,3.259615,"moldy,old"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,2.357143,
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,3.071429,"pregnancy,remake"
...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0,4.000000,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0,3.500000,
9739,193585,Flint (2017),Drama,6397426,479308.0,3.500000,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0,3.500000,
