In [1]:
import numpy as np
import pandas as pd


### Data Cleaning

In [2]:
# Load the links table (one row per movieId with its imdbId / tmdbId) and preview two rows.
links = pd.read_csv('data/links.csv')
links.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [3]:
# Load the movies table (movieId, title, pipe-separated genres) and preview two rows.
movies = pd.read_csv('data/movies.csv')
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
# Load the ratings table (userId, movieId, rating, timestamp) and preview two rows.
ratings = pd.read_csv('data/ratings.csv')
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [5]:
# Load the tags table (userId, movieId, free-text tag, timestamp) and preview two rows.
tags = pd.read_csv('data/tags.csv')
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996


In [6]:
# How often each distinct tag text occurs, most frequent first.
tags['tag'].value_counts()

In Netflix queue         131
atmospheric               36
thought-provoking         24
superhero                 24
surreal                   23
                        ... 
real estate                1
video game adaptation      1
beat poetry                1
cool                       1
dreams                     1
Name: tag, Length: 1589, dtype: int64

In [7]:
# Number of tag rows attached to each movieId, most-tagged first.
tags['movieId'].value_counts()

296     181
2959     54
924      41
293      35
7361     34
       ... 
6107      1
5878      1
5876      1
3192      1
8190      1
Name: movieId, Length: 1572, dtype: int64

Based on this initial preview, two of the datasets are useful for our project: movies and ratings. Together they contain the ratings, movie ids and user ids that we need to build a recommender system.

In [8]:
# Preview the first five rows of the movies table again before merging.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Preview the first five rows of the ratings table before merging.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# Merge the movies and ratings tables on movieId.
# how='left' keeps every movie, so a movie with no matching rating row gets NaN in
# userId / rating / timestamp; those NaNs are why these columns display as floats.
df = pd.merge(movies, ratings, on='movieId', how='left')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


The timestamp column adds no value to our analysis, so we can drop it.

In [11]:
# Drop the timestamp column, which is not used in this analysis.
# columns= already identifies the axis, so axis=1 is unnecessary; rebinding df
# instead of inplace=True keeps the step explicit and chainable.
df = df.drop(columns=['timestamp'])
df.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5


In [12]:
# Count missing values in each column of the merged frame.
df.isna().sum()

movieId     0
title       0
genres      0
userId     18
rating     18
dtype: int64

There are key columns (userId and rating) with missing values. We will investigate the extent of the missing values before deciding how to handle them.

In [13]:
# Summarise missing values per column: absolute count and percentage of all rows.
no_missing = df.isna().sum()
percent_missing = (no_missing * 100 / len(df)).round(2)
missing_value_df = pd.DataFrame(
    {
        'no_missing_values': no_missing,
        'percent_missing': percent_missing,
    }
)
missing_value_df

Unnamed: 0,no_missing_values,percent_missing
movieId,0,0.0
title,0,0.0
genres,0,0.0
userId,18,0.02
rating,18,0.02


In [14]:
# Remove every row that has a missing value in any column, then re-count the
# missing values per column to confirm none remain.
df = df.dropna()
df.isna().sum()

movieId    0
title      0
genres     0
userId     0
rating     0
dtype: int64

## Feature Engineering

In [21]:
# Extract the four-digit release year from the trailing "(YYYY)" in the title.
# \s*$ tolerates stray whitespace after the closing parenthesis, which would
# otherwise leave the year missing; titles with no such suffix get <NA>.
# assign() returns a new frame rather than writing a column onto the dropna()
# result in place, and re-running the cell gives the same result.
df = df.assign(
    release_year=pd.to_numeric(
        df['title'].str.extract(r'\((\d{4})\)\s*$', expand=False),
        errors='coerce',
    ).astype('Int64')
)
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,release_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,1995
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,1995
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1995
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1995
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1995


In [24]:
#Assigning the release year to decades
#Define a function to convert year into decade
def year_to_decade(year):
    """Map a release year to the first year of its decade (e.g. 1995 -> 1990).

    Parameters
    ----------
    year : number or missing value
        The release year. Anything pd.isna() treats as missing (None, NaN,
        pd.NA) is accepted.

    Returns
    -------
    int or None
        The decade as a plain int, or None when the year is missing.
    """
    return None if pd.isna(year) else 10 * int(year / 10)

# Apply the function element-wise to release_year to build the decade column.
df['decade'] = df['release_year'].apply(year_to_decade)

## Exploratory Data Analysis