In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
#importing the datasets
# Reads the raw movie metadata and the user ratings from the raw_datasets/ folder.
# This cell emits a pandas DtypeWarning (mixed value types inside a column).
# NOTE(review): passing low_memory=False or an explicit dtype= mapping to
# read_csv is the documented way to avoid that warning.
movies_metadata = pd.read_csv("raw_datasets/movies_metadata.csv")
ratings = pd.read_csv("raw_datasets/ratings.csv")

  interactivity=interactivity, compiler=compiler, result=result)


### I got a DtypeWarning while loading the datasets, which means that at least one column contains values of mixed dtypes. Before committing any changes to the datasets, I will create copies of the originals.

In [3]:
# Work on copies so the frames loaded from disk stay untouched.
mmd, rts = movies_metadata.copy(), ratings.copy()

In [4]:
#checking general information of movies_metadata
# (column names, non-null counts and dtypes of the working copy)
mmd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [5]:
#checking general information of ratings
# (column names, dtypes and memory usage of the working copy)
rts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


### The movies_metadata set contains columns that are not needed for this analysis, columns with an incorrect dtype, and a few null values.

In [6]:
#dropping non related columns
# Keep only genres, id and title, selected by name instead of dropping the
# other 21 columns by position: this does not depend on the column order of
# the CSV, and the cell can be re-run without raising an IndexError.
# .copy() makes the result an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
kept_columns = ['genres', 'id', 'title']
mmd = mmd.loc[:, kept_columns].copy()

In [7]:
# confirm which columns remain and how many non-null values each one has
mmd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genres  45466 non-null  object
 1   id      45466 non-null  object
 2   title   45460 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [8]:
# Show every row whose title is missing.
missing_title = mmd['title'].isna()
mmd[missing_title]

Unnamed: 0,genres,id,title
19729,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",82663,
19730,"[{'name': 'Carousel Productions', 'id': 11176}...",1997-08-20,
29502,"[{'id': 16, 'name': 'Animation'}, {'id': 878, ...",122662,
29503,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...",2012-09-29,
35586,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 28,...",249260,
35587,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...",2014-01-01,


### Because of the limited data and the insignificant amount of missing data (6 / 45466 rows), I will remove those rows from the dataset. Before doing that, note that some of the rows with a missing title have a valid id, so I need to check whether any users rated those movies before they were removed. (I treated the titleless movies as movies that were removed from the platform.)

In [9]:
# Look for ratings that reference the ids of the movies with a missing title.
removed_movie_ids = [82663, 122662, 249260]
rts[rts['movieId'].isin(removed_movie_ids)]

Unnamed: 0,userId,movieId,rating,timestamp


In [10]:
#none of the ids were present in the rating history, so we can safely remove them
# dropna() removes every row that contains a NaN in any column; at this point
# only the title column has missing values, so exactly those rows are dropped.
mmd.dropna(inplace=True)
mmd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45460 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genres  45460 non-null  object
 1   id      45460 non-null  object
 2   title   45460 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [16]:
#checking for duplicate values in the dataset by using the title column
# value_counts() lists how often each title occurs; a repeated title only
# hints at a duplicate, because the rows can still differ in id or genres.
mmd['title'].value_counts()

Cinderella                                 11
Hamlet                                      9
Alice in Wonderland                         9
Les Misérables                              8
Beauty and the Beast                        8
                                           ..
Zebra                                       1
The Red Dwarf                               1
Hurricane of Fun: The Making of Wet Hot     1
Attack on the Pin-Up Boys                   1
31st of June                                1
Name: title, Length: 42277, dtype: Int64

In [18]:
#dropping duplicates
# With no subset argument, drop_duplicates() compares all columns, so only
# rows that are identical in genres, id and title are removed.
mmd.drop_duplicates(inplace=True)
mmd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45430 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genres  45430 non-null  object
 1   id      45430 non-null  int64 
 2   title   45430 non-null  string
dtypes: int64(1), object(1), string(1)
memory usage: 1.4+ MB


In [11]:
# Cast title to the pandas string dtype and id to 64-bit integers in one call.
# NOTE(review): this cell carries execution count 11 but is placed after the
# duplicate-handling cells (16 and 18), whose output already shows the
# converted dtypes - consider moving it above them so that
# Restart & Run All reproduces the displayed outputs.
column_dtypes = {'title': 'string', 'id': np.int64}
mmd = mmd.astype(column_dtypes)
mmd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45460 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genres  45460 non-null  object
 1   id      45460 non-null  int64 
 2   title   45460 non-null  string
dtypes: int64(1), object(1), string(1)
memory usage: 1.4+ MB


### The genres column contains a list of dictionaries for each movie. Each dictionary contains a genre's name and id. I will use the `ast` (Abstract Syntax Trees) module to extract only the genre names and replace each list of dictionaries with a list of genre names.

In [19]:
def extract_genre_names(raw_genres):
    """Return the list of genre names stored in one cell of the genres column.

    Parameters
    ----------
    raw_genres : str or list
        Either the raw text read from the CSV (a Python-literal list of
        dictionaries, each with a 'name' key) or a list that was already
        parsed by a previous run of this cell.

    Returns
    -------
    list
        The genre names, in their original order.
    """
    # Parse only raw text: ast.literal_eval rejects lists, so without this
    # guard re-running the cell would raise an exception.
    parsed = ast.literal_eval(raw_genres) if isinstance(raw_genres, str) else raw_genres
    return [entry['name'] if isinstance(entry, dict) else entry for entry in parsed]


# A single pass over the column replaces the two chained apply() calls.
genres = mmd['genres'].apply(extract_genre_names)
mmd['genres'] = genres
mmd.head()

Unnamed: 0,genres,id,title
0,"[Animation, Comedy, Family]",862,Toy Story
1,"[Adventure, Fantasy, Family]",8844,Jumanji
2,"[Romance, Comedy]",15602,Grumpier Old Men
3,"[Comedy, Drama, Romance]",31357,Waiting to Exhale
4,[Comedy],11862,Father of the Bride Part II


In [20]:
# Write both cleaned frames to CSV (without the index) for later use.
for frame, out_path in ((mmd, 'movies_md.csv'), (rts, 'ratings_nw.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
#fin