# Preprocessing
Below we preprocess the IMDb datasets and remove the data that is not useful for our analysis.

In [1]:
#load libraries
import os

import numpy as np
import pandas as pd

In [2]:
# Download locations of the three IMDb tables used in this notebook
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'


In [3]:
# Read the tab-separated title.basics table straight from its URL.
# low_memory=False turns off pandas' chunked parsing of the file.
basics = pd.read_csv(
    basics_url,
    sep='\t',
    low_memory=False,
)



In [4]:
# Show the column names, dtypes and memory usage of the raw basics table
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9806297 entries, 0 to 9806296
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 673.3+ MB


In [5]:
# Count missing values per column. The counts are misleading at this point:
# the file marks missing entries with the literal string \N, which is not
# counted as NaN, so the placeholders must be replaced to see the true totals.
basics.isnull().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            14
dtype: int64

In [6]:
# Permanently turn every literal \N placeholder into a real NaN
basics = basics.replace('\\N', np.nan)

In [7]:
# Recount missing values per column after the \N replacement
basics.isnull().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1327682
endYear           9700427
runtimeMinutes    6917265
genres             442042
dtype: int64

In [8]:
# Look at how often each runtimeMinutes value occurs
basics['runtimeMinutes'].value_counts()

30      217731
22      159059
60      154138
44       75649
15       75217
         ...  
1554         1
612          1
850          1
562          1
2088         1
Name: runtimeMinutes, Length: 885, dtype: int64

In [9]:
# Count fully duplicated rows (a result of 0 means there are none)
basics.duplicated().sum()

0

In [10]:
# Tally missing (True) versus present (False) runtimeMinutes values
basics['runtimeMinutes'].isna().value_counts()

True     6917265
False    2889032
Name: runtimeMinutes, dtype: int64

In [11]:
# Keep only the rows that have a runtimeMinutes value
basics = basics.dropna(subset=['runtimeMinutes'])

In [12]:
# Verify that no missing runtimeMinutes values remain
basics['runtimeMinutes'].isna().value_counts()

False    2889032
Name: runtimeMinutes, dtype: int64

In [13]:
# Tally missing (True) versus present (False) genres values
basics['genres'].isna().value_counts()

False    2812457
True       76575
Name: genres, dtype: int64

In [14]:
# Keep only the rows that have a genres value
basics = basics.dropna(subset=['genres'])

In [15]:
# Verify that no missing genres values remain
basics['genres'].isna().value_counts()

False    2812457
Name: genres, dtype: int64

In [16]:
# See which title types are present before restricting the data to movies
basics['titleType'].value_counts()

tvEpisode       1425631
short            599275
movie            381476
video            180144
tvMovie           91428
tvSeries          90217
tvSpecial         18051
tvMiniSeries      17123
tvShort            8790
videoGame           322
Name: titleType, dtype: int64

In [17]:
# Boolean mask that is True for rows whose titleType is 'movie'
movie_filter = basics['titleType'].eq('movie')


In [18]:
# Keep only the movie rows and make the change permanent.
# The explicit .copy() makes the result an independent DataFrame, so the
# startYear column assignment further down does not write into a filtered
# slice (which triggers pandas' SettingWithCopyWarning).
basics = basics[movie_filter].copy()

In [19]:
# Confirm which title types remain after applying the movie filter
basics['titleType'].value_counts()

movie    381476
Name: titleType, dtype: int64

In [20]:
# Convert startYear to float.
# assign() returns a new DataFrame with the converted column in its original
# position, instead of setting a column on the boolean-filtered frame, which
# raises SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(float))

In [21]:
# Inspect the spread of startYear values before keeping only 2000-2022
basics['startYear'].value_counts()

2017.0    14366
2018.0    14321
2019.0    14054
2016.0    13949
2015.0    13475
          ...  
1904.0        1
1897.0        1
1896.0        1
2026.0        1
1894.0        1
Name: startYear, Length: 130, dtype: int64

In [23]:
# Lower bound mask: True for start years after 1999 (i.e. 2000 onwards)
year_filter = basics['startYear'].gt(1999.0)

In [25]:
# Upper bound mask: True for start years up to and including 2022
year_filter2 = basics['startYear'].le(2022.0)

In [26]:
# Keep only the rows that satisfy both year bounds
basics = basics.loc[year_filter & year_filter2]

In [27]:
# Confirm that only start years from 2000 through 2022 remain
basics['startYear'].value_counts()

2017.0    14366
2018.0    14321
2019.0    14054
2016.0    13949
2015.0    13475
2014.0    13100
2022.0    12730
2013.0    12380
2021.0    12327
2012.0    11625
2020.0    11561
2011.0    10773
2010.0    10200
2009.0     9351
2008.0     8147
2007.0     6962
2006.0     6512
2005.0     5828
2004.0     5201
2003.0     4587
2002.0     4131
2001.0     3861
2000.0     3638
Name: startYear, dtype: int64

In [28]:
# Flag every movie whose genres string mentions "documentary"
# (case-insensitive), then keep only the rows that are not flagged
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]

In [29]:
# Review the remaining genre combinations after removing documentaries
basics['genres'].value_counts()

Drama                         35994
Comedy                        13436
Comedy,Drama                   6458
Horror                         5783
Drama,Romance                  4303
                              ...  
Action,Animation,Game-Show        1
Adult,Crime,Mystery               1
Family,Musical,Sport              1
Horror,Music,Mystery              1
Crime,Fantasy,Sci-Fi              1
Name: genres, Length: 969, dtype: int64

In [30]:
# Read the tab-separated title.akas table straight from its URL.
# low_memory=False turns off pandas' chunked parsing of the file.
akas = pd.read_csv(
    akas_url,
    sep='\t',
    low_memory=False,
)

In [31]:
# Count missing values per column in the raw akas table
akas.isnull().sum()

titleId              0
ordering             0
title                5
region             111
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [32]:
# Turn every literal \N placeholder into a real NaN
akas = akas.replace('\\N', np.nan)

In [33]:
# Recount missing values to reveal the ones that were hidden behind \N
akas.isnull().sum()

titleId                   0
ordering                  0
title                     5
region              1888398
language            6583607
types              30184309
attributes         35465560
isOriginalTitle        2109
dtype: int64

In [34]:
# Boolean mask that is True for akas rows whose region is 'US'
us_filter = akas['region'].eq('US')

In [35]:
# Keep only the US rows of akas
akas = akas.loc[us_filter]

In [36]:
# Confirm which region values remain after applying the US filter
akas['region'].value_counts()

US    1432658
Name: region, dtype: int64

In [37]:
# Mark the rows of basics whose tconst also appears as a titleId in the
# US-filtered akas table, then display the mask
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9805970     True
9805979     True
9806018    False
9806063     True
9806147    False
Name: tconst, Length: 147348, dtype: bool

In [38]:
# Keep only the titles marked by the keepers mask and show the result
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9805435,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9805830,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9805970,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9805979,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [39]:
# Read the tab-separated title.ratings table straight from its URL.
# low_memory=False turns off pandas' chunked parsing of the file.
ratings = pd.read_csv(
    ratings_url,
    sep='\t',
    low_memory=False,
)

In [40]:
# Count missing values per column (the result shows none)
ratings.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [41]:
# Mark the rows of ratings whose tconst also appears as a titleId in the
# US-filtered akas table, then display the mask
keepers2 = ratings['tconst'].isin(akas['titleId'])
keepers2

0           True
1           True
2          False
3          False
4           True
           ...  
1306411    False
1306412    False
1306413    False
1306414    False
1306415    False
Name: tconst, Length: 1306416, dtype: bool

In [42]:
# Keep only the ratings marked by the keepers2 mask and show the result
ratings = ratings.loc[keepers2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
4,tt0000005,6.2,2607
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1306377,tt9916200,8.1,229
1306378,tt9916204,8.1,262
1306385,tt9916348,8.1,18
1306386,tt9916362,6.4,5307


In [43]:
# Create the output folder; exist_ok=True means no error if it already exists.
# (os is imported with the other libraries in the first cell.)
os.makedirs('Data/', exist_ok=True)
# Confirm the folder exists and list what it currently contains
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [44]:
# Write an uncompressed CSV copy of basics to the working directory
# (index is not disabled here, so the row index is written as well)
basics.to_csv(path_or_buf='basics.csv')

In [45]:
# Write an uncompressed CSV copy of akas to the working directory
# (index is not disabled here, so the row index is written as well)
akas.to_csv(path_or_buf='akas.csv')

In [46]:
# Write an uncompressed CSV copy of ratings to the working directory
# (index is not disabled here, so the row index is written as well)
ratings.to_csv(path_or_buf='ratings.csv')

In [47]:
# Save the cleaned basics table as a gzip-compressed CSV without the row index
basics.to_csv(
    "Data/title_basics.csv.gz",
    compression='gzip',
    index=False,
)

In [48]:
# Save the US-only akas table as a gzip-compressed CSV without the row index
akas.to_csv(
    "Data/title_akas.csv.gz",
    compression='gzip',
    index=False,
)

In [49]:
# Save the filtered ratings table as a gzip-compressed CSV without the row index
ratings.to_csv(
    "Data/title_ratings.csv.gz",
    compression='gzip',
    index=False,
)

In [50]:
# Read the saved basics file back in and preview the first rows
basics = pd.read_csv(
    "Data/title_basics.csv.gz",
    low_memory=False,
)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [51]:
# Read the saved akas file back in and preview the first rows
akas = pd.read_csv(
    "Data/title_akas.csv.gz",
    low_memory=False,
)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [52]:
# Read the saved ratings file back in and preview the first rows
ratings = pd.read_csv(
    "Data/title_ratings.csv.gz",
    low_memory=False,
)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
2,tt0000005,6.2,2607
3,tt0000006,5.2,181
4,tt0000007,5.4,816
