# Preprocessing
Below we are going to preprocess the IMDB dataset and clean out not useful data

In [1]:
#load libraries
import pandas as pd
import numpy as np

In [2]:
basics_url='https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url='https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url='https://datasets.imdbws.com/title.akas.tsv.gz'


In [3]:
#load in dataset this is the basics one first
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)



In [4]:
#display the info 
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9826265 entries, 0 to 9826264
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 674.7+ MB


In [5]:
#check for missing data this check shows only primary title original title and genres are missing values but \N was used on this data and must be replaced to see tru missing values
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            14
dtype: int64

In [6]:
#premanantly replace \N with nan
basics.replace({'\\N':np.nan}, inplace = True)

In [7]:
#check for missing data again
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1330171
endYear           9720083
runtimeMinutes    6928273
genres             442923
dtype: int64

In [8]:
#check for missing data again
basics['runtimeMinutes'].value_counts()

30      217789
60      159473
22      159177
44       75721
15       75304
         ...  
1554         1
612          1
850          1
562          1
2088         1
Name: runtimeMinutes, Length: 885, dtype: int64

In [9]:
#no duplicates
basics.duplicated().sum()

0

In [10]:
#check how many values are missing in runtimeminutes columns
basics['runtimeMinutes'].isnull().value_counts()

True     6928273
False    2897992
Name: runtimeMinutes, dtype: int64

In [11]:
#drop them
basics.dropna(subset=['runtimeMinutes'], inplace=True)

In [12]:
#check again and theyre gone
basics['runtimeMinutes'].isnull().value_counts()

False    2897992
Name: runtimeMinutes, dtype: int64

In [13]:
#check how many rows have null values
basics['genres'].isnull().value_counts()

False    2821369
True       76623
Name: genres, dtype: int64

In [14]:
#drop them
basics.dropna(subset=['genres'], inplace=True)

In [15]:
#check they were dropped
basics['genres'].isnull().value_counts()

False    2821369
Name: genres, dtype: int64

In [16]:
#check that we only include movie titles 
basics['titleType'].value_counts()

tvEpisode       1432862
short            600149
movie            381850
video            180311
tvMovie           91496
tvSeries          90323
tvSpecial         18096
tvMiniSeries      17153
tvShort            8807
videoGame           322
Name: titleType, dtype: int64

In [17]:
#make a filter to filter out movies
movie_filter = basics['titleType'] == 'movie'


In [18]:
#apply filter to dataset and make it permanant
basics = basics[movie_filter]

In [19]:
#check if it changed
basics['titleType'].value_counts()

movie    381850
Name: titleType, dtype: int64

In [20]:
#convert Start year to a float
basics['startYear'] = basics['startYear'].astype(float)

In [21]:
#filter out only movies between 2000 and 2022
basics['startYear'].value_counts()

2017.0    14366
2018.0    14325
2019.0    14057
2016.0    13953
2015.0    13477
          ...  
1899.0        1
1904.0        1
1897.0        1
1896.0        1
1894.0        1
Name: startYear, Length: 130, dtype: int64

In [22]:
#make a filter for years below 2000
year_filter = basics['startYear'] > 1999.0

In [23]:
#make a filter for years above 2022
year_filter2 = basics['startYear']<= 2022.0

In [24]:
#combine both filters and apply them to the df
basics= basics.loc[year_filter & year_filter2,:]

In [25]:
#check results
basics['startYear'].value_counts()

2017.0    14366
2018.0    14325
2019.0    14057
2016.0    13953
2015.0    13477
2014.0    13103
2022.0    12761
2013.0    12385
2021.0    12339
2012.0    11628
2020.0    11566
2011.0    10777
2010.0    10204
2009.0     9354
2008.0     8153
2007.0     6962
2006.0     6517
2005.0     5831
2004.0     5204
2003.0     4589
2002.0     4131
2001.0     3866
2000.0     3641
Name: startYear, dtype: int64

In [26]:
#make a filter to remove the documantary type movies
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [27]:
#check results
basics['genres'].value_counts()

Drama                        36001
Comedy                       13442
Comedy,Drama                  6461
Horror                        5782
Drama,Romance                 4305
                             ...  
Comedy,History,Mystery           1
Animation,Biography,Sport        1
Adventure,History,Music          1
Adventure,History,War            1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 967, dtype: int64

In [28]:
#load akas dataframe
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

ParserError: Error tokenizing data. C error: out of memory

In [None]:
#check it out
akas.isna().sum()

In [None]:
#replace any \N values with np.nan
akas.replace({'\\N':np.nan}, inplace = True)

In [None]:
#check resuts for more hidden missing values
akas.isna().sum()

In [None]:
#filter out the US only moves 
us_filter = akas['region'] =='US'

In [None]:
#apply it to the dataset
akas = akas[us_filter]

In [None]:
#check results
akas['region'].value_counts()

In [None]:
#apply this to the basics df
keepers =basics['tconst'].isin(akas['titleId'])
keepers

In [None]:
basics = basics[keepers]
basics

In [None]:
#Load rating dataset
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [None]:
#no missisng values
ratings.isna().sum()

In [None]:
#filter only us movies using akas df
keepers2 =ratings['tconst'].isin(akas['titleId'])
keepers2

In [None]:
#check results
ratings= ratings[keepers2]
ratings

In [None]:
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

In [None]:
basics.to_csv('basics.csv')

In [None]:
akas.to_csv('akas.csv')

In [None]:
ratings.to_csv('ratings.csv')

In [None]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [None]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [None]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [None]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

In [None]:
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

In [None]:
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()