In [1]:
#Imports
import pandas as pd
import numpy as np

# Loading Data

In [2]:
# Download locations of the three IMDb dataset files used in this notebook
# (each file name ends in .tsv.gz).
title_akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
# Read the three tab-separated files straight from their URLs, in the same
# order as before (akas, basics, ratings). low_memory=False makes pandas parse
# each file in a single pass instead of in chunks.
akas, basics, ratings = (
    pd.read_csv(url, sep='\t', low_memory=False)
    for url in (title_akas_url, title_basics_url, title_ratings_url)
)

In [4]:
# Preview the first rows of the akas frame to check that it loaded as expected.
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


# Checking/Fixing \N Placeholder Values

In [5]:
# Per-column NaN count for basics *before* the '\N' placeholders are converted;
# placeholder strings are not counted as missing here.
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      10
originalTitle     10
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [6]:
# Per-column NaN count for akas before the '\N' placeholders are converted.
akas.isna().sum()

titleId              0
ordering             0
title                5
region             105
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [7]:
# Per-column NaN count for ratings.
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [8]:
# Handling \N placeholder values.
# Cells equal to the literal string '\N' are replaced with NaN so that
# isna()/dropna() treat them as missing.
# The result is rebound rather than modified with inplace=True: the frames
# shown by earlier cells are not mutated behind the reader's back, and the
# statement stays chainable.
basics = basics.replace({'\\N': np.nan})
akas = akas.replace({'\\N': np.nan})

In [9]:
# Sanity check: per-column NaN count for basics after the '\N' replacement,
# so the former placeholders now show up as missing values.
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           10
originalTitle          10
isAdult                 1
startYear         1211857
endYear           9054035
runtimeMinutes    6689466
genres             416048
dtype: int64

# Same sanity check for akas: per-column NaN count after the '\N' replacement.
akas.isna().sum()

# Preprocessing Dataframes

## Basics

In [34]:
# Summarise basics: index range, column dtypes and non-null counts.
# NOTE(review): the saved output for this cell carries execution count 34 and
# reports 366,291 rows, the same number as the movie-only rows further down,
# so it appears to have been captured after the later filtering cells ran.
# Re-run the notebook top to bottom to refresh it.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 366291 entries, 8 to 9149120
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          366291 non-null  object
 1   titleType       366291 non-null  object
 2   primaryTitle    366291 non-null  object
 3   originalTitle   366291 non-null  object
 4   isAdult         366291 non-null  object
 5   startYear       360492 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  366291 non-null  object
 8   genres          366291 non-null  object
dtypes: object(9)
memory usage: 27.9+ MB


### Eliminating titles that are null for runtimeMinutes

In [13]:
# Keep only the rows that have a runtimeMinutes value.
basics = basics[basics['runtimeMinutes'].notna()]

# Remaining nulls in the column (expected to be 0).
basics['runtimeMinutes'].isna().sum()

0

In [15]:
# Per-column NaN count for basics after rows without runtimeMinutes were dropped.
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37591
endYear           2413206
runtimeMinutes          0
genres              67284
dtype: int64

In [20]:
# Eliminating rows that are null for genres: keep only rows with a genres value.
basics = basics[basics['genres'].notna()]

# Remaining nulls in the column (expected to be 0).
basics['genres'].isna().sum()

0

### Keeping only titleType == 'movie'

In [21]:
# Count the rows per titleType to see which categories are present before filtering.
basics['titleType'].value_counts()

tvEpisode       1061461
short            573177
movie            366291
video            174788
tvMovie           88216
tvSeries          86381
tvSpecial         16432
tvMiniSeries      16008
tvShort            9372
videoGame           294
Name: titleType, dtype: int64

In [29]:
# Boolean mask: True for rows whose titleType is exactly 'movie'.
movie_filter = basics['titleType'].eq('movie')

# Keep only the rows selected by the mask.
basics = basics.loc[movie_filter]

# Sanity check: 'movie' should be the only titleType left.
basics['titleType'].value_counts()

movie    366291
Name: titleType, dtype: int64

### Keeping startYear 2000-2022

In [30]:
# Distribution of startYear values before the year-range filter is applied.
basics['startYear'].value_counts()

2017    14179
2018    14119
2016    13810
2019    13788
2015    13320
        ...  
1894        1
1899        1
1904        1
1906        1
1896        1
Name: startYear, Length: 129, dtype: int64

In [35]:
# Keep only rows whose startYear lies between 2000 and 2022 (both inclusive).
# The column is object dtype, so the original filter compared the years as
# strings. Lexicographic comparison only matches numeric order when every
# value is a four-digit string. Comparing numerically removes that assumption:
# values that cannot be parsed become NaN, and between() treats NaN as
# out of range.
# The startYear column itself is left unchanged; only the mask is numeric.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[start_year.between(2000, 2022)]

In [36]:
# Check the result: only startYear values from 2000 through 2022 should remain.
basics['startYear'].value_counts()

2017    14179
2018    14119
2016    13810
2019    13788
2015    13320
2014    12980
2013    12264
2021    11692
2012    11534
2020    11243
2011    10672
2010    10115
2009     9261
2008     8067
2022     7389
2007     6878
2006     6425
2005     5760
2004     5127
2003     4526
2002     4084
2001     3810
2000     3591
Name: startYear, dtype: int64

### Eliminating movies that include "Documentary" in genres

In [38]:
# Distribution of the genres strings before documentaries are removed.
basics['genres'].value_counts()

Documentary                   50676
Drama                         34691
Comedy                        13089
Comedy,Drama                   6244
Horror                         5590
                              ...  
Action,Music,Sci-Fi               1
Drama,Mystery,Short               1
Adventure,Reality-TV,Sport        1
Biography,Comedy,War              1
Crime,Fantasy,Sci-Fi              1
Name: genres, Length: 1184, dtype: int64

In [39]:
# Exclude every row whose genres string mentions "documentary"
# (case-insensitive substring match).
# regex=False: the pattern is a plain literal, so no regex engine is needed.
# na=False: a missing genres value produces False instead of NaN, so the mask
# is strictly boolean and `~` cannot fail if this cell runs on a frame that
# still contains null genres. Such rows are kept as "not documentary".
is_documentary = basics['genres'].str.contains(
    'documentary', case=False, na=False, regex=False
)
basics = basics[~is_documentary]

# Inspect the remaining genre combinations.
basics['genres'].value_counts()

Drama                      34691
Comedy                     13089
Comedy,Drama                6244
Horror                      5590
Drama,Romance               4159
                           ...  
Comedy,Game-Show               1
Horror,Music,Mystery           1
Drama,Musical,Sport            1
Adventure,History,Music        1
Crime,Fantasy,Sci-Fi           1
Name: genres, Length: 966, dtype: int64