In [283]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [284]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

In [285]:
df_basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [286]:
df_ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000003,6.5,1894
3,tt0000004,5.5,178
4,tt0000005,6.2,2678


In [287]:
df_akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [288]:
#Basics
#Replace "\N" with np.nan
df_basics.replace({'\\N':np.nan}, inplace = True)
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [289]:
df_basics.isna().sum()

tconst                   0
titleType                0
primaryTitle            11
originalTitle           11
isAdult                  1
startYear          1370289
endYear           10134918
runtimeMinutes     7158084
genres              458487
dtype: int64

In [290]:
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10248781 entries, 0 to 10248780
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 703.7+ MB


In [291]:
df_basics.loc[pd.isnull(df_basics['runtimeMinutes'])]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
18,tt0000019,short,The Clown Barber,The Clown Barber,0,1898,,,"Comedy,Short"
22,tt0000024,short,Opening of the Kiel Canal,Opening of the Kiel Canal,0,1895,,,"News,Short"
23,tt0000025,short,The Oxford and Cambridge University Boat Race,The Oxford and Cambridge University Boat Race,0,1896,,,"News,Short,Sport"
36,tt0000038,short,The Ball Game,The Ball Game,0,1898,,,"Documentary,Short,Sport"
37,tt0000039,short,Barnet Horse Fair,Barnet Horse Fair,0,1896,,,Short
...,...,...,...,...,...,...,...,...,...
10248774,tt9916844,tvEpisode,Episode #3.15,Episode #3.15,0,2009,,,"Action,Drama,Family"
10248775,tt9916846,tvEpisode,Episode #3.18,Episode #3.18,0,2009,,,"Action,Drama,Family"
10248776,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,,,"Action,Drama,Family"
10248777,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"


In [292]:
#Eliminate movies that are null for runtimeMinutes
df_basics_rtm_nulls = pd.isnull(df_basics['runtimeMinutes'])
df_basics_rtm_nulls

0           False
1           False
2           False
3           False
4           False
            ...  
10248776     True
10248777     True
10248778     True
10248779    False
10248780    False
Name: runtimeMinutes, Length: 10248781, dtype: bool

In [293]:
df_basics = df_basics[~df_basics_rtm_nulls]

In [294]:
df_basics.loc[pd.isnull(df_basics['runtimeMinutes'])]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


In [295]:
df_basics.loc[pd.isnull(df_basics['genres'])]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100,
1616,tt0001630,movie,Der fremde Vogel,Der fremde Vogel,0,1911,,45,
1739,tt0001756,movie,Lucha por la herencia,Lucha por la herencia,0,1911,,92,
2069,tt0002094,movie,Charles IV,Charles IV,0,1912,,50,
2289,tt0002315,movie,El lobo de la sierra,El lobo de la sierra,0,1912,,76,
...,...,...,...,...,...,...,...,...,...
10247391,tt9913814,movie,Chikan densha: Muremure shanai,Chikan densha: Muremure shanai,0,1981,,60,
10247401,tt9913834,movie,Hiniku: Nure nawazeme,Hiniku: Nure nawazeme,0,1981,,64,
10247422,tt9913878,movie,Document porno: Yubi ijime,Document porno: Yubi ijime,0,1981,,61,
10247996,tt9915130,movie,Meet John Doe,Meet John Doe,0,,,120,


In [296]:
#Eliminate movies that are null for genre
df_basics_g_nulls = pd.isnull(df_basics['genres'])
df_basics_g_nulls

0           False
1           False
2           False
3           False
4           False
            ...  
10248731    False
10248737    False
10248772    False
10248779    False
10248780    False
Name: genres, Length: 3090697, dtype: bool

In [297]:
df_basics = df_basics[~df_basics_g_nulls]

In [298]:
df_basics.loc[pd.isnull(df_basics['genres'])]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


In [299]:
df_basics['titleType'].value_counts()

tvEpisode       1584476
short            617188
movie            390813
video            185720
tvMovie           92917
tvSeries          92451
tvSpecial         19226
tvMiniSeries      18091
tvShort            8747
videoGame           339
Name: titleType, dtype: int64

In [300]:
df_basics.loc[(df_basics['titleType'] == 'movie')]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
10248631,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
10248672,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
10248699,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
10248721,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,Drama


In [301]:
#keep only titleType==Movie
df_basics = df_basics.loc[(df_basics['titleType'] == 'movie')]

In [302]:
df_basics['titleType'].value_counts()

movie    390813
Name: titleType, dtype: int64

In [303]:
#keep startYear 2000-2022
df_basics['startYear'].value_counts()

2018    14444
2017    14431
2019    14189
2016    14015
2015    13537
        ...  
1894        1
1899        1
1904        1
1897        1
2028        1
Name: startYear, Length: 132, dtype: int64

In [304]:
year_filter = ((df_basics['startYear'] > '1999') & (df_basics['startYear'] < '2023'))
print(year_filter)

8           False
144         False
570         False
587         False
672         False
            ...  
10248631     True
10248672     True
10248699     True
10248721     True
10248731     True
Name: startYear, Length: 390813, dtype: bool


In [305]:
df_basics = df_basics.loc[year_filter]

In [306]:
#keep startYear 2000-2022
df_basics['startYear'].value_counts().sort_values().sort_index(0)

2000     3667
2001     3900
2002     4156
2003     4634
2004     5236
2005     5876
2006     6568
2007     7000
2008     8207
2009     9404
2010    10250
2011    10810
2012    11688
2013    12425
2014    13174
2015    13537
2016    14015
2017    14431
2018    14444
2019    14189
2020    11666
2021    12560
2022    13332
Name: startYear, dtype: int64

In [307]:
df_basics.shape

(225169, 9)

In [308]:
is_documentary = df_basics['genres'].str.contains('documentary',case=False)
is_documentary.value_counts()

False    148585
True      76584
Name: genres, dtype: int64

In [309]:
#Eliminate movies that include "Documentary" in genre
#is_documentary = df_basics['genres'].str.contains('documentary',case=False)
df_basics = df_basics[~is_documentary]

In [310]:
df_basics.shape

(148585, 9)

In [311]:
#Keep only US movies
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =df_basics['tconst'].isin(df_akas['titleId'])
keepers.value_counts()

True     147877
False       708
Name: tconst, dtype: int64

In [312]:
df_basics = df_basics[keepers]
df_basics.shape

(147877, 9)

In [313]:
#AKAS
(df_akas['region'] == 'US').value_counts()

False    36054859
True      1472351
Name: region, dtype: int64

In [314]:
#keep only US movies.
df_akas = df_akas.loc[(df_akas['region'] == 'US')]
df_akas.shape

(1472351, 8)

In [315]:
#Replace "\N" with np.nan
df_akas.replace({'\\N':np.nan}, inplace = True)
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [316]:
for i in df_ratings.columns:
    n_check = df_ratings.loc[(df_ratings[i] == '\\N')]
    print(n_check)
    

Empty DataFrame
Columns: [tconst, averageRating, numVotes]
Index: []
Empty DataFrame
Columns: [tconst, averageRating, numVotes]
Index: []
Empty DataFrame
Columns: [tconst, averageRating, numVotes]
Index: []


In [317]:
#Ratings
#Replace "\N" with np.nan (if any)
#No placeholder values to replace

In [318]:
df_ratings.shape

(1360556, 3)

In [319]:
#Keep only US movies
keepers_ratings =df_ratings['tconst'].isin(df_akas['titleId'])
keepers_ratings.value_counts()

False    848486
True     512070
Name: tconst, dtype: int64

In [320]:
df_ratings = df_ratings[keepers_ratings]
df_ratings.shape

(512070, 3)