In [1]:
import pandas as pd
import numpy as np

In [2]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep = "\t", low_memory = False)
akas = pd.read_csv(akas_url, sep = "\t", low_memory = False)
ratings = pd.read_csv(ratings_url, sep = "\t", low_memory = False)

# Akas Processing

In [4]:
akas.replace({"\\N":np.nan}, inplace = True)

In [5]:
akas.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1865119
language            6295240
types              33484260
attributes         33484260
isOriginalTitle        2187
dtype: int64

# Basics Processing

In [6]:
basics.replace({"\\N":np.nan}, inplace = True)

In [7]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1242667
endYear           9199706
runtimeMinutes    6793333
genres             428698
dtype: int64

In [8]:
basics.dropna(subset = ["runtimeMinutes", "genres"], inplace = True)

In [9]:
basics.query("titleType == 'movie'", inplace = True)
basics["titleType"].value_counts()

movie    369706
Name: titleType, dtype: int64

In [10]:
basics.query("startYear >= '2000' & startYear <= '2022'", inplace = True)
basics["startYear"].value_counts()

2017    14226
2018    14162
2016    13844
2019    13829
2015    13345
2014    12995
2013    12294
2021    11825
2012    11549
2020    11292
2011    10694
2010    10131
2022     9426
2009     9287
2008     8083
2007     6890
2006     6436
2005     5772
2004     5141
2003     4536
2002     4093
2001     3819
2000     3595
Name: startYear, dtype: int64

In [11]:
is_documentary = basics["genres"].str.contains("documentary", case = False)
basics = basics[~is_documentary]

In [12]:
keepers = basics["tconst"].isin(akas["titleId"])
keepers

34792      True
61094      True
67640      True
77934      True
86770      True
           ... 
9296737    True
9296746    True
9296785    True
9296830    True
9296914    True
Name: tconst, Length: 143820, dtype: bool

In [13]:
basics = basics[keepers]

# Ratings Processing

In [14]:
ratings.replace({"\\N":np.nan}, inplace = True)

In [15]:
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [16]:
keepers2 = ratings["tconst"].isin(akas["titleId"])
keepers2

0           True
1           True
2           True
3           True
4           True
           ...  
1237935    False
1237936     True
1237937     True
1237938     True
1237939    False
Name: tconst, Length: 1237940, dtype: bool

In [17]:
ratings = ratings[keepers2]

In [18]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33484260 entries, 0 to 33484259
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   titleId          object 
 1   ordering         int64  
 2   title            object 
 3   region           object 
 4   language         object 
 5   types            float64
 6   attributes       float64
 7   isOriginalTitle  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 2.0+ GB


In [19]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143153 entries, 34792 to 9296914
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          143153 non-null  object
 1   titleType       143153 non-null  object
 2   primaryTitle    143153 non-null  object
 3   originalTitle   143153 non-null  object
 4   isAdult         143153 non-null  object
 5   startYear       143153 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  143153 non-null  object
 8   genres          143153 non-null  object
dtypes: object(9)
memory usage: 10.9+ MB


In [20]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925098 entries, 0 to 1237938
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         925098 non-null  object 
 1   averageRating  925098 non-null  float64
 2   numVotes       925098 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.2+ MB
