Imports 

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Download locations of the three IMDb tables used in this notebook.
# NOTE: despite the shared "basics_url" prefix, each name points at a different
# table: basics_url -> title.basics, basics_url2 -> title.akas,
# basics_url3 -> title.ratings.
IMDB_DATASETS_ROOT = "https://datasets.imdbws.com"

basics_url = f"{IMDB_DATASETS_ROOT}/title.basics.tsv.gz"
basics_url2 = f"{IMDB_DATASETS_ROOT}/title.akas.tsv.gz"
basics_url3 = f"{IMDB_DATASETS_ROOT}/title.ratings.tsv.gz"

# Filtering/Cleaning AKAs

In [3]:
# Read the tab-separated akas table straight from its URL.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype guesses within a column.
akas = pd.read_csv(
    basics_url2,
    sep='\t',
    low_memory=False,
)

In [4]:
# The raw file spells missing values as the literal text "\N"; swap every
# such cell for a real NaN so pandas' null-handling methods recognise it.
akas_missing_marker = {'\\N': np.nan}
akas.replace(akas_missing_marker, inplace=True)

In [5]:
# Keep only the rows whose region code is exactly 'US'.
is_us_region = akas['region'] == 'US'
akas = akas.loc[is_us_region]

# Filtering/Cleaning Basics

In [6]:
# Read the tab-separated title.basics table straight from its URL.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype guesses within a column.
basics = pd.read_csv(
    basics_url,
    sep='\t',
    low_memory=False,
)

In [7]:
# The raw file spells missing values as the literal text "\N"; swap every
# such cell for a real NaN so the dropna step below can act on it.
basics_missing_marker = {'\\N': np.nan}
basics.replace(basics_missing_marker, inplace=True)

In [8]:
# Discard any title that is missing a runtime or a genre list.
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(subset=required_columns)

In [9]:
# Keep only rows whose titleType is exactly 'movie'.
is_movie = basics['titleType'] == 'movie'
basics = basics.loc[is_movie]

In [10]:
# Convert startYear to float so it can be compared numerically in the
# year-range filter; float (rather than int) tolerates NaN for unknown years.
start_year_as_float = basics['startYear'].astype(float)
basics = basics.assign(startYear=start_year_as_float)

In [11]:
# Keep titles whose startYear lies in the closed interval [2000, 2022];
# Series.between includes both endpoints by default, and NaN years fail the test.
released_2000_to_2022 = basics['startYear'].between(2000, 2022)

basics = basics[released_2000_to_2022]

In [12]:
# Drop every title whose genres string mentions "documentary"
# (matched case-insensitively anywhere in the string).
documentary_mask = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~documentary_mask]

In [13]:
# Restrict basics to titles whose tconst appears among the titleIds of the
# region-filtered akas frame, then print a summary of what is left.
us_title_ids = akas['titleId']
keepers_basics = basics['tconst'].isin(us_title_ids)
basics = basics.loc[keepers_basics]

basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84581 entries, 34803 to 9514411
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          84581 non-null  object 
 1   titleType       84581 non-null  object 
 2   primaryTitle    84581 non-null  object 
 3   originalTitle   84581 non-null  object 
 4   isAdult         84581 non-null  object 
 5   startYear       84581 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  84581 non-null  object 
 8   genres          84581 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.5+ MB


# Filtering/Cleaning Ratings

In [14]:
# Read the tab-separated title.ratings table straight from its URL.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype guesses within a column.
ratings = pd.read_csv(
    basics_url3,
    sep='\t',
    low_memory=False,
)

In [15]:
# Replace any literal "\N" placeholder with a real NaN, mirroring the
# cleaning applied to the other two tables.
ratings_missing_marker = {'\\N': np.nan}
ratings.replace(ratings_missing_marker, inplace=True)

In [16]:
# Restrict the ratings table to titles whose tconst appears among the
# titleIds of the region-filtered akas frame, then print a summary of what is left.
us_title_ids = akas['titleId']
keepers_ratings = ratings['tconst'].isin(us_title_ids)
ratings = ratings.loc[keepers_ratings]

ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 484321 entries, 0 to 1265771
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         484321 non-null  object 
 1   averageRating  484321 non-null  float64
 2   numVotes       484321 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.8+ MB


In [17]:
# Save the filtered dataframes as gzip-compressed CSV files under Data/.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise a fresh checkout would fail here
# with OSError only after all the downloads and filtering have finished.
os.makedirs("Data", exist_ok=True)

# index=False: the row index is not written to the output files.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)
print("Done!")

Done!
