Imports 

In [36]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [37]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics_url2="https://datasets.imdbws.com/title.akas.tsv.gz"
basics_url3="https://datasets.imdbws.com/title.ratings.tsv.gz"

# Filtering/Coding AKAs

In [38]:
akas = pd.read_csv(basics_url2, sep='\t', low_memory=False)

In [39]:
akas.replace({'\\N':np.nan}, inplace = True)

In [40]:
akas = akas[akas['region']=='US']

# Filtering/Cleaning Basics

In [41]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [42]:
basics.replace({'\\N':np.nan}, inplace = True)

In [43]:
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [44]:
basics = basics[basics['titleType']== 'Movie']

In [45]:
filter1 = basics['startYear']>= 2000
filter2 = basics['startYear']<=2022

basics = basics[filter1 & filter2]

In [46]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [47]:
# Filter the basics table down to only include the US movies by using the filtered akas dataframe above
keepers_basics = basics['tconst'].isin(akas['titleId'])
basics = basics[keepers_basics]

basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          0 non-null      object
 1   titleType       0 non-null      object
 2   primaryTitle    0 non-null      object
 3   originalTitle   0 non-null      object
 4   isAdult         0 non-null      object
 5   startYear       0 non-null      object
 6   endYear         0 non-null      object
 7   runtimeMinutes  0 non-null      object
 8   genres          0 non-null      object
dtypes: object(9)
memory usage: 0.0+ bytes


# Filtering/Cleaning Ratings

In [48]:
ratings = pd.read_csv(basics_url3,  sep='\t', low_memory=False)

In [49]:
ratings.replace({'\\N':np.nan}, inplace = True)

In [50]:
# Filter the basics table down to only include the US movies by using the filtered akas dataframe above
keepers_ratings = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers_ratings]

ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 479086 entries, 0 to 1258352
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         479086 non-null  object 
 1   averageRating  479086 non-null  float64
 2   numVotes       479086 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.6+ MB


In [51]:
# Save current dataframes to files.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)
print("Done!")

Done!
