In [23]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [24]:
import pandas as pd
import numpy as np
import os

In [25]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [26]:
print(basics.columns)

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')


In [27]:
# Title Basics
basics.replace(r'\\N', np.nan, inplace=True)
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])
basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')  # Convert 'startYear' to numeric
basics = basics[(basics['titleType'] == 'movie') & (basics['startYear'].between(2000, 2022))]
basics = basics[~basics['genres'].str.contains('Documentary', na=False)]

In [28]:
print(akas.columns)

Index(['titleId', 'ordering', 'title', 'region', 'language', 'types',
       'attributes', 'isOriginalTitle'],
      dtype='object')


In [29]:
# AKAs
akas = akas[akas['region'] == 'US']
akas.replace(r'\\N', np.nan, inplace=True)

In [30]:
print(ratings.columns)

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')


In [31]:
# Ratings
ratings.replace(r'\\N', np.nan, inplace=True)
ratings = ratings[ratings['tconst'].isin(akas['titleId'])]

In [32]:
print("Basics DataFrame Info:")
basics.info()

Basics DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 202549 entries, 11636 to 10255925
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          202549 non-null  object 
 1   titleType       202549 non-null  object 
 2   primaryTitle    202549 non-null  object 
 3   originalTitle   202549 non-null  object 
 4   isAdult         202549 non-null  object 
 5   startYear       202549 non-null  float64
 6   endYear         202549 non-null  object 
 7   runtimeMinutes  202549 non-null  object 
 8   genres          202549 non-null  object 
dtypes: float64(1), object(8)
memory usage: 15.5+ MB


In [33]:
print("AKAs DataFrame Info:")
akas.info()

AKAs DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1472694 entries, 5 to 37544647
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1472694 non-null  object
 1   ordering         1472694 non-null  int64 
 2   title            1472694 non-null  object
 3   region           1472694 non-null  object
 4   language         1472694 non-null  object
 5   types            1472694 non-null  object
 6   attributes       1472694 non-null  object
 7   isOriginalTitle  1472694 non-null  object
dtypes: int64(1), object(7)
memory usage: 101.1+ MB


In [34]:
print("Ratings DataFrame Info:")
ratings.info()

Ratings DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 512306 entries, 0 to 1361653
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         512306 non-null  object 
 1   averageRating  512306 non-null  float64
 2   numVotes       512306 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.6+ MB


In [35]:
os.makedirs("Data", exist_ok=True)

In [36]:
basics.to_csv("Data/basics_cleaned.csv", index=False, compression="gzip")
akas.to_csv("Data/akas_cleaned.csv", index=False, compression="gzip")
ratings.to_csv("Data/ratings_cleaned.csv", index=False, compression="gzip")