In [1]:
import polars as pl

In [None]:
import kagglehub

import configparser
# NOTE(review): this parser is created empty and nothing in the visible cells
# reads a file into it or looks a value up — confirm whether it is still needed.
config = configparser.ConfigParser()

# Download the latest version of the dataset and keep the directory it was
# saved to; the cells below build their CSV paths from `dataset_path`.
dataset_path = kagglehub.dataset_download("rishabjadhav/imdb-actors-and-movies")

print("Path to dataset files:", dataset_path)
# The former `print(config.KAGGLE_API_TOKEN)` was removed: ConfigParser exposes
# values via sections (config["section"]["key"]), not attributes, so that line
# raised AttributeError — and an API token must never be echoed into notebook
# output, where it would be saved and shared with the file.

In [2]:
# dataset_path = "/Users/benhoskings/.cache/kagglehub/datasets/rishabjadhav/imdb-actors-and-movies/versions/1"

In [3]:
import re
import polars as pl

def to_snake_case(name: str) -> str:
    """Convert a column header to ``snake_case``.

    Handles camelCase / PascalCase (``knownForTitles`` -> ``known_for_titles``),
    space- or hyphen-separated words (``Start Year`` -> ``start_year``) and
    drops any other punctuation (``Runtime (minutes)`` -> ``runtime_minutes``).

    Args:
        name: The original column name.

    Returns:
        The lower-cased name with words separated by single underscores.
    """
    # Drop punctuation but keep hyphens: they are word separators and are
    # turned into underscores by the next step.
    name = re.sub(r"[^\w\s\-]", "", name.strip())
    name = re.sub(r"[\s\-]+", "_", name)         # spaces & hyphens → _
    # Split camelCase / PascalCase word boundaries.
    name = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)
    # The camelCase pass also fires right after an existing "_" (e.g.
    # "Start_Year" -> "Start__Year"), so collapse any run of underscores.
    name = re.sub(r"_+", "_", name)
    return name.lower()

In [4]:
def _read_imdb_table(dataset_dir: str, filename: str) -> pl.DataFrame:
    """Read one CSV from the dataset directory and snake_case its columns.

    Args:
        dataset_dir: Directory containing the downloaded dataset files.
        filename: Name of the CSV file inside ``dataset_dir``.

    Returns:
        The loaded frame with every column renamed via ``to_snake_case``.
    """
    table = pl.read_csv(
        dataset_dir + "/" + filename,
        null_values=r"\N",      # the literal two-character string \N is read as null
        # NOTE(review): best-effort parsing — rows/values that fail to parse do
        # not abort the read. Confirm this silent tolerance is intended.
        ignore_errors=True,
    )
    return table.rename({col: to_snake_case(col) for col in table.columns})


# Both tables are loaded the same way, so they share one helper.
actor_df = _read_imdb_table(dataset_path, "names.csv")
film_df = _read_imdb_table(dataset_path, "titles.csv")

In [5]:
# Preview the first rows of the titles table to check the renamed columns
# and the dtypes that were inferred while reading the CSV.
film_df.head()

tconst,title_type,primary_title,original_title,is_adult,start_year,end_year,runtime_minutes,genres
str,str,str,str,i64,i64,str,i64,str
"""tt0000001""","""short""","""Carmencita""","""Carmencita""",0,1894,,1,"""Documentary,Short"""
"""tt0000002""","""short""","""Le clown et ses chiens""","""Le clown et ses chiens""",0,1892,,5,"""Animation,Short"""
"""tt0000003""","""short""","""Pauvre Pierrot""","""Pauvre Pierrot""",0,1892,,4,"""Animation,Comedy,Romance"""
"""tt0000004""","""short""","""Un bon bock""","""Un bon bock""",0,1892,,12,"""Animation,Short"""
"""tt0000005""","""short""","""Blacksmith Scene""","""Blacksmith Scene""",0,1893,,1,"""Comedy,Short"""


In [9]:
# Film lookup table: the distinct (film_id, title) pairs for every row of
# `film_df` whose primary title is present.
films = (
    film_df
    .filter(pl.col("primary_title").is_not_null())
    .select(["tconst", "primary_title"])
    .rename({"tconst": "film_id", "primary_title": "title"})
    .unique()
)

# Persist the lookup table as zstd-compressed parquet in the working directory.
films.write_parquet("film_dataset.parquet", compression="zstd")

In [7]:
# Film ↔ actor link table. Each actor row carries a comma-separated list in
# `known_for_titles`; it is split and exploded to one row per (actor, title)
# and then matched against the title ids present in `film_df`.
film_actors = (
    film_df
    .select(["tconst"])
    .join(
        actor_df.with_columns(
            pl.col("known_for_titles").str.split(",")
        ).explode("known_for_titles"),
        left_on="tconst",
        right_on="known_for_titles",
    )
    .select(["tconst", "primary_name"])
    .rename({"tconst": "film_id", "primary_name": "actor_name"})
)

# Persist the link table as zstd-compressed parquet in the working directory.
film_actors.write_parquet("film_actors.parquet", compression="zstd")