In [0]:
# Databricks notebook source
import dlt
from pyspark.sql.functions import (
    col,
    lit,
    current_timestamp,
    lower,
    upper,
    row_number,
    coalesce,
)
from pyspark.sql.window import Window

# Pipeline owner, read from the Spark conf key "pipeline.owner"; "unknown" is
# the fallback passed to conf.get. None of the table definitions visible in
# this notebook reference this value.
current_user = spark.conf.get("pipeline.owner", "unknown")

In [0]:
@dlt.table(
    name="dim_title",
    comment="Dimension for titles (movies, series, episodes, etc.)",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "gold"
    }
)
def dim_title():
    """Build the title dimension from the silver basics and episode tables.

    Steps:
      1. Left-join ``silver_title_basics`` to ``silver_title_episode`` on
         ``title_id`` so every title is kept and episode attributes are null
         where no episode row matches.
      2. Assign ``title_key`` as a row number ordered by ``title_id``.
      3. Look up each row's ``parent_title_id`` among the keyed titles to
         obtain ``parent_series_key`` (null when there is no parent match).

    Returns:
        DataFrame with columns title_key, title_id, primary_title, title_type,
        parent_series_key, season_number, episode_number, is_adult,
        start_year, end_year, runtime_minutes, load_date.
    """
    basics = dlt.read("silver_title_basics").alias("b")
    episodes = dlt.read("silver_title_episode").alias("e")

    # Join episode info (season, episode_number, parent_title_id)
    joined = (
        basics
        .join(
            episodes,
            basics.title_id == episodes.title_id,
            "left"
        )
        .select(
            col("b.title_id"),
            col("b.primary_title"),
            col("b.title_type"),
            col("e.parent_title_id"),
            col("e.season_number"),
            col("e.episode_number"),
            col("b.is_adult"),
            col("b.start_year"),
            col("b.end_year"),
            col("b.runtime_minutes"),
            col("b.load_dt"),
        )
    )

    # Add surrogate title_key.
    # NOTE(review): the window has no partitionBy, so the numbering is global;
    # confirm the single-partition sort this implies is acceptable at volume.
    w = Window.orderBy("title_id")
    with_keys = joined.withColumn(
        "title_key",
        row_number().over(w).cast("bigint")
    )

    # Self-join to resolve parent_series_key: expose every title's own
    # (title_id, title_key) under the parent_* names so it can be matched
    # against each row's parent_title_id.
    parent_map = with_keys.select(
        col("title_id").alias("parent_title_id"),
        col("title_key").alias("parent_series_key")
    )

    # The join already yields parent_series_key, so no extra projection of the
    # column onto itself is needed before the final select.
    final = (
        with_keys
        .join(parent_map, on="parent_title_id", how="left")
        .select(
            "title_key",
            "title_id",
            "primary_title",
            "title_type",
            "parent_series_key",
            "season_number",
            "episode_number",
            "is_adult",
            "start_year",
            "end_year",
            "runtime_minutes",
            col("load_dt").alias("load_date")
        )
    )

    return final

In [0]:
@dlt.table(
    name="dim_person",
    comment="Dimension for people (actors, directors, writers, etc.)",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "gold"
    }
)
def dim_person():
    """Build the person dimension from ``silver_name_basics``.

    ``person_key`` is a row number ordered by ``name_id``; ``name_id`` and
    ``primary_name`` are exposed as ``person_id`` and ``person_name``.
    ``is_current`` is true exactly when ``death_year`` is null.

    Returns:
        DataFrame with columns person_key, person_id, person_name,
        birth_year, death_year, is_current, load_date.
    """
    people = dlt.read("silver_name_basics")

    # Global (un-partitioned) numbering over the natural key.
    key_order = Window.orderBy("name_id")
    person_key_expr = row_number().over(key_order).cast("bigint")

    keyed_people = people.withColumn("person_key", person_key_expr)

    return keyed_people.select(
        "person_key",
        col("name_id").alias("person_id"),
        col("primary_name").alias("person_name"),
        "birth_year",
        "death_year",
        col("death_year").isNull().alias("is_current"),
        col("load_dt").alias("load_date"),
    )

In [0]:
@dlt.table(
    name="dim_genre",
    comment="Dimension for movie genres",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def dim_genre():
    """Build the genre dimension from the distinct genre names.

    Reads ``silver_expl_title_genre``, keeps the distinct ``genre_name``
    values, numbers them in ``genre_name`` order as ``genre_key`` (int) and
    stamps each row with the current timestamp as ``load_date``.
    """
    exploded_genres = dlt.read("silver_expl_title_genre")
    distinct_genres = exploded_genres.select(col("genre_name")).distinct()

    # Surrogate key follows the sort order of the genre names.
    name_order = Window.orderBy("genre_name")
    keyed_genres = distinct_genres.withColumn(
        "genre_key", row_number().over(name_order).cast("int")
    )

    load_date = current_timestamp().alias("load_date")
    return keyed_genres.select("genre_key", "genre_name", load_date)

In [0]:
@dlt.table(
    name="dim_profession",
    comment="Dimension for professions (actor, producer, etc.)",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def dim_profession():
    """Build the profession dimension from the distinct profession names.

    Reads ``silver_expl_person_profession``, keeps the distinct
    ``profession_name`` values, numbers them in name order as
    ``profession_key`` (int) and stamps each row with the current timestamp
    as ``load_date``.
    """
    exploded_professions = dlt.read("silver_expl_person_profession")
    distinct_professions = (
        exploded_professions.select(col("profession_name")).distinct()
    )

    # Surrogate key follows the sort order of the profession names.
    name_order = Window.orderBy("profession_name")
    keyed_professions = distinct_professions.withColumn(
        "profession_key", row_number().over(name_order).cast("int")
    )

    load_date = current_timestamp().alias("load_date")
    return keyed_professions.select("profession_key", "profession_name", load_date)

In [0]:
@dlt.table(
    name="dim_language",
    comment="Dimension for languages associated with titles",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def dim_language():
    """Build the language dimension from ``silver_title_akas``.

    Language codes are lower-cased and taken only from rows whose
    ``language`` is not null. A placeholder member with code ``unknown`` is
    appended, the set is reduced to one row per ``language_code``, and
    ``language_key`` (int) is assigned in ``language_code`` order.

    Returns:
        DataFrame with columns language_key, language_code, language_name,
        load_date (current timestamp).
    """
    akas = dlt.read("silver_title_akas")

    # Only real codes from the data; null languages are excluded here.
    with_language = akas.where(col("language").isNotNull())
    observed_languages = with_language.select(
        lower(col("language")).alias("language_code"),
        col("language_name"),
    ).distinct()

    # 'Unknown / Missing' member for rows without a usable language.
    unknown_member = spark.createDataFrame(
        [("unknown", "Unknown / Missing language")],
        ["language_code", "language_name"],
    )

    # One row per code.
    # NOTE(review): if a code appears with several language_name values, the
    # code does not pin which name survives this dedupe — confirm upstream.
    all_languages = (
        observed_languages
        .unionByName(unknown_member)
        .dropDuplicates(["language_code"])
    )

    code_order = Window.orderBy("language_code")
    keyed_languages = all_languages.withColumn(
        "language_key", row_number().over(code_order).cast("int")
    )

    return keyed_languages.select(
        "language_key",
        "language_code",
        "language_name",
        current_timestamp().alias("load_date"),
    )



In [0]:
@dlt.table(
    name="dim_region",
    comment="Dimension for regions / countries",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def dim_region():
    """Build the region dimension from ``ref_region_codes``.

    Region codes are upper-cased, a placeholder member with code ``UNKNOWN``
    is appended, the set is reduced to one row per ``region_code``, and
    ``region_key`` (int) is assigned in ``region_code`` order.

    Returns:
        DataFrame with columns region_key, region_code, region_name,
        load_date (current timestamp).
    """
    reference = dlt.read("ref_region_codes")

    known_regions = reference.select(
        upper(col("region_code")).alias("region_code"),
        col("region_name"),
    ).distinct()

    # 'Unknown / Missing' member for rows without a usable region.
    unknown_member = spark.createDataFrame(
        [("UNKNOWN", "Unknown / Missing region")],
        ["region_code", "region_name"],
    )

    # One row per code.
    # NOTE(review): if a code appears with several region_name values, the
    # code does not pin which name survives this dedupe — confirm upstream.
    all_regions = (
        known_regions
        .unionByName(unknown_member)
        .dropDuplicates(["region_code"])
    )

    code_order = Window.orderBy("region_code")
    keyed_regions = all_regions.withColumn(
        "region_key", row_number().over(code_order).cast("int")
    )

    return keyed_regions.select(
        "region_key",
        "region_code",
        "region_name",
        current_timestamp().alias("load_date"),
    )

In [0]:
@dlt.table(
    name="bridge_title_genre",
    comment="Bridge between titles and genres",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def bridge_title_genre():
    """Link titles to genres through their surrogate keys.

    Rows of ``silver_expl_title_genre`` are inner-joined to ``dim_title`` on
    ``title_id`` and to ``dim_genre`` on ``genre_name``; rows without a match
    in either dimension are dropped. The result holds one row per
    (title_key, genre_key) with the current timestamp as ``load_date``.
    """
    title_keys = dlt.read("dim_title").select("title_key", "title_id")
    genre_dim = dlt.read("dim_genre")
    title_genres = dlt.read("silver_expl_title_genre")

    # Resolve the title key first, then the genre key.
    with_title = title_genres.alias("e").join(
        title_keys.alias("t"), "title_id", "inner"
    )
    same_genre = col("e.genre_name") == col("g.genre_name")
    with_genre = with_title.join(genre_dim.alias("g"), same_genre, "inner")

    key_pairs = with_genre.select(
        col("t.title_key"),
        col("g.genre_key"),
        current_timestamp().alias("load_date"),
    )

    return key_pairs.dropDuplicates(["title_key", "genre_key"])


In [0]:
@dlt.table(
    name="bridge_person_profession",
    comment="Bridge between people and professions",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def bridge_person_profession():
    """Link people to professions through their surrogate keys.

    Rows of ``silver_expl_person_profession`` are inner-joined to
    ``dim_person`` on ``person_id`` and to ``dim_profession`` on
    ``profession_name``. The result holds one row per
    (person_key, profession_key) with ``is_primary`` and the current
    timestamp as ``load_date``.
    """
    person_keys = dlt.read("dim_person").select("person_key", "person_id")
    profession_dim = dlt.read("dim_profession")
    person_professions = dlt.read("silver_expl_person_profession")

    same_person = col("e.person_id") == col("p.person_id")
    same_profession = col("e.profession_name") == col("pr.profession_name")

    resolved = (
        person_professions.alias("e")
        .join(person_keys.alias("p"), same_person, "inner")
        .join(profession_dim.alias("pr"), same_profession, "inner")
    )

    key_pairs = resolved.select(
        col("p.person_key"),
        col("pr.profession_key"),
        col("e.is_primary"),
        current_timestamp().alias("load_date"),
    )

    # NOTE(review): if a (person, profession) pair occurs more than once with
    # different is_primary values, the surviving value is whichever row the
    # dedupe keeps — presumably the silver table is already unique; verify.
    return key_pairs.dropDuplicates(["person_key", "profession_key"])

In [0]:
@dlt.table(
    name="bridge_title_person",
    comment="Bridge between titles and people (cast & crew)",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def bridge_title_person():
    """Link titles to people (cast and crew) through their surrogate keys.

    Three silver sources are shaped to a common column set
    (title_id, name_id, job_title, character_name, ordering, load_dt) and
    unioned by name: principals, exploded directors and exploded writers.
    The union is inner-joined to ``dim_title`` and ``dim_person`` and reduced
    to distinct (title_key, person_key, job_title, character_name, ordering).

    Returns:
        DataFrame with columns title_key, person_key, job_title,
        character_name, ordering, load_date.
    """
    # Surrogate-key lookups from the dimensions.
    title_keys = dlt.read("dim_title").select("title_key", "title_id")
    person_keys = dlt.read("dim_person").select("person_key", "person_id")

    # The role always comes from the source's `category` column.
    role = col("category").alias("job_title")

    # 1) Main principals (cast & other crew); `characters` is renamed so the
    #    three sources line up.
    principal_credits = dlt.read("silver_title_principals").select(
        "title_id",
        "name_id",
        role,
        col("characters").alias("character_name"),
        "ordering",
        "load_dt",
    )

    # 2) Exploded directors (already expose character_name).
    director_credits = dlt.read("silver_expl_title_director").select(
        "title_id", "name_id", role, "character_name", "ordering", "load_dt"
    )

    # 3) Exploded writers (already expose character_name).
    writer_credits = dlt.read("silver_expl_title_writer").select(
        "title_id", "name_id", role, "character_name", "ordering", "load_dt"
    )

    all_credits = (
        principal_credits
        .unionByName(director_credits)
        .unionByName(writer_credits)
    )

    # Swap natural ids for surrogate keys; credits without a matching title
    # or person are dropped by the inner joins.
    same_person = col("u.name_id") == col("p.person_id")
    resolved = (
        all_credits.alias("u")
        .join(title_keys.alias("t"), "title_id", "inner")
        .join(person_keys.alias("p"), same_person, "inner")
    )

    credits = resolved.select(
        col("t.title_key"),
        col("p.person_key"),
        "job_title",
        "character_name",
        "ordering",
        col("u.load_dt").alias("load_date"),
    )

    return credits.dropDuplicates(
        ["title_key", "person_key", "job_title", "character_name", "ordering"]
    )

In [0]:
@dlt.table(
    name="bridge_title_language",
    comment="Bridge between titles and languages",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def bridge_title_language():
    """Link titles to languages through their surrogate keys.

    Every row of ``silver_title_akas`` that matches a title in ``dim_title``
    is kept. Its language is resolved against ``dim_language`` by lower-cased
    code; rows with a null or unmatched language fall back to the key of the
    ``unknown`` member.

    The result has one row per (title_key, language_key). Because a title can
    have several aka rows in the same language, the remaining attributes are
    aggregated rather than taken from an arbitrary row:
      * ``is_original_language`` is the maximum ``is_original_title`` across
        the pair's rows (i.e. set if any of them is flagged),
      * ``load_date`` is the latest ``load_dt`` across the pair's rows.

    Returns:
        DataFrame with columns title_key, language_key,
        is_original_language, load_date.
    """
    # Imported locally and aliased so the builtin max is not shadowed.
    from pyspark.sql.functions import max as spark_max

    titles = dlt.read("dim_title").select("title_key", "title_id")
    langs  = dlt.read("dim_language")
    akas   = dlt.read("silver_title_akas")

    # Get the 'unknown' language_key once
    unknown_lang_key_df = (
        langs
        .filter(col("language_code") == "unknown")
        .select(col("language_key").alias("unknown_language_key"))
    )

    joined = (
        akas.alias("a")
        # keep ALL rows, including language NULL
        .join(titles.alias("t"), "title_id", "inner")
        .join(
            langs.alias("l"),
            lower(col("a.language")) == col("l.language_code"),
            "left"       # allow non-matching / null languages
        )
        # Single-row frame: attaches the fallback key to every row.
        .crossJoin(unknown_lang_key_df)
    )

    resolved = joined.select(
        col("t.title_key").alias("title_key"),
        coalesce(col("l.language_key"),
                 col("unknown_language_key")).alias("language_key"),
        col("a.is_original_title").alias("is_original_language"),
        col("a.load_dt").alias("load_date")
    )

    # Deterministic collapse to one row per pair; dropDuplicates on the key
    # subset would keep whichever aka row happened to be seen first.
    final = (
        resolved
        .groupBy("title_key", "language_key")
        .agg(
            spark_max("is_original_language").alias("is_original_language"),
            spark_max("load_date").alias("load_date")
        )
    )

    return final

In [0]:
@dlt.table(
    name="bridge_title_region",
    comment="Bridge between titles and regions",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def bridge_title_region():
    """Link titles to regions, one row per matching aka row.

    Every row of ``silver_title_akas`` that matches a title in ``dim_title``
    is kept. Its region is resolved against ``dim_region`` by upper-cased
    code; rows with a null or unmatched region fall back to the key of the
    ``UNKNOWN`` member. No deduplication is applied.

    Returns:
        DataFrame with columns title_key, region_key, regional_title,
        is_original_title, load_date.
    """
    title_keys = dlt.read("dim_title").select("title_key", "title_id")
    region_dim = dlt.read("dim_region")
    aka_rows = dlt.read("silver_title_akas")

    # Key of the 'UNKNOWN' member, used as the fallback below.
    is_unknown_member = col("region_code") == "UNKNOWN"
    fallback_key = region_dim.filter(is_unknown_member).select(
        col("region_key").alias("unknown_region_key")
    )

    # Keep ALL aka rows of known titles, including those with a null region.
    akas_with_title = aka_rows.alias("a").join(
        title_keys.alias("t"), "title_id", "inner"
    )

    # Left join so non-matching / null regions survive with a null key.
    same_region = upper(col("a.region")) == col("r.region_code")
    akas_with_region = akas_with_title.join(
        region_dim.alias("r"), same_region, "left"
    )

    # Attach the fallback key to every row.
    resolved = akas_with_region.crossJoin(fallback_key)

    region_key = coalesce(col("r.region_key"), col("unknown_region_key"))

    return resolved.select(
        col("t.title_key"),
        region_key.alias("region_key"),
        col("a.aka_title").alias("regional_title"),
        col("a.is_original_title"),
        col("a.load_dt").alias("load_date"),
    )

In [0]:
@dlt.table(
    name="fact_title_ratings",
    comment="Fact table for IMDb title ratings",
    table_properties={"delta.columnMapping.mode": "name", "quality": "gold"}
)
def fact_title_ratings():
    """Build the ratings fact keyed by the title dimension.

    Rows of ``silver_title_ratings`` are inner-joined to ``dim_title`` on
    ``title_id`` (ratings without a matching title are dropped) and numbered
    in ``title_key`` order to produce ``rating_key``.

    Returns:
        DataFrame with columns rating_key, title_key, average_rating,
        num_votes, load_date.
    """
    title_keys = dlt.read("dim_title").select("title_key", "title_id")
    ratings = dlt.read("silver_title_ratings")

    rated_titles = ratings.alias("r").join(
        title_keys.alias("t"), "title_id", "inner"
    )

    # Global (un-partitioned) numbering over the title surrogate key.
    key_order = Window.orderBy("t.title_key")
    rating_key_expr = row_number().over(key_order).cast("bigint")
    keyed_ratings = rated_titles.withColumn("rating_key", rating_key_expr)

    return keyed_ratings.select(
        "rating_key",
        col("t.title_key"),
        col("r.average_rating"),
        col("r.num_votes"),
        col("r.load_dt").alias("load_date"),
    )