In [0]:
import dlt
from pyspark.sql.functions import (
    col, when, trim, split, year, current_timestamp,
    lit, regexp_replace, regexp_extract
)

In [0]:
# ============================
# SILVER - NAME BASICS
# ============================

@dlt.table(
    name="silver_name_basics",
    comment="Cleaned name.basics data (persons) from IMDb",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver"
    }
)
def silver_name_basics():
    r"""Build the silver person table from ``raw_name_basics``.

    Processing steps:
      1. Trim ``nconst`` and the text columns; '\N' and blank values become null
         (this now includes ``primaryName`` and whitespace-padded markers).
      2. Cast birth/death year to int and null out values outside
         ``[1850, current year + 1]``. A death year earlier than the birth
         year is nulled as well.
      3. Split the comma-separated profession / known-for strings into arrays.
      4. Drop rows whose ``nconst`` is null, empty or '\N'.

    Returns:
        DataFrame with columns ``name_id``, ``primary_name``, ``birth_year``,
        ``death_year``, ``primary_profession_raw``, ``primary_profession_array``,
        ``known_for_titles_raw``, ``known_for_titles_array``, ``load_dt`` and
        ``source_file_name``. The last two are selected as-is and therefore
        must already exist on the raw table.
    """
    df = dlt.read("raw_name_basics")

    # Bounds for the year sanity checks.
    # NOTE(review): the 1850 floor nulls the birth/death year of anyone born
    # or deceased earlier - confirm this is intended.
    min_year = lit(1850)
    max_year = year(current_timestamp()) + lit(1)

    def null_if_N(cname):
        r"""Return column `cname`, or null when its trimmed value is '\N' or empty."""
        # Compare on the trimmed value so padded markers (" \N", "  ") are
        # recognised too; the untouched column is returned otherwise.
        trimmed = trim(col(cname))
        return when((trimmed == "\\N") | (trimmed == ""), None).otherwise(col(cname))

    def year_in_range(cname):
        """Return column `cname` when it lies within [min_year, max_year], else null."""
        return when(
            (col(cname) >= min_year) & (col(cname) <= max_year),
            col(cname)
        ).otherwise(None)

    # --- Clean base columns ---
    df_clean = (
        df
        # nconst: trimmed here, invalid keys are filtered out further down
        .withColumn("nconst", trim(col("nconst")))
        # primaryName may also carry the '\N' marker -> null instead of literal text
        .withColumn("primary_name", trim(null_if_N("primaryName")))

        # Year columns: '\N' -> null, then cast to int
        .withColumn("birth_year_raw", null_if_N("birthYear").cast("int"))
        .withColumn("death_year_raw", null_if_N("deathYear").cast("int"))

        # Profession / known-for raw strings (null instead of '\N')
        .withColumn("primary_profession_raw", trim(null_if_N("primaryProfession")))
        .withColumn("known_for_titles_raw", trim(null_if_N("knownForTitles")))
    )

    # --- Validate year ranges ---
    df_years = (
        df_clean
        .withColumn("birth_year", year_in_range("birth_year_raw"))
        .withColumn("death_year_tmp", year_in_range("death_year_raw"))
        # Death year must not be before birth year
        .withColumn(
            "death_year",
            when(
                col("death_year_tmp").isNotNull()
                & col("birth_year").isNotNull()
                & (col("death_year_tmp") < col("birth_year")),
                None
            ).otherwise(col("death_year_tmp"))
        )
    )

    # --- Parse arrays (null input stays null rather than becoming an array) ---
    df_arrays = (
        df_years
        .withColumn(
            "primary_profession_array",
            when(
                col("primary_profession_raw").isNotNull(),
                split(col("primary_profession_raw"), r"\s*,\s*")
            ).otherwise(None)
        )
        .withColumn(
            "known_for_titles_array",
            when(
                col("known_for_titles_raw").isNotNull(),
                split(col("known_for_titles_raw"), r"\s*,\s*")
            ).otherwise(None)
        )
    )

    # --- Filter invalid PKs (drop rows with bad nconst) ---
    df_filtered = df_arrays.filter(
        (col("nconst").isNotNull()) &
        (col("nconst") != "") &
        (col("nconst") != "\\N")
    )

    # --- Final column selection ---
    return df_filtered.select(
        col("nconst").alias("name_id"),
        "primary_name",
        "birth_year",
        "death_year",
        "primary_profession_raw",
        "primary_profession_array",
        "known_for_titles_raw",
        "known_for_titles_array",
        "load_dt",
        "source_file_name"
    )

In [0]:
# Helper to normalize '\N' to null
def null_if_N(cname):
    r"""Return a Column that is null when column `cname` holds a missing-value marker.

    A value counts as missing when, after trimming surrounding whitespace, it
    equals '\N' or the empty string. Any other value (including null) is
    returned unchanged, i.e. NOT trimmed, so callers that want trimmed text
    still wrap the result in ``trim(...)``.

    Args:
        cname: Name of the source column.

    Returns:
        A PySpark Column expression.
    """
    # Compare on the trimmed value: callers apply trim() *after* this helper,
    # so a padded marker such as " \N" or "  " would otherwise slip through
    # and end up as the literal string "\N" / "" in the output.
    trimmed = trim(col(cname))
    return when((trimmed == "\\N") | (trimmed == ""), None).otherwise(col(cname))

# ============================
# SILVER - TITLE BASICS
# ============================

@dlt.table(
    name="silver_title_basics",
    comment="Cleaned title.basics data for building dim_title",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver"
    }
)
def silver_title_basics():
    r"""Build the silver title table from ``raw_title_basics``.

    Processing steps:
      * Trim the key and text columns; '\N' / blank text becomes null
        (title type and both title columns are now normalised like every
        other nullable column).
      * ``isAdult``: 1 -> True, 0 -> False, anything else -> null.
      * ``startYear`` must lie in ``[1850, current year + 1]`` and ``endYear``
        in ``[1850, current year + 10]``; other values become null.
      * ``runtimeMinutes`` is cast to int; negative values become null.
      * ``genres`` is trimmed and split on commas into an array (null stays null).
      * Rows whose ``tconst`` is null, empty or '\N' are dropped.

    Returns:
        DataFrame with columns ``title_id``, ``title_type``, ``primary_title``,
        ``original_title``, ``is_adult``, ``start_year``, ``end_year``,
        ``runtime_minutes``, ``genres``, ``load_dt`` and ``source_file_name``.
        The last two are selected as-is from the raw table.
    """
    df = dlt.read("raw_title_basics")

    current_year = year(current_timestamp())

    df_clean = (
        df
        # Rename & trim; '\N' markers become null instead of literal text
        .withColumn("tconst", trim(col("tconst")))
        .withColumn("title_type", trim(null_if_N("titleType")))
        .withColumn("primary_title", trim(null_if_N("primaryTitle")))
        .withColumn("original_title", trim(null_if_N("originalTitle")))

        # isAdult: '\N' -> null, then map 1/0 to boolean (other values -> null)
        .withColumn("is_adult_int", null_if_N("isAdult").cast("int"))
        .withColumn(
            "is_adult",
            when(col("is_adult_int") == 1, lit(True))
            .when(col("is_adult_int") == 0, lit(False))
            .otherwise(None)
        )

        # Years: '\N' -> null, cast to int, basic range sanity
        .withColumn("start_year_raw", null_if_N("startYear").cast("int"))
        .withColumn("end_year_raw", null_if_N("endYear").cast("int"))
        .withColumn(
            "start_year",
            when(
                (col("start_year_raw") >= lit(1850)) &
                (col("start_year_raw") <= current_year + lit(1)),
                col("start_year_raw")
            ).otherwise(None)
        )
        # End year is allowed a wider look-ahead window than start year
        .withColumn(
            "end_year",
            when(
                (col("end_year_raw") >= lit(1850)) &
                (col("end_year_raw") <= current_year + lit(10)),
                col("end_year_raw")
            ).otherwise(None)
        )

        # runtimeMinutes: '\N' -> null, cast to int, non-negative
        .withColumn("runtime_minutes_raw", null_if_N("runtimeMinutes").cast("int"))
        .withColumn(
            "runtime_minutes",
            when(col("runtime_minutes_raw") >= 0, col("runtime_minutes_raw")).otherwise(None)
        )

        # genres: '\N' -> null, trim, then split to array.
        # The string is kept in genres_str because "genres" itself is
        # overwritten with the array version.
        .withColumn("genres_str", trim(null_if_N("genres")))
        .withColumn(
            "genres",
            when(col("genres_str").isNotNull(),
                 split(col("genres_str"), r"\s*,\s*")
            ).otherwise(None)
        )
    )

    # Drop any rows with missing tconst (should be none, but defensive)
    df_filtered = df_clean.filter(
        (col("tconst").isNotNull()) &
        (col("tconst") != "") &
        (col("tconst") != "\\N")
    )

    # Final minimal projection
    return df_filtered.select(
        col("tconst").alias("title_id"),
        "title_type",
        "primary_title",
        "original_title",
        "is_adult",
        "start_year",
        "end_year",
        "runtime_minutes",
        "genres",
        "load_dt",
        "source_file_name"
    )

In [0]:
# ============================
# SILVER - TITLE EPISODE
# ============================

@dlt.table(
    name="silver_title_episode",
    comment="Cleaned title.episode data for enriching dim_title with series/episode info",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver"
    }
)
def silver_title_episode():
    r"""Build the silver episode table from ``raw_title_episode``.

    The episode key is trimmed, the parent key is sentinel-normalised
    ('\N' / empty -> null) and trimmed, and season / episode numbers are
    sentinel-normalised and cast to int. Rows whose ``tconst`` is null,
    empty or '\N' are dropped.

    Returns:
        DataFrame with columns ``title_id``, ``parent_title_id``,
        ``season_number``, ``episode_number``, ``load_dt`` and
        ``source_file_name`` (the last two selected as-is from the raw table).
    """
    episodes_raw = dlt.read("raw_title_episode")

    # Column expressions, named up front so the chain below reads as a recipe.
    episode_key = trim(col("tconst"))
    parent_key = trim(null_if_N("parentTconst"))
    season_no = null_if_N("seasonNumber").cast("int")
    episode_no = null_if_N("episodeNumber").cast("int")

    episodes_clean = (
        episodes_raw
        .withColumn("tconst", episode_key)
        .withColumn("parent_tconst", parent_key)
        .withColumn("season_number", season_no)
        .withColumn("episode_number", episode_no)
    )

    # tconst is mandatory: keep only rows with a usable key.
    has_title_key = (
        col("tconst").isNotNull()
        & (col("tconst") != "")
        & (col("tconst") != "\\N")
    )
    episodes_valid = episodes_clean.where(has_title_key)

    return episodes_valid.select(
        col("tconst").alias("title_id"),
        col("parent_tconst").alias("parent_title_id"),
        col("season_number"),
        col("episode_number"),
        col("load_dt"),
        col("source_file_name"),
    )

In [0]:
# ============================
# SILVER - TITLE AKAS
# ============================

@dlt.table(
    name="silver_title_akas",
    comment="Cleaned title.akas data for regions and languages per title",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver"
    }
)
def silver_title_akas():
    r"""Build the silver alternative-titles table from ``raw_title_akas``.

    Processing steps:
      * ``titleId`` is trimmed and exposed as ``title_id``.
      * ``title``, ``region`` and ``language`` are trimmed, with '\N' / blank
        values turned into null (``title`` is now normalised like its siblings).
      * ``ordering`` is cast to int.
      * ``isOriginalTitle``: 1 -> True, 0 -> False, anything else -> null.
      * Rows whose title key is null, empty or '\N' are dropped.

    Returns:
        DataFrame with columns ``title_id``, ``ordering``, ``aka_title``,
        ``region``, ``language``, ``is_original_title``, ``load_dt`` and
        ``source_file_name`` (the last two selected as-is from the raw table).
    """
    df = dlt.read("raw_title_akas")

    df_clean = (
        df
        # rename + trim
        .withColumn("tconst", trim(col("titleId")))
        # '\N' in the title column becomes null instead of a literal string
        .withColumn("aka_title", trim(null_if_N("title")))
        .withColumn("ordering_int", null_if_N("ordering").cast("int"))

        # region & language: '\N' -> null
        .withColumn("region", trim(null_if_N("region")))
        .withColumn("language", trim(null_if_N("language")))

        # isOriginalTitle: '\N' or null allowed -> boolean
        .withColumn("is_original_title_int", null_if_N("isOriginalTitle").cast("int"))
        .withColumn(
            "is_original_title",
            when(col("is_original_title_int") == 1, lit(True))
            .when(col("is_original_title_int") == 0, lit(False))
            .otherwise(None)
        )
    )

    # Drop rows with missing tconst defensively
    df_filtered = df_clean.filter(
        (col("tconst").isNotNull()) &
        (col("tconst") != "") &
        (col("tconst") != "\\N")
    )

    return df_filtered.select(
        col("tconst").alias("title_id"),
        col("ordering_int").alias("ordering"),
        "aka_title",
        "region",
        "language",
        "is_original_title",
        "load_dt",
        "source_file_name"
    )

In [0]:
# ============================
# SILVER - TITLE PRINCIPALS
# ============================

@dlt.table(
    name="silver_title_principals",
    comment="Cleaned title.principals data for cast/crew, jobs, and characters per title",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver"
    }
)
def silver_title_principals():
    r"""Build the silver principals table from ``raw_title_principals``.

    Both keys are trimmed, ``ordering`` is cast to int, ``category`` and
    ``job`` are sentinel-normalised ('\N' / empty -> null) and trimmed, and
    ``characters`` has its square brackets and double quotes removed.
    Rows where either ``tconst`` or ``nconst`` is null, empty or '\N'
    are dropped.

    Returns:
        DataFrame with columns ``title_id``, ``name_id``, ``ordering``,
        ``category``, ``job``, ``characters``, ``load_dt`` and
        ``source_file_name`` (the last two selected as-is from the raw table).
    """
    # (output column, expression) pairs, applied strictly in this order:
    # characters_clean reads characters_raw, which is added just before it.
    derived_columns = [
        ("tconst", trim(col("tconst"))),
        ("nconst", trim(col("nconst"))),
        ("ordering_int", null_if_N("ordering").cast("int")),
        ("category", trim(null_if_N("category"))),
        ("job", trim(null_if_N("job"))),
        ("characters_raw", null_if_N("characters")),
        (
            "characters_clean",
            # No match -> null, so a null characters_raw stays null.
            when(
                col("characters_raw").isNotNull(),
                regexp_replace(col("characters_raw"), r'[\[\]\"]', ""),
            ),
        ),
    ]

    principals_clean = dlt.read("raw_title_principals")
    for column_name, expression in derived_columns:
        principals_clean = principals_clean.withColumn(column_name, expression)

    def key_is_usable(key_name):
        """Predicate: the key column is neither null, empty, nor the '\\N' marker."""
        key = col(key_name)
        return key.isNotNull() & (key != "") & (key != "\\N")

    # A relationship row needs both of its keys to be valid.
    principals_valid = principals_clean.where(
        key_is_usable("tconst") & key_is_usable("nconst")
    )

    return principals_valid.select(
        col("tconst").alias("title_id"),
        col("nconst").alias("name_id"),
        col("ordering_int").alias("ordering"),
        col("category"),
        col("job"),
        col("characters_clean").alias("characters"),
        col("load_dt"),
        col("source_file_name"),
    )

In [0]:
# ============================
# SILVER - TITLE CREW
# ============================

@dlt.table(
    name="silver_title_crew",
    comment="Cleaned title.crew data for directors and writers per title",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver"
    }
)
def silver_title_crew():
    r"""Build the silver crew table from ``raw_title_crew``.

    ``directors`` and ``writers`` are sentinel-normalised ('\N' / empty ->
    null) and trimmed into ``*_raw`` string columns, then split on commas
    into ``*_array`` columns (a null string stays null). Rows are not
    exploded. Rows whose ``tconst`` is null, empty or '\N' are dropped.

    Returns:
        DataFrame with columns ``title_id``, ``directors_raw``,
        ``directors_array``, ``writers_raw``, ``writers_array``, ``load_dt``
        and ``source_file_name`` (the last two selected as-is from the raw table).
    """

    def split_id_list(raw_column):
        """Split a comma-separated string column into an array; null stays null."""
        source = col(raw_column)
        # when() without otherwise() evaluates to null when the condition fails.
        return when(source.isNotNull(), split(source, r"\s*,\s*"))

    crew_raw = dlt.read("raw_title_crew")

    # Step 1: normalised key and raw list strings.
    crew_strings = (
        crew_raw
        .withColumn("tconst", trim(col("tconst")))
        .withColumn("directors_raw", trim(null_if_N("directors")))
        .withColumn("writers_raw", trim(null_if_N("writers")))
    )

    # Step 2: array versions of the list strings.
    crew_arrays = (
        crew_strings
        .withColumn("directors_array", split_id_list("directors_raw"))
        .withColumn("writers_array", split_id_list("writers_raw"))
    )

    # Step 3: defensive key filter.
    title_key = col("tconst")
    crew_valid = crew_arrays.where(
        title_key.isNotNull() & (title_key != "") & (title_key != "\\N")
    )

    output_columns = [
        title_key.alias("title_id"),
        col("directors_raw"),
        col("directors_array"),
        col("writers_raw"),
        col("writers_array"),
        col("load_dt"),
        col("source_file_name"),
    ]
    return crew_valid.select(*output_columns)

In [0]:
# ============================
# SILVER - TITLE RATINGS
# ============================

@dlt.table(
    name="silver_title_ratings",
    comment="Cleaned title.ratings data for fact_title_ratings",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver"
    }
)
def silver_title_ratings():
    r"""Build the silver ratings table from ``raw_title_ratings``.

    The title key is trimmed; ``averageRating`` is sentinel-normalised
    ('\N' / empty -> null) and cast to double, ``numVotes`` likewise cast
    to int. Rows whose ``tconst`` is null, empty or '\N' are dropped.

    Returns:
        DataFrame with columns ``title_id``, ``average_rating``,
        ``num_votes``, ``load_dt`` and ``source_file_name`` (the last two
        selected as-is from the raw table).
    """
    ratings_raw = dlt.read("raw_title_ratings")

    # Typed / normalised column expressions.
    title_key = trim(col("tconst"))
    rating_value = null_if_N("averageRating").cast("double")
    vote_count = null_if_N("numVotes").cast("int")

    ratings_typed = (
        ratings_raw
        .withColumn("tconst", title_key)
        .withColumn("average_rating", rating_value)
        .withColumn("num_votes", vote_count)
    )

    # Keep only rows with a usable title key.
    key_is_usable = (
        col("tconst").isNotNull()
        & (col("tconst") != "")
        & (col("tconst") != "\\N")
    )

    return ratings_typed.where(key_is_usable).select(
        col("tconst").alias("title_id"),
        col("average_rating"),
        col("num_votes"),
        col("load_dt"),
        col("source_file_name"),
    )