In [0]:
import dlt
from pyspark.sql.functions import (
    col, explode, array_position, when, trim, split, year, current_timestamp,
    lit, regexp_replace, regexp_extract, upper, lower, posexplode
)


In [0]:
@dlt.table(
    name="silver_expl_title_genre",
    comment="One row per title and genre (exploded from silver_title_basics)",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver_exploded"
    }
)
def silver_expl_title_genre():
    """Flatten the ``genres`` array of ``silver_title_basics`` into one row per genre.

    Returns:
        A DataFrame with ``title_id``, ``genre_name`` (one exploded element of
        ``genres``) and the audit columns ``load_dt``, ``source_file_name``
        and ``loaded_by`` carried over from the source row.
    """
    audit_columns = ["load_dt", "source_file_name", "loaded_by"]
    titles = dlt.read("silver_title_basics")

    # genres is already an array in silver_title_basics, so it can be
    # exploded directly and named in the same projection.
    return titles.select(
        "title_id",
        explode("genres").alias("genre_name"),
        *audit_columns,
    )


In [0]:
@dlt.table(
    name="silver_expl_person_profession",
    comment="One row per person and profession (primary_profession_array exploded)",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver_exploded"
    }
)
def silver_expl_person_profession():
    """Explode ``primary_profession_array`` into one row per (person, profession).

    Reads ``silver_name_basics`` and emits ``person_id`` (renamed from
    ``name_id``), ``profession_name``, ``is_primary`` and the audit columns
    ``load_dt``, ``source_file_name`` and ``loaded_by``.

    ``is_primary`` is true only for the element at the first position of the
    array. It is derived from the position reported by ``posexplode`` rather
    than from a value lookup with ``array_position``: a value lookup flags
    every repeated occurrence of the first profession as primary and yields
    NULL for NULL elements, whereas the position is unambiguous.

    Returns:
        A DataFrame with one row per array element; rows whose array is NULL
        or empty produce no output, as with ``explode``.
    """
    df = dlt.read("silver_name_basics")

    # posexplode gives: pos (0-based), element
    df_exploded = df.select(
        "name_id",
        posexplode("primary_profession_array").alias("profession_pos", "profession_name"),
        "load_dt",
        "source_file_name",
        "loaded_by",
    )

    return df_exploded.select(
        col("name_id").alias("person_id"),
        "profession_name",
        # primary == the first element of the array (position 0)
        (col("profession_pos") == 0).alias("is_primary"),
        "load_dt",
        "source_file_name",
        "loaded_by"
    )


In [0]:
@dlt.table(
    name="silver_expl_title_director",
    comment="One row per (title, director) exploded from directors_array",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver_exploded"
    }
)
def silver_expl_title_director():
    """Explode ``directors_array`` of ``silver_title_crew`` into one row per director.

    Returns:
        A DataFrame with ``title_id``, ``name_id``, a 1-based ``ordering``
        (the element's position in the array), the constant ``job_title`` and
        ``category`` value ``"director"``, a NULL string ``character_name``,
        and the audit columns ``load_dt``, ``source_file_name`` and
        ``loaded_by``.
    """
    audit_columns = ["load_dt", "source_file_name", "loaded_by"]

    # Only titles that actually carry a directors array are exploded.
    crew_with_directors = (
        dlt.read("silver_title_crew")
        .select("title_id", "directors_array", *audit_columns)
        .filter(col("directors_array").isNotNull())
    )

    # posexplode yields the 0-based position alongside each element.
    directors = crew_with_directors.select(
        "title_id",
        posexplode("directors_array").alias("ordering_zero", "name_id"),
        *audit_columns,
    )

    return directors.select(
        "title_id",
        "name_id",
        (col("ordering_zero") + 1).alias("ordering"),  # shift to 1,2,3...
        lit("director").alias("job_title"),
        lit("director").alias("category"),
        lit(None).cast("string").alias("character_name"),
        *audit_columns,
    )

In [0]:
@dlt.table(
    name="silver_expl_title_writer",
    comment="One row per (title, writer) exploded from writers_array",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver_exploded"
    }
)
def silver_expl_title_writer():
    """Explode ``writers_array`` of ``silver_title_crew`` into one row per writer.

    Returns:
        A DataFrame with ``title_id``, ``name_id``, a 1-based ``ordering``
        (the element's position in the array), the constant ``job_title`` and
        ``category`` value ``"writer"``, a NULL string ``character_name``,
        and the audit columns ``load_dt``, ``source_file_name`` and
        ``loaded_by``.
    """
    audit_columns = ["load_dt", "source_file_name", "loaded_by"]

    # Only titles that actually carry a writers array are exploded.
    crew_with_writers = (
        dlt.read("silver_title_crew")
        .select("title_id", "writers_array", *audit_columns)
        .filter(col("writers_array").isNotNull())
    )

    # posexplode yields the 0-based position alongside each element.
    writers = crew_with_writers.select(
        "title_id",
        posexplode("writers_array").alias("ordering_zero", "name_id"),
        *audit_columns,
    )

    return writers.select(
        "title_id",
        "name_id",
        (col("ordering_zero") + 1).alias("ordering"),  # shift to 1,2,3...
        lit("writer").alias("job_title"),
        lit("writer").alias("category"),
        lit(None).cast("string").alias("character_name"),
        *audit_columns,
    )


In [0]:
# Not required by any current business requirement; exploded proactively in case we decide to add more metrics later.
@dlt.table(
    name="silver_expl_person_known_for",
    comment="One row per (person, known_for_title) exploded from known_for_titles_array in silver_name_basics",
    table_properties={
        "delta.columnMapping.mode": "name",
        "quality": "silver_exploded"
    }
)
def silver_expl_person_known_for():
    """Explode ``known_for_titles_array`` of ``silver_name_basics`` into one row per title.

    Returns:
        A DataFrame with ``name_id``, ``known_for_title_id`` (one exploded
        element of ``known_for_titles_array``) and the audit columns
        ``load_dt``, ``source_file_name`` and ``loaded_by``.
    """
    people = dlt.read("silver_name_basics")

    # Rows without a known-for array are filtered out before exploding.
    has_known_for = col("known_for_titles_array").isNotNull()
    known_for_title = explode(col("known_for_titles_array")).alias("known_for_title_id")

    return people.filter(has_known_for).select(
        col("name_id"),
        known_for_title,
        "load_dt",
        "source_file_name",
        "loaded_by",
    )
