In [0]:
from pyspark.sql.functions import col, to_date
from delta.tables import DeltaTable

# ---------- CONFIG ----------
# Glob over three directory levels under the bucket; picks up every *.json at that depth.
S3_PATH = "s3a://my-api-data-lambda/*/*/*/*.json"
# Despite the "_PATH" suffix this is a three-part table name (it is passed to
# tableExists / DeltaTable.forName / saveAsTable below), not a filesystem path.
BRONZE_PATH = "yt_trending_catalog.brz_sch_yt.youtube_trending_table_bronze"

# ---------- STEP 1: READ ALL JSON FILES ----------
# No explicit schema is supplied, so Spark infers it from the matched files.
df_raw = spark.read.format("json").load(S3_PATH)


In [0]:
from pyspark.sql.functions import explode, col

In [0]:
# One output row per element of the top-level `items` array; each element is
# exposed as a single column named `item`.
df_exploded = df_raw.select(explode("items").alias("item"))

In [0]:

# ---------- STEP 2: SELECT COLUMNS ----------
# Each entry is (path below `item`, bronze column name). The order of the lists
# below is the column order of the resulting DataFrame.
core_fields = [
    ("id", "video_id"),
    ("etag", "item_etag"),
    ("kind", "video_kind"),
]

snippet_fields = [
    ("snippet.categoryId", "category_id"),
    ("snippet.channelId", "channel_id"),
    ("snippet.channelTitle", "channel_title"),
    ("snippet.defaultAudioLanguage", "default_audio_language"),
    ("snippet.defaultLanguage", "default_language"),
    ("snippet.description", "description"),
    ("snippet.liveBroadcastContent", "live_broadcast_content"),
    ("snippet.publishedAt", "published_at"),
    ("snippet.title", "title"),
    # Localized fields
    ("snippet.localized.title", "localized_title"),
    ("snippet.localized.description", "localized_description"),
    # Tags (array)
    ("snippet.tags", "tags"),
]

# Thumbnails: url / height / width for every rendition, rendition-major order.
thumbnail_fields = [
    (f"snippet.thumbnails.{size}.{attr}", f"thumbnail_{size}_{attr}")
    for size in ("default", "medium", "high", "standard", "maxres")
    for attr in ("url", "height", "width")
]

statistics_fields = [
    ("statistics.viewCount", "views"),
    ("statistics.likeCount", "likes"),
    ("statistics.commentCount", "comments"),
    ("statistics.favoriteCount", "favorite_count"),
]

bronze_columns = [
    col(f"item.{path}").alias(name)
    for path, name in core_fields + snippet_fields + thumbnail_fields + statistics_fields
]

# Keep a single row per video_id across everything that was read (all matched
# files, not just one).
# NOTE(review): dropDuplicates does not specify which of the duplicate rows is
# kept — confirm that is acceptable when a video appears in several snapshots.
df_bronze = df_exploded.select(*bronze_columns).dropDuplicates(["video_id"])


In [0]:
from pyspark.sql.types import IntegerType, LongType

# Cast the statistics counters from their JSON string form to 64-bit integers.
# The YouTube Data API documents these counters as unsigned long values, and
# popular videos exceed 2,147,483,647 views; a cast to 32-bit IntegerType cannot
# represent such values (Spark returns NULL, or raises when ANSI mode is on),
# so LongType is used instead.
# NOTE(review): if the bronze table already exists with INT counter columns, the
# table schema itself is not widened by this cell — confirm whether a one-off
# schema migration is needed.
for counter in ("views", "likes", "comments", "favorite_count"):
    df_bronze = df_bronze.withColumn(counter, col(counter).cast(LongType()))

In [0]:
# Order rows by view count, highest first.
# NOTE(review): this is a global sort; confirm it is still needed, since the
# following step merges/writes into a table rather than displaying the frame.
df_bronze = df_bronze.orderBy("views", ascending=False)

In [0]:
# ---------- STEP 3: MERGE INTO BRONZE ----------
# First run: the table does not exist yet, so create it from the current batch.
# Later runs: upsert on video_id — matching rows are fully overwritten with the
# incoming values, unmatched rows are inserted.
if not spark.catalog.tableExists(BRONZE_PATH):
    df_bronze.write.format("delta").mode("overwrite").saveAsTable(BRONZE_PATH)
else:
    bronze_table = DeltaTable.forName(spark, BRONZE_PATH)
    upsert = bronze_table.alias("t").merge(
        df_bronze.alias("s"),
        "t.video_id = s.video_id",
    )
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()