In [0]:
# --- STEP 2: Clean & standardize to Silver ---

# If 00_config is in the same folder, keep this line enabled.
# MAGIC %run ./00_config

from pyspark.sql import functions as F

# ---- Standalone fallback: supply tbl() when 00_config has not defined it ----
if "tbl" not in globals():
    CATALOG = "influencer"  # default catalog; change if needed

    def tbl(name: str) -> str:
        """Qualify `name` with the default catalog, giving "<CATALOG>.<name>"."""
        return "{}.{}".format(CATALOG, name)

# ---- Resolve the Bronze input; load it if present, otherwise stop with a clear error ----
bronze_tbl = tbl("raw.posts_bronze")
if spark.catalog.tableExists(bronze_tbl):
    # Bronze is available: this DataFrame is the input to the cleaning step.
    b = spark.table(bronze_tbl)
else:
    # Nothing to clean yet; point the user at the step that creates the table.
    raise RuntimeError(
        f"❌ Bronze table not found: {bronze_tbl}\n"
        f"Run Step 1 first to create it."
    )

# ---- Clean & standardize types/strings/timestamps ----
# Each withColumn below overwrites the Bronze column of the same name:
#   * counts        -> int, NULL replaced by 0
#   * audience_*    -> double, NULL replaced by 0.0
#   * string fields -> surrounding whitespace trimmed (platform lower-cased,
#                      audience_country upper-cased)
#   * timestamp     -> parsed with to_timestamp, NULL replaced by the run time
# NOTE(review): with ANSI mode off a value the cast/parse cannot convert is
# presumably turned into NULL first and then takes the default — confirm the
# cluster's ANSI setting.
clean = (
    b.withColumn("like_count",        F.coalesce(F.col("like_count").cast("int"),   F.lit(0)))
     .withColumn("comment_count",     F.coalesce(F.col("comment_count").cast("int"),F.lit(0)))
     .withColumn("share_count",       F.coalesce(F.col("share_count").cast("int"),  F.lit(0)))
     .withColumn("audience_age_18_24",F.coalesce(F.col("audience_age_18_24").cast("double"), F.lit(0.0)))
     .withColumn("audience_age_25_34",F.coalesce(F.col("audience_age_25_34").cast("double"), F.lit(0.0)))
     .withColumn("audience_gender_f", F.coalesce(F.col("audience_gender_f").cast("double"),  F.lit(0.0)))
     .withColumn("platform",          F.lower(F.trim(F.col("platform"))))
     .withColumn("creator_id",        F.trim(F.col("creator_id")))
     .withColumn("post_id",           F.trim(F.col("post_id")))
     .withColumn("text",              F.trim(F.col("text")))
     # Trim before upper-casing, like the other string columns, so that
     # " Kenya" and "KENYA" standardize to the same value.
     .withColumn("audience_country",  F.upper(F.trim(F.col("audience_country"))))
     # NOTE(review): the fallback stamps rows with the time of this run, so a
     # missing timestamp becomes indistinguishable from a genuinely recent
     # post and the column differs between runs — confirm this is intended.
     .withColumn("timestamp",         F.coalesce(F.to_timestamp(F.col("timestamp")), F.current_timestamp()))
)

# ---- Quality filters + derived fields ----
# Rows must carry usable keys and non-negative counts; one row is kept per
# post_id, then two derived columns are added.
silver = (
    clean.filter("post_id IS NOT NULL AND creator_id IS NOT NULL AND platform IS NOT NULL")
         # The keys are trimmed earlier in this cell, so a whitespace-only value
         # arrives here as ''. Reject it too: a blank key is as unusable as a
         # NULL one and would otherwise be hashed into creator_norm_id.
         .filter("post_id <> '' AND creator_id <> '' AND platform <> ''")
         .filter("like_count >= 0 AND comment_count >= 0 AND share_count >= 0")
         # NOTE(review): nothing here controls which of several rows sharing a
         # post_id is the one kept — confirm that an arbitrary pick is acceptable.
         .dropDuplicates(["post_id"])
         # SHA-256 hex digest of "<platform>:<creator_id>".
         .withColumn("creator_norm_id", F.sha2(F.concat_ws(":", F.col("platform"), F.col("creator_id")), 256))
         # Plain sum of likes, comments and shares; despite the name it is not
         # divided by any audience or follower figure.
         .withColumn("eng_rate_proxy",  F.col("like_count") + F.col("comment_count") + F.col("share_count"))
)

# ---- Persist Silver as a Delta table, replacing any existing data and schema ----
silver_tbl = tbl("curated.posts_silver")
silver.write.saveAsTable(
    silver_tbl,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Confirm the write and preview the first rows read back from the saved table.
print(f"✅ Silver table created: {silver_tbl}")
display(spark.table(silver_tbl).limit(10))


✅ Silver table created: influencer.curated.posts_silver


platform,creator_id,post_id,timestamp,text,like_count,comment_count,share_count,audience_country,audience_age_18_24,audience_age_25_34,audience_gender_f,creator_norm_id,eng_rate_proxy
instagram,unknown_creator,p_1159,2025-11-11T15:24:51.628Z,,0,0,0,NIGERIA,0.0,0.0,0.0,8bc07220b1f7b0dceef13078018877a7da5e450703262d5dfcfd25040e2d1dfc,0
twitter,unknown_creator,p_412,2025-11-11T15:24:51.628Z,,0,0,0,INDIA,0.0,0.0,0.0,e46e5ad757ac6efc118a88e74c7bd625eaabf6b4189ec1f27ec31a368bb72bf3,0
youtube,unknown_creator,p_969,2025-11-11T15:24:51.628Z,,0,0,0,BRAZIL,0.0,0.0,0.0,742419aa8e7450263ad85b8b534b3d5ffa16055748a7cc933522ff962a8d76cc,0
instagram,unknown_creator,p_2899,2025-11-11T15:24:51.628Z,,0,0,0,UNITED KINGDOM,0.0,0.0,0.0,8bc07220b1f7b0dceef13078018877a7da5e450703262d5dfcfd25040e2d1dfc,0
instagram,unknown_creator,p_762,2025-11-11T15:24:51.628Z,,0,0,0,KENYA,0.0,0.0,0.0,8bc07220b1f7b0dceef13078018877a7da5e450703262d5dfcfd25040e2d1dfc,0
tiktok,unknown_creator,p_1796,2025-11-11T15:24:51.628Z,,0,0,0,PHILIPPINES,0.0,0.0,0.0,00b7c47c8ff06f8570d69042134a8b86650c18433078c5b419182eae9e918bb2,0
tiktok,unknown_creator,p_2901,2025-11-11T15:24:51.628Z,,0,0,0,UNITED STATES,0.0,0.0,0.0,00b7c47c8ff06f8570d69042134a8b86650c18433078c5b419182eae9e918bb2,0
tiktok,unknown_creator,p_2,2025-11-11T15:24:51.628Z,,0,0,0,KENYA,0.0,0.0,0.0,00b7c47c8ff06f8570d69042134a8b86650c18433078c5b419182eae9e918bb2,0
youtube,unknown_creator,p_1405,2025-11-11T15:24:51.628Z,,0,0,0,UNITED STATES,0.0,0.0,0.0,742419aa8e7450263ad85b8b534b3d5ffa16055748a7cc933522ff962a8d76cc,0
youtube,unknown_creator,p_266,2025-11-11T15:24:51.628Z,,0,0,0,FRANCE,0.0,0.0,0.0,742419aa8e7450263ad85b8b534b3d5ffa16055748a7cc933522ff962a8d76cc,0
