In [0]:
# MAGIC %run ./00_config
# STEP 3: Feature Engineering and Alignment (ML-ready Features)

from pyspark.sql import functions as F, Window

# ---- Fallback for standalone runs: define `tbl` only when it is not already in scope ----
try:
    tbl  # type: ignore  # bare name lookup; raises NameError when the helper is absent
except NameError:
    # Catalog used to qualify table names on the fallback path.
    CATALOG = "influencer"

    def tbl(name: str) -> str:
        """Return ``name`` prefixed with the fallback catalog, i.e. ``"<CATALOG>.<name>"``."""
        return "{}.{}".format(CATALOG, name)

# ---- Load the Silver posts table; fail fast with a pointer to Step 2 if it is missing ----
silver_tbl = tbl("curated.posts_silver")

if spark.catalog.tableExists(silver_tbl):
    # `s` is the input DataFrame for the feature computation below.
    s = spark.table(silver_tbl)
else:
    raise RuntimeError(f"❌ Silver table not found: {silver_tbl}\nRun Step 2 first!")

# ---- Compute features per creator ----
# The aggregation below no longer needs this window spec; it is kept so that any
# later cell referencing `w_creator` continues to work.
w_creator = Window.partitionBy("creator_norm_id")

# One output row per creator_norm_id, built with a single groupBy aggregation
# (previously: window aggregates on every post row followed by dropDuplicates,
# which kept an arbitrary row per creator).
features = (
    s.groupBy("creator_norm_id")
     .agg(
         # Descriptive columns are taken from one row of the group, preferring
         # non-null values. NOTE(review): presumably constant per
         # creator_norm_id -- confirm against the Silver build.
         F.first("platform", ignorenulls=True).alias("platform"),
         F.first("creator_id", ignorenulls=True).alias("creator_id"),
         # count() skips rows whose post_id is null, same as the windowed count did.
         F.count("post_id").alias("total_posts"),
         F.mean("eng_rate_proxy").alias("avg_eng_rate"),
         F.mean("like_count").alias("avg_like"),
         F.mean("comment_count").alias("avg_comment"),
         F.mean("share_count").alias("avg_share"),
         F.mean("audience_age_18_24").alias("avg_age_18_24"),
         F.mean("audience_age_25_34").alias("avg_age_25_34"),
         F.mean("audience_gender_f").alias("avg_female_ratio"),
     )
     # Alignment flag is derived from the creator-level means so that it is a
     # deterministic per-creator value. Comparing the raw per-post columns made
     # the flag depend on whichever post row survived de-duplication.
     # A null on either side yields 0 via otherwise().
     .withColumn(
         "avg_align",
         F.when(F.col("avg_age_25_34") > F.col("avg_age_18_24"), 1).otherwise(0),
     )
     # Mean engagement rate divided by (1 + post count); the +1 keeps the
     # denominator non-zero.
     .withColumn(
         "activity_score",
         F.col("avg_eng_rate") / (1 + F.col("total_posts")),
     )
     # Fix the column order of the persisted table.
     .select("creator_norm_id", "platform", "creator_id",
             "total_posts", "avg_eng_rate", "avg_like", "avg_comment", "avg_share",
             "avg_age_18_24", "avg_age_25_34", "avg_female_ratio",
             "avg_align", "activity_score")
)

# ---- Persist the creator features as a Delta table, then preview the result ----
features_tbl = tbl("ml.creator_features")

# Full replace on every run: overwrite the data and allow the schema to change.
features.write.saveAsTable(
    features_tbl,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

print(f"✅ Feature table created: {features_tbl}")

# Read back from the saved table so the preview shows what was actually written.
display(spark.table(features_tbl).limit(10))


✅ Feature table created: influencer.ml.creator_features


creator_norm_id,platform,creator_id,total_posts,avg_eng_rate,avg_like,avg_comment,avg_share,avg_age_18_24,avg_age_25_34,avg_female_ratio,avg_align,activity_score
00b7c47c8ff06f8570d69042134a8b86650c18433078c5b419182eae9e918bb2,tiktok,unknown_creator,608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
742419aa8e7450263ad85b8b534b3d5ffa16055748a7cc933522ff962a8d76cc,youtube,unknown_creator,754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
8bc07220b1f7b0dceef13078018877a7da5e450703262d5dfcfd25040e2d1dfc,instagram,unknown_creator,1340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
e46e5ad757ac6efc118a88e74c7bd625eaabf6b4189ec1f27ec31a368bb72bf3,twitter,unknown_creator,298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
