In [0]:
# MAGIC %run ./00_config
# STEP 4: Success probability via quantile-normalized features + weighted sigmoid (no ML objects; values are clamped to [0, 1] with when/otherwise rather than a clip helper)

from pyspark.sql import functions as F

# ---- Fallback helpers, used only when 00_config has not defined `tbl` ----
if "tbl" not in globals():
    # Catalog prefix applied to every table name built by the fallback helper.
    CATALOG = "influencer"

    def tbl(name: str) -> str:
        """Return *name* prefixed with the fallback catalog (``CATALOG.name``)."""
        return f"{CATALOG}.{name}"

# ---- Load the feature table produced by Step 3 ----
features_tbl = tbl("ml.creator_features")
table_found = spark.catalog.tableExists(features_tbl)
if not table_found:
    raise RuntimeError(f"❌ Feature table not found: {features_tbl} — run Step 3 first!")

# Replace missing values with 0 up front so later arithmetic does not propagate nulls.
df = spark.table(features_tbl).fillna(0)

# ---- Choose feature set: keep the expected features that exist in the table ----
available_cols = set(df.columns)
feat_cols = [
    name
    for name in (
        "avg_eng_rate", "avg_like", "avg_comment", "avg_share",
        "avg_female_ratio", "activity_score", "avg_age_25_34", "avg_align",
    )
    if name in available_cols
]
if not feat_cols:
    raise RuntimeError("❌ No expected feature columns found. Check Step 3 output schema.")

# ---- Quantile-based normalization (manually clamped to [0, 1]) ----
# Each feature is rescaled so that its approximate 5th percentile maps to 0 and
# its approximate 95th percentile maps to 1; anything outside that band is
# clamped with when/otherwise.
eps = 1e-9

# A single approxQuantile call over every feature column, instead of one Spark
# job per feature inside the loop.
quantiles = df.approxQuantile(feat_cols, [0.05, 0.95], 1e-3)

# Collect all derived columns and add them in one withColumns call; PySpark's
# docs warn that calling withColumn repeatedly in a loop builds large plans.
new_cols = {}
for c, q in zip(feat_cols, quantiles):
    # An empty quantile list means there were no values to summarise: use [0, 1].
    lo = float(q[0]) if len(q) > 0 else 0.0
    hi = float(q[1]) if len(q) > 1 else lo + 1.0
    if hi - lo <= 0:
        # Zero-width band: widen to unit width so the division stays finite.
        # Every value at or below `lo` then normalizes to 0, so surface it.
        print(f"⚠️ Feature '{c}' has a zero-width 5–95% quantile range (lo={lo}); "
              f"values at or below lo normalize to 0.")
        hi = lo + 1.0
    # Null-safe copy of the raw feature (kept as its own column, as before).
    nz = F.when(F.col(c).isNull(), 0.0).otherwise(F.col(c))
    scaled = (nz - F.lit(lo)) / F.lit((hi - lo) + eps)
    new_cols[f"nz_{c}"] = nz
    # Clamp between 0 and 1 manually.
    new_cols[f"norm_{c}"] = F.when(scaled < 0, 0).when(scaled > 1, 1).otherwise(scaled)

df = df.withColumns(new_cols)

# ---- Feature weights (relative importance of each normalized feature) ----
w = dict(
    avg_eng_rate=0.30,
    avg_like=0.10,
    avg_comment=0.10,
    avg_share=0.10,
    avg_female_ratio=0.05,
    activity_score=0.15,
    avg_age_25_34=0.10,
    avg_align=0.10,
)
# Keep only the weights of features that are actually present, in feat_cols order.
w = {name: w[name] for name in feat_cols if name in w}
# Rescale so the remaining weights sum to 1; `or 1.0` avoids dividing by a zero total.
wsum = sum(w.values()) or 1.0
w = {name: weight / wsum for name, weight in w.items()}

# ---- Weighted score: sum of norm_<feature> * weight over the active features ----
terms = [F.col(f"norm_{name}") * F.lit(weight) for name, weight in w.items()]
# Fold left-to-right starting from the first term so the expression matches a manual loop.
raw = sum(terms[1:], terms[0]) if terms else None
df = df.withColumn("raw_score", raw)

# ---- Map to probability via sigmoid ----
# success_prob = 1 / (1 + exp(-a * (raw_score - b))):
#   a controls the steepness, b is the raw_score that maps to probability 0.5.
a, b = 6.0, 0.5
df = df.withColumn("success_prob", 1 / (1 + F.exp(-a * (F.col("raw_score") - F.lit(b)))))

# ---- Heuristic label for validation ----
# Rows whose raw_score is strictly above the approximate 60th percentile get label 1.
thr_quantiles = df.approxQuantile("raw_score", [0.6], 1e-3)
# The quantile list can be empty when there are no values to summarise (the
# normalization step guards the same case); fall back to the sigmoid midpoint
# instead of failing with IndexError.
thr = float(thr_quantiles[0]) if thr_quantiles else b
df = df.withColumn("label", (F.col("raw_score") > F.lit(thr)).cast("int"))

# ---- Persist predictions as a Delta table, replacing any previous run ----
pred_tbl = tbl("ml.creator_predictions")
predictions = df.select("creator_norm_id", "success_prob", "label")
writer = (
    predictions.write
    .format("delta")
    .mode("overwrite")
    # Allow the stored schema to be replaced along with the data.
    .option("overwriteSchema", "true")
)
writer.saveAsTable(pred_tbl)

print(f"✅ Step 4 complete (no-ML scorer). Predictions saved to: {pred_tbl}")

# Preview the ten highest-probability creators, read back from the saved table.
top_predictions = spark.table(pred_tbl).orderBy(F.col("success_prob").desc()).limit(10)
display(top_predictions)


✅ Step 4 complete (no-ML scorer). Predictions saved to: influencer.ml.creator_predictions


creator_norm_id,success_prob,label
8bc07220b1f7b0dceef13078018877a7da5e450703262d5dfcfd25040e2d1dfc,0.0474258731775667,0
742419aa8e7450263ad85b8b534b3d5ffa16055748a7cc933522ff962a8d76cc,0.0474258731775667,0
00b7c47c8ff06f8570d69042134a8b86650c18433078c5b419182eae9e918bb2,0.0474258731775667,0
e46e5ad757ac6efc118a88e74c7bd625eaabf6b4189ec1f27ec31a368bb72bf3,0.0474258731775667,0
