In [0]:
# # 02 - Ingeniería de Features

# COMMAND ----------

from pyspark.sql.functions import avg

# Load the cleaned bets table.
df = spark.table("datalottery.lotterybets.lottery_bets_dirty_cleaned")

# Per-user averages (stake amount, IP risk, geo risk), computed in a single
# groupBy so the source table is aggregated once and joined back once,
# instead of running two separate aggregations and two joins on the same key.
user_avgs = df.groupBy("user_id").agg(
    avg("stake_amount").alias("avg_stake_amount"),
    avg("ip_risk").alias("avg_user_ip_risk"),
    avg("geo_risk").alias("avg_user_geo_risk"),
)

# Lazy projections kept under the original names so that any other cell
# still referring to them keeps working; they add no cost unless evaluated.
avg_stake = user_avgs.select("user_id", "avg_stake_amount")
user_risk_avg = user_avgs.select("user_id", "avg_user_ip_risk", "avg_user_geo_risk")

# Base features, reduced to one row per user.
# NOTE(review): dropDuplicates(["user_id"]) keeps an arbitrary row for each
# user, so the row-level columns selected here (bets_last_7d, ip_risk,
# suspicious, ...) can differ between runs when a user has several rows.
# Confirm which row should represent a user (e.g. the most recent one) and
# select it explicitly once the ordering column is known.
base_features_df = df.select(
    "user_id",
    "bets_last_7d",
    "win_rate_last_30d",
    "ip_risk",
    "geo_risk",
    "num_picks",
    "suspicious",
).dropDuplicates(["user_id"])

# Attach the per-user averages; a left join keeps every base-feature row.
features_df = base_features_df.join(user_avgs, on="user_id", how="left")

# Persist the final feature table as a Delta table, replacing previous contents.
# NOTE(review): with mode("overwrite"), mergeSchema presumably only adds new
# columns to the existing table schema; if the schema is meant to be fully
# replaced on each run, overwriteSchema may be the intended option - confirm.
(
    features_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("datalottery.lotterybets.lottery_bets_dirty_features")
)


In [0]:
# Sanity check: show the column names of the cleaned input DataFrame.
input_columns = df.columns
print(input_columns)