In [0]:
import os
# Point MLflow's Spark flavor at a Unity Catalog volume for its temporary
# model files; this must be set before the model is loaded.
os.environ.update(
    MLFLOW_DFS_TMP="/Volumes/workspace/ecommerce/ecommerce_data/mlflow_tmp"
)
print("UC volume temp path set!")

In [0]:
import mlflow.spark
# Artifact URI of the logged model, in MLflow's runs:/<run_id>/<artifact_path> form.
model_uri = "runs:/af7882d6199441f5a3b64002f7e8138a/random_forest_model"

# Rehydrate the logged Spark ML model so it can be applied to new data below.
rf_model = mlflow.spark.load_model(model_uri=model_uri)
print("Model loaded successfully!")

In [0]:
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col
# Read the silver-layer user feature table that will be scored.
features_df = spark.read.table("workspace.ecommerce.silver_user_features")

In [0]:
from pyspark.ml.feature import VectorAssembler
# Pack the four numeric user metrics into a single vector column, "features".
# NOTE(review): presumably this must mirror the assembler used at training
# time (same columns, same order) -- confirm against the training notebook.
assembler = (
    VectorAssembler()
    .setInputCols(["total_events", "total_purchases", "total_spent", "avg_spent"])
    .setOutputCol("features")
)

# Append the assembled vector column to the feature table.
final_ml = assembler.transform(features_df)

In [0]:
# Apply the loaded model to every row of the assembled feature table.
full_predictions = rf_model.transform(final_ml)

# Convert the "probability" vector to an array and keep element 1 as a plain
# numeric column.
# NOTE(review): index 1 is presumably the positive ("will purchase") class --
# confirm against the label encoding used at training time.
scored_df = full_predictions.withColumn(
    "purchase_probability",
    vector_to_array(col("probability")).getItem(1),
)

print("All users scored successfully!")

In [0]:
from pyspark.sql.functions import current_date

# Shape the gold-layer output: user key, predicted label, purchase
# probability, and the date on which the scoring ran.
gold_df = scored_df.select(
    "user_id",
    col("prediction").alias("predicted_label"),
    "purchase_probability",
    current_date().alias("scoring_date"),
)

# Persist as a Delta table; "overwrite" replaces the table contents on each run.
(
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.gold_user_purchase_predictions")
)

print("Gold prediction table created!")

In [0]:
from pyspark.sql.functions import desc
# Rank users from highest to lowest purchase probability.
top_buyers = gold_df.sort(desc("purchase_probability"))

# Preview the ten highest-ranked users.
top_buyers.show(n=10)