In [0]:
from pyspark.sql import functions as F
# Load bronze events table
events = spark.table("workspace.ecommerce.events_delta")

# Recreate binary label: flag every event as 1 when it is a "purchase" and 0
# otherwise, then take the per-user maximum so a user gets 1 as soon as any
# of their events is a purchase.
is_purchase_event = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
label_df = (
    events
    .groupBy("user_id")
    .agg(F.max(is_purchase_event).alias("purchased"))
)

# Load silver feature table
features_df = spark.table("workspace.ecommerce.user_features_silver")

# Recreate training dataset (features + label). The join is an inner join on
# user_id, so only users present in both tables are kept.
training_data = features_df.join(label_df, on="user_id", how="inner")

print("Training dataset recreated successfully!")
training_data.show(5)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Pack the model inputs into a single vector column named "features".
# NOTE(review): "purchases" and "total_spent" look like they may be derived
# from the same purchase events that define the "purchased" label; if so they
# leak the label into the features. Confirm how user_features_silver is built.
feature_columns = ["total_events", "purchases", "total_spent", "avg_price"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the feature vector and the label column for modelling.
assembled = assembler.transform(training_data)
ml_data = assembled.select("features", "purchased")

print("Feature vector created successfully!")
ml_data.show(5, truncate=False)

In [0]:
# Logistic regression configuration: 20 iterations at most, regParam 0.1,
# reading the assembled vector and the "purchased" label.
lr_params = {
    "featuresCol": "features",
    "labelCol": "purchased",
    "maxIter": 20,
    "regParam": 0.1,
}
lr = LogisticRegression(**lr_params)

# Fit on all of ml_data (no hold-out split is applied in this cell).
lr_model = lr.fit(ml_data)

print("Logistic Regression model trained successfully!")

In [0]:
# Generate predictions from Logistic Regression model.
# ml_data is the same DataFrame lr_model was fitted on, so the AUC below is an
# in-sample figure.
lr_predictions = lr_model.transform(ml_data)

# Evaluate using AUC (area under the ROC curve) against the "purchased" label
evaluator_params = {
    "labelCol": "purchased",
    "rawPredictionCol": "rawPrediction",
    "metricName": "areaUnderROC",
}
evaluator = BinaryClassificationEvaluator(**evaluator_params)
lr_auc = evaluator.evaluate(lr_predictions)

print("Logistic Regression AUC:", lr_auc)

In [0]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest configuration: 100 trees, depth capped at 5, fixed seed so the
# fit is repeatable.
rf_params = {
    "featuresCol": "features",
    "labelCol": "purchased",
    "numTrees": 100,
    "maxDepth": 5,
    "seed": 42,
}
rf = RandomForestClassifier(**rf_params)

# Fit on all of ml_data (no hold-out split is applied in this cell).
rf_model = rf.fit(ml_data)

print("Random Forest model trained successfully!")

In [0]:
# Generate predictions from RandomForest model.
# ml_data is the same DataFrame rf_model was fitted on, so the AUC below is an
# in-sample figure.
rf_predictions = rf_model.transform(ml_data)

# Evaluate AUC for RandomForest.
# The label column is pinned through the param-map override of evaluate():
# the name `evaluator` is rebound later in the notebook with labelCol="label",
# and rf_predictions has no "label" column, so re-running this cell after that
# rebinding would otherwise fail. With the override the result no longer
# depends on which evaluator instance is currently bound.
rf_auc = evaluator.evaluate(rf_predictions, {evaluator.labelCol: "purchased"})

print("RandomForest AUC:", rf_auc)

In [0]:
# Recreate train/test split from training_data: 80% train / 20% test with a
# fixed seed.
# NOTE(review): no cache/persist of training_data is visible in this notebook,
# so every action below recomputes the join before splitting — confirm the
# split stays identical across recomputations on this cluster.
train_df, test_df = training_data.randomSplit(weights=[0.8, 0.2], seed=42)

print("Train/Test recreated successfully!")

# Each count() is a separate Spark action.
train_count = train_df.count()
print("Train count:", train_count)
test_count = test_df.count()
print("Test count:", test_count)

In [0]:
# Columns kept for modelling: the assembled vector plus the "purchased"
# column exposed under the name "label".
ml_columns = ["features", F.col("purchased").alias("label")]

# Create ML-ready train dataset
train_ml = assembler.transform(train_df).select(*ml_columns)

# Create ML-ready test dataset
test_ml = assembler.transform(test_df).select(*ml_columns)

print("Train and Test ML datasets prepared!")

# Each count() is a separate Spark action.
train_ml_count = train_ml.count()
print("Train ML count:", train_ml_count)
test_ml_count = test_ml.count()
print("Test ML count:", test_ml_count)

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Train model: same logistic regression settings as the earlier full-data fit
# (maxIter=20, regParam=0.1), but reading the "label" column and fitted on the
# train split only.
lr_final_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "maxIter": 20,
    "regParam": 0.1,
}
lr_final = LogisticRegression(**lr_final_params)

lr_final_model = lr_final.fit(train_ml)

print("Final Logistic Regression model trained on TRAIN set")

In [0]:
# Make predictions on TEST data (rows not used to fit lr_final_model)
lr_test_predictions = lr_final_model.transform(test_ml)

# Create evaluator for AUC.
# This rebinds the notebook-level name `evaluator`: the instance created here
# reads the "label" column, whereas the earlier one reads "purchased".
test_evaluator_params = {
    "labelCol": "label",
    "rawPredictionCol": "rawPrediction",
    "metricName": "areaUnderROC",
}
evaluator = BinaryClassificationEvaluator(**test_evaluator_params)

# Calculate REAL AUC on unseen test data
lr_test_auc = evaluator.evaluate(lr_test_predictions)

print("Final Logistic Regression Test AUC:", lr_test_auc)


In [0]:
# Retrain RandomForest on TRAIN set (correct way).
# Hyperparameters here are 50 trees with depth up to 10, which differ from the
# earlier full-data forest (100 trees, depth 5); the seed is fixed at 42.
rf_final_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "numTrees": 50,
    "maxDepth": 10,
    "seed": 42,
}
rf_final = RandomForestClassifier(**rf_final_params)

rf_final_model = rf_final.fit(train_ml)

print("Final RandomForest model trained on TRAIN set!")

In [0]:
# Make predictions on TEST data using final RandomForest model
rf_test_predictions = rf_final_model.transform(test_ml)

# Calculate REAL AUC on unseen test data.
# The label column is pinned through the param-map override of evaluate():
# the name `evaluator` is also bound earlier in the notebook to an instance
# with labelCol="purchased", and rf_test_predictions only carries "label".
# With the override this cell gives the same result regardless of which of
# the two evaluator cells was executed most recently.
rf_test_auc = evaluator.evaluate(rf_test_predictions, {evaluator.labelCol: "label"})

print("Final RandomForest Test AUC:", rf_test_auc)