In [0]:
# Read the ML-ready churn table from the Gold layer into a Spark DataFrame
gold_df = spark.table("ai_trust_catalog.churn_trust.gold_churn_ml")

# Render the loaded DataFrame in the notebook output for a quick visual check
display(gold_df)

In [0]:
# Partition the Gold data using its dataset_split marker column:
# rows tagged 'train' are used for fitting, rows tagged 'validation' for scoring.
train_df = gold_df.where("dataset_split = 'train'")
val_df = gold_df.where("dataset_split = 'validation'")

In [0]:
# LogisticRegression is not imported anywhere else in this notebook; without this
# import the cell fails with NameError when run on a fresh kernel.
from pyspark.ml.classification import LogisticRegression

# Baseline churn classifier configuration.
# - featuresCol / labelCol name the input columns read from the training DataFrame.
# - probabilityCol / predictionCol name the output columns added by transform().
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",          # <-- MUST match Gold schema
    probabilityCol="probability",
    predictionCol="prediction"
)

In [0]:
# Fit the baseline estimator on the training split to obtain Model-1
model_1 = lr.fit(dataset=train_df)

In [0]:
# Configure MLflow for serverless Spark: set the MLFLOW_DFS_TMP environment
# variable to a Volumes path before any model is logged.
import os

os.environ.update(
    {"MLFLOW_DFS_TMP": "/Volumes/ai_trust_catalog/mlflow_artifacts/spark_ml_tmp"}
)

In [0]:
import mlflow
import mlflow.spark

# Intentionally no mlflow.set_experiment(...) call here: on Databricks Free /
# Serverless the run is left to land in the default experiment.

with mlflow.start_run(run_name="logistic_regression_churn"):
    # Record the run's descriptive parameters in a single call
    mlflow.log_params(
        {
            "model_type": "logistic_regression",
            "input_table": "gold_churn_ml",
        }
    )

    # Log the fitted Spark ML model under the "model_1" artifact path
    mlflow.spark.log_model(model_1, artifact_path="model_1")

In [0]:
# Score the validation split with the trained baseline model
val_predictions = model_1.transform(val_df)

# Save the scored rows as a Delta table, replacing any existing data and schema,
# so they are available for Trust Model training
(
    val_predictions.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ai_trust_catalog.churn_trust.model_1_predictions")
)

## Summary

- Loaded ML-ready Gold dataset
- Trained a baseline Logistic Regression churn model
- Logged Model-1 to MLflow for versioning
- Stored validation predictions for Trust Model training