In [0]:
# Core imports
import mlflow
import mlflow.spark

# Spark helpers
from pyspark.sql.functions import col
from pyspark.sql.functions import abs as sql_abs
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.functions import vector_to_array

In [0]:
# Read the gold-layer churn ML table into a Spark DataFrame
gold_df = spark.table(
    "ai_trust_catalog.churn_trust.gold_churn_ml"
)

# Print column names and types so later cells can be checked against them
gold_df.printSchema()

In [0]:
# Churn model experiment (MLflow experiment ID passed to search_runs below)
churn_experiment_id = 3980534552620287

# Fetch only the most recently started run of that experiment
runs_df = mlflow.search_runs(
    experiment_ids=[churn_experiment_id],
    order_by=["start_time DESC"],
    max_results=1
)

# An experiment without runs yields an empty DataFrame; fail with a clear
# message here instead of an opaque IndexError from iloc[0] below.
if runs_df.empty:
    raise ValueError(
        f"No MLflow runs found for experiment {churn_experiment_id}"
    )

# Run ID of the latest run; the bare name on the last line displays it
churn_run_id = runs_df.iloc[0]["run_id"]
churn_run_id

In [0]:
from pyspark.ml.classification import LogisticRegression

# Train / validation split, selected via the dataset_split column
train_df = gold_df.filter(col("dataset_split") == "train")
val_df   = gold_df.filter(col("dataset_split") == "validation")

# Base churn model: logistic regression reading the "features" column
# and fitting against the "label" column
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label"
)

# Fit on the training rows only; val_df is kept aside for scoring
churn_model = lr.fit(train_df)

In [0]:
# Score the validation split with the fitted base churn model
pred_df = churn_model.transform(val_df)

# Peek at five rows of ground truth next to the model outputs
pred_df.select("label", "prediction", "probability").show(5)


In [0]:
# Derive confidence features from the churn model's probability vector
trust_features_df = (
    pred_df
    # ML vector columns cannot be indexed directly; convert to an array first
    .withColumn("prob_array", vector_to_array("probability"))
    # Confidence = distance of element 1 of the probability vector from 0.5,
    # rescaled to [0, 1]. Using the raw probability instead would give
    # confident class-0 predictions a near-zero "confidence", and would
    # disagree with the formula used by the later trust cells.
    .withColumn(
        "prediction_confidence",
        sql_abs(col("prob_array")[1] - 0.5) * 2
    )
    # Uncertainty is the complement of confidence
    .withColumn("prediction_uncertainty", 1 - col("prediction_confidence"))
)

In [0]:
# Trust label = 1 when the churn model's prediction equals the ground truth,
# 0 otherwise. The comparison uses "label" (the column the churn model was
# fit on) rather than "churn", so the trust label is measured against the
# same target the prediction was trained for.
trust_train_df = trust_features_df.withColumn(
    "trust_label",
    (col("prediction") == col("label")).cast("int")
)

In [0]:
# Pack the two confidence-derived columns into a single vector column
trust_assembler = VectorAssembler(
    outputCol="trust_features",
    inputCols=["prediction_confidence", "prediction_uncertainty"],
)

# Adds the "trust_features" column and rebinds trust_train_df to the result
trust_train_df = trust_assembler.transform(trust_train_df)

In [0]:
from pyspark.sql.functions import col, abs as sql_abs

# Base frame for the trust features: the gold churn ML table, read as-is
trust_base_df = spark.table("ai_trust_catalog.churn_trust.gold_churn_ml")

In [0]:
# Trust label: 1 where "prediction" equals "label", otherwise 0.
# NOTE(review): trust_base_df is read straight from gold_churn_ml, so this
# assumes that table already carries "prediction" and "probability" columns
# - confirm against the printed schema.
trust_train_df = trust_base_df.withColumn(
    "trust_label", (col("label") == col("prediction")).cast("int")
)

# Confidence = |probability[1] - 0.5| * 2; uncertainty is its complement.
# NOTE(review): the [1] lookup presumably needs "probability" to be an array
# column rather than an ML vector - verify before relying on this cell.
trust_train_df = (
    trust_train_df
    .withColumn("prediction_confidence", sql_abs(col("probability")[1] - 0.5) * 2)
    .withColumn("prediction_uncertainty", 1 - col("prediction_confidence"))
)

In [0]:
# Partition the trust frame by its dataset_split marker
train_vec = trust_train_df.where(col("dataset_split") == "train")
val_vec = trust_train_df.where(col("dataset_split") == "validation")

In [0]:
# Re-read the gold churn ML table (the same table loaded at the top of the
# notebook) and rebind gold_df to it
gold_df = spark.table("ai_trust_catalog.churn_trust.gold_churn_ml")

In [0]:
from pyspark.sql.functions import col

# Split the gold table on dataset_split: "train" rows are used to fit the
# trust model, "validation" rows are the ones that get scored
trust_train_df = gold_df.where(col("dataset_split") == "train")
trust_val_df = gold_df.where(col("dataset_split") == "validation")

In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression whose outputs are written to trust_* columns.
# NOTE(review): this uses the same "features"/"label" pair and the same
# training split as the base churn model, so it fits the same target rather
# than a separate correctness label - confirm this is intended.
trust_model = LogisticRegression(
    labelCol="label",  # USE label, NOT churn
    featuresCol="features",
    predictionCol="trust_prediction",
    probabilityCol="trust_probability",
)

# Fit on the training split
model_2 = trust_model.fit(trust_train_df)

In [0]:
# Score the validation split with the fitted trust model; the model writes
# its outputs to the "trust_prediction" and "trust_probability" columns
trust_scored_df = model_2.transform(trust_val_df)

In [0]:
from pyspark.sql.functions import col, abs

# "trust_probability" is an ML vector column, which cannot be indexed with
# [1] directly (Spark rejects the extraction at analysis time). Convert it to
# an array first, then read element 1.
trust_final_df = (
    trust_scored_df
    # Confidence: distance of element 1 from 0.5, rescaled to [0, 1]
    .withColumn(
        "prediction_confidence",
        abs(vector_to_array(col("trust_probability"))[1] - 0.5) * 2
    )
    # Uncertainty: complement of confidence
    .withColumn(
        "prediction_uncertainty",
        1 - col("prediction_confidence")
    )
)

In [0]:
# Expose the ground-truth column under its Trust-dataset name
trust_final_df = trust_final_df.withColumnRenamed("label", "trust_label")

In [0]:
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, abs

In [0]:
# Build the final trust frame from the scored validation rows
trust_final_df = (
    trust_scored_df
    # Temporary array view of the probability vector so element 1 can be read
    .withColumn("trust_prob_array", vector_to_array(col("trust_probability")))
    # Confidence: |p[1] - 0.5| * 2
    .withColumn("prediction_confidence", abs(col("trust_prob_array")[1] - 0.5) * 2)
    # Uncertainty: complement of confidence
    .withColumn("prediction_uncertainty", 1 - col("prediction_confidence"))
    # The helper array is not part of the output
    .drop("trust_prob_array")
)

In [0]:
# Rename the ground-truth column to the name used in the Trust dataset
trust_final_df = trust_final_df.withColumnRenamed("label", "trust_label")

In [0]:
# Show the columns and types of the trust frame before selecting the final set
trust_final_df.printSchema()

In [0]:
# Keep only the columns that make up the finalized Trust dataset.
# trust_final_df derives from the validation split only, so dataset_split
# holds a single value here.
trust_final_df = trust_final_df.select(
    "features", "trust_label", "trust_prediction", "trust_probability",
    "prediction_confidence", "prediction_uncertainty", "dataset_split",
)

In [0]:
# Preview Trust dataset: first five rows, with truncate=False so vector
# columns are printed in full
trust_final_df.show(5, truncate=False)

In [0]:
# Persist the Trust dataset as a Delta table in the Gold layer.
# mode("overwrite") replaces any existing table contents, and overwriteSchema
# lets the stored schema change along with them.
(
    trust_final_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ai_trust_catalog.churn_trust.gold_trust_dataset")
)

In [0]:
# Read the table back and print its schema to confirm what was persisted
spark.table("ai_trust_catalog.churn_trust.gold_trust_dataset").printSchema()

In [0]:
# Check dataset split distribution: row count per dataset_split value in the
# saved Trust table
spark.sql("""
SELECT dataset_split, COUNT(*) AS records
FROM ai_trust_catalog.churn_trust.gold_trust_dataset
GROUP BY dataset_split
""").show()

### Trust Dataset Creation Summary

- Loaded Gold churn ML features (`gold_churn_ml`)
- Trained Trust model (Logistic Regression) on feature vectors
- Generated trust predictions and probabilities
- Derived prediction confidence and uncertainty
- Renamed `label` to `trust_label` and selected the final Trust dataset columns
- Persisted final Trust dataset to Gold layer
