# Gold Layer – ML-Ready Feature Dataset

This notebook converts cleaned Silver data into an ML-ready Gold dataset by
encoding features, assembling vectors, and creating train/validation splits.

In [0]:
# Read the Silver-layer churn table; the Gold transformations below start
# from this DataFrame.
SILVER_TABLE = "ai_trust_catalog.churn_trust.silver_customer_churn"
silver_df = spark.table(SILVER_TABLE)

# Render the DataFrame for a visual check before transforming it.
display(silver_df)

In [0]:
# Build the Gold base frame from Silver in one chain:
#   * rename the target column "churn" to "label"
#   * drop the "customerid" identifier so it is not picked up as a feature
gold_df = (
    silver_df
    .withColumnRenamed("churn", "label")
    .drop("customerid")
)

# withColumnRenamed is a silent no-op when the source column is absent, so
# verify the result explicitly. Without this check a schema change in Silver
# would only surface much later, after the feature pipeline has been fitted.
if "label" not in gold_df.columns:
    raise ValueError(
        'Expected a "churn" (or "label") column in the Silver table; '
        f"found columns: {gold_df.columns}"
    )

In [0]:
from pyspark.sql.types import StringType, NumericType

# Every column except the target is a feature candidate.
feature_fields = [
    field for field in gold_df.schema.fields if field.name != "label"
]

# String-typed candidates are treated as categorical and encoded later.
categorical_cols = [
    field.name
    for field in feature_fields
    if isinstance(field.dataType, StringType)
]

# Numeric-typed candidates are passed to the feature vector as-is.
numeric_cols = [
    field.name
    for field in feature_fields
    if isinstance(field.dataType, NumericType)
]
# Columns whose type is neither string nor numeric end up in neither list.

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Derived column names: "<col>_idx" holds the category index and
# "<col>_ohe" holds the one-hot vector built from that index.
indexed_cols = [f"{name}_idx" for name in categorical_cols]
encoded_cols = [f"{name}_ohe" for name in categorical_cols]

# One StringIndexer per categorical column. handleInvalid="keep" asks the
# indexer to retain rows with invalid values instead of raising on them.
indexers = [
    StringIndexer(inputCol=source, outputCol=target, handleInvalid="keep")
    for source, target in zip(categorical_cols, indexed_cols)
]

# A single OneHotEncoder maps every index column to its "_ohe" column.
encoder = OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Feature vector layout: the numeric columns first, followed by the one-hot
# columns ("<col>_ohe") of the categorical features, in list order.
assembler_inputs = numeric_cols + [f"{name}_ohe" for name in categorical_cols]

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [0]:
from pyspark.ml import Pipeline

# Stage order: index the categorical columns, one-hot encode the indices,
# then assemble everything into the "features" vector.
feature_stages = [*indexers, encoder, assembler]
feature_pipeline = Pipeline(stages=feature_stages)

# NOTE(review): the pipeline is fitted on all of gold_df and the
# train/validation split is taken afterwards from the transformed frame, so
# the fitted stages have seen the validation rows. Confirm this is acceptable.
feature_model = feature_pipeline.fit(gold_df)
gold_ready_df = feature_model.transform(gold_df)

In [0]:
from pyspark.sql.functions import lit

# 80/20 random split with a fixed seed.
splits = gold_ready_df.randomSplit([0.8, 0.2], seed=42)

# Tag each part with a constant "dataset_split" value so the two subsets can
# still be told apart after they are recombined below.
train_df, val_df = (
    part.withColumn("dataset_split", lit(tag))
    for part, tag in zip(splits, ("train", "validation"))
)

# Recombine into one frame; columns are matched by name.
gold_final_df = train_df.unionByName(val_df)

In [0]:
# Project down to the three columns kept in the Gold table: the assembled
# feature vector, the target, and the split flag.
ml_columns = ["features", "label", "dataset_split"]
gold_final_df = gold_final_df.select(*ml_columns)

In [0]:
# Save the Gold dataset as a Delta table, replacing any existing contents.
# overwriteSchema=true lets the overwrite also replace the table's schema
# when it differs from the DataFrame's schema.
gold_writer = (
    gold_final_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
gold_writer.saveAsTable("ai_trust_catalog.churn_trust.gold_churn_ml")

In [0]:
%sql
-- Show the column names and types of the Gold table as stored in the catalog.
DESCRIBE TABLE ai_trust_catalog.churn_trust.gold_churn_ml;

In [0]:
%sql
-- Row count per dataset_split value in the Gold table.
SELECT dataset_split, COUNT(*)
FROM ai_trust_catalog.churn_trust.gold_churn_ml
GROUP BY dataset_split;

## Summary

- Loaded cleaned data from the Silver layer
- Standardized target label and removed identifiers
- Encoded categorical features and assembled feature vectors
- Created reproducible train and validation splits
- Persisted an ML-ready Gold dataset for model training