# Silver ➜ Gold Metrics Aggregation

This notebook builds gold-layer growth metrics from the GDPR-compliant silver orders dataset.


In [None]:
import numpy as np
import polars as pl
from datetime import datetime, timedelta

# Seed NumPy's global RNG so any random draws are repeatable across runs.
# NOTE(review): none of the visible cells draw random numbers — confirm this seed is still needed.
np.random.seed(21)


## 1. Load Silver Orders

Prefer loading from Unity Catalog; fall back to an example dataset when Spark is unavailable.


In [None]:
# `spark` is expected to be predefined by the runtime when one is attached.
# Referencing the bare name raises NameError otherwise, in which case we bind
# it to None so later cells can branch on it safely.
try:
    spark
except NameError:
    spark = None

if not spark:
    # No Spark session: build a small in-memory sample with the same columns
    # the downstream cells read.
    sample_start = datetime(2025, 10, 1, 0, 0)
    sample_columns = {
        "order_id": list(range(1, 11)),
        "customer_id": [1, 1, 2, 2, 3, 4, 4, 5, 5, 5],
        "customer_hash_id": [
            "cust_a1", "cust_a1", "cust_b2", "cust_b2", "cust_c3",
            "cust_d4", "cust_d4", "cust_e5", "cust_e5", "cust_e5",
        ],
        "country": ["DE", "DE", "FR", "FR", "ES", "US", "US", "DE", "DE", "DE"],
        "purchase_value_eur": [120.0, 75.0, 200.0, 180.0, 90.5, 310.0, 95.0, 49.0, 130.0, 160.0],
        "marketing_opt_in": [True, True, True, True, True, False, False, True, True, True],
        # One order per hour, starting at midnight on 2025-10-01.
        "ingestion_timestamp": [sample_start + timedelta(hours=offset) for offset in range(10)],
    }
    silver_orders = pl.DataFrame(sample_columns)
    source = "sample"
else:
    # Spark session present: pull the silver table to the driver and convert
    # every Row (including nested ones) into a plain dict for Polars.
    spark_rows = spark.table("erp_demo.silver_orders").collect()
    silver_orders = pl.DataFrame([record.asDict(recursive=True) for record in spark_rows])
    source = "spark"

print(f"Loaded silver dataset via {source} path. Records: {silver_orders.height}")
silver_orders.head()


## 2. Feature Engineering

We derive the order month (the ingestion timestamp truncated to the first of its month) and a constant per-order count column that the gold aggregations sum into monthly order totals.


In [None]:
# Bucket each order into its calendar month by truncating the ingestion timestamp.
month_bucket_expr = pl.col("ingestion_timestamp").dt.truncate("1mo").alias("order_month")
# Every row is one order; a literal 1 lets the aggregation sum rows into an order total.
unit_order_expr = pl.lit(1).alias("order_count")

# Re-running this cell is safe: with_columns replaces same-named columns.
silver_orders = silver_orders.with_columns([month_bucket_expr, unit_order_expr])

silver_orders.head()


## 3. Build Gold Metrics

Aggregate monthly KPIs aligned with growth tracking: revenue, active customers, order velocity, and marketing engagement.


In [None]:
def _period_over_period_growth(column: str, alias: str) -> pl.Expr:
    """Relative change of ``column`` versus the previous row, rounded to 3 decimals.

    The result is null when the previous row's value is missing (first row) or
    not strictly positive, so a zero or negative base never produces a growth
    figure. The frame must already be sorted chronologically.

    Note: the comparison is against the previous *row*. A calendar month with
    no orders has no row, so growth is then measured against the last month
    that did have orders.
    """
    current = pl.col(column)
    previous = current.shift(1)
    return (
        pl.when(previous > 0)
        .then(((current - previous) / previous).round(3))
        .otherwise(None)
        .alias(alias)
    )


monthly = (
    # `group_by` is the current Polars name; the deprecated `groupby` alias
    # was removed in Polars 1.0 and raises AttributeError there.
    silver_orders.group_by("order_month")
    .agg([
        pl.col("purchase_value_eur").sum().alias("gross_revenue_eur"),
        pl.col("purchase_value_eur").mean().alias("average_order_value_eur"),
        pl.col("order_count").sum().alias("orders"),
        pl.col("customer_hash_id").n_unique().alias("unique_customers"),
        # Count distinct opted-in *customers*, not opted-in order rows, so the
        # numerator has the same unit as `unique_customers` and the rate stays
        # within [0, 1]. A customer counts as opted in if any of their orders
        # in the month carries the flag.
        pl.col("customer_hash_id")
        .filter(pl.col("marketing_opt_in"))
        .n_unique()
        .alias("marketing_opt_in_count"),
    ])
    # Group output order is not guaranteed; sort so the row-shift below is chronological.
    .sort("order_month")
)

monthly = monthly.with_columns([
    (pl.col("marketing_opt_in_count") / pl.col("unique_customers")).round(3).alias("marketing_opt_in_rate"),
    _period_over_period_growth("gross_revenue_eur", "revenue_growth"),
    _period_over_period_growth("orders", "orders_growth"),
])

monthly


## 4. Gold-Layer Quality Checks


In [None]:
# Columns the gold aggregation reads must be present on the silver frame.
required_silver_columns = {"customer_hash_id", "purchase_value_eur"}
assert required_silver_columns <= set(silver_orders.columns)

# Guard against a known direct identifier slipping through from the previous tier.
assert "first_name" not in silver_orders.columns, "PII leakage from previous tier"

# Monthly revenue must never be negative.
revenue_is_non_negative = monthly.get_column("gross_revenue_eur") >= 0
assert revenue_is_non_negative.all(), "Negative revenue detected"

print("Gold checks passed ✅")


## 5. Persist Gold Metrics (optional)

Execute inside Databricks to publish the gold table.


In [None]:
if spark:
    # Spark cannot infer a type for a column whose values are all null, and
    # the growth columns are entirely null whenever only one month is present.
    # An empty frame cannot be inferred either. Declare the schema explicitly
    # instead of relying on inference from dicts.
    gold_spark_types = {
        "order_month": "timestamp",  # NOTE(review): assumes order_month is a Datetime, not a Date — confirm
        "gross_revenue_eur": "double",
        "average_order_value_eur": "double",
        "orders": "bigint",
        "unique_customers": "bigint",
        "marketing_opt_in_count": "bigint",
        "marketing_opt_in_rate": "double",
        "revenue_growth": "double",
        "orders_growth": "double",
    }
    # Cast numeric columns so the Python values handed to Spark (float / int)
    # match the declared Spark types exactly; other columns pass through.
    polars_casts = {"double": pl.Float64, "bigint": pl.Int64}
    gold_export = monthly.select([
        pl.col(name).cast(polars_casts[spark_type]) if spark_type in polars_casts else pl.col(name)
        for name, spark_type in gold_spark_types.items()
    ])
    gold_ddl = ", ".join(f"{name} {spark_type}" for name, spark_type in gold_spark_types.items())

    spark_df_monthly = spark.createDataFrame(gold_export.rows(), schema=gold_ddl)
    spark.sql("CREATE DATABASE IF NOT EXISTS erp_demo")
    spark_df_monthly.write.mode("overwrite").saveAsTable("erp_demo.gold_growth_metrics")
    print("Gold table 'erp_demo.gold_growth_metrics' updated.")
else:
    print("Spark session not available. Export step skipped.")
