# Silver ➜ Gold Metrics Aggregation

This notebook builds gold-layer growth metrics from the GDPR-compliant silver orders dataset.


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Seed NumPy's legacy global random generator so that any random draws made
# through np.random are repeatable across "Restart & Run All".
# NOTE(review): no cell visible in this notebook draws random numbers — confirm
# whether the seed is still needed.
np.random.seed(21)


## 1. Load Silver Orders

Prefer loading the `erp_demo.silver_orders` table from Unity Catalog through the Spark session; fall back to a small built-in sample dataset when Spark is unavailable.


In [None]:
# Detect whether a Spark session has been injected into the notebook namespace
# (as Databricks does). Outside such an environment the bare name lookup raises
# NameError, which we translate into `spark = None` so later cells can test it.
try:
    spark
except NameError:
    spark = None

if spark:
    # Pull the whole silver table to the driver as a pandas DataFrame.
    silver_orders = spark.table("erp_demo.silver_orders").toPandas()
    source = "spark"
else:
    # Small deterministic sample with the same columns the downstream cells use.
    silver_orders = pd.DataFrame({
        "order_id": range(1, 11),
        "customer_id": [1, 1, 2, 2, 3, 4, 4, 5, 5, 5],
        "customer_hash_id": [
            "cust_a1", "cust_a1", "cust_b2", "cust_b2", "cust_c3",
            "cust_d4", "cust_d4", "cust_e5", "cust_e5", "cust_e5"
        ],
        "country": ["DE", "DE", "FR", "FR", "ES", "US", "US", "DE", "DE", "DE"],
        "purchase_value_eur": [120.0, 75.0, 200.0, 180.0, 90.5, 310.0, 95.0, 49.0, 130.0, 160.0],
        "marketing_opt_in": [True, True, True, True, True, False, False, True, True, True],
        # One timestamp per hour starting 2025-10-01 00:00. An explicit Timedelta
        # is used instead of the "H" string alias, which recent pandas versions
        # deprecate in favour of lowercase "h"; the Timedelta form is
        # version-independent and yields the same values.
        "ingestion_timestamp": pd.date_range(
            "2025-10-01", periods=10, freq=pd.Timedelta(hours=1)
        ),
    })
    source = "sample"

print(f"Loaded silver dataset via {source} path. Records: {len(silver_orders)}")
silver_orders.head()


## 2. Feature Engineering

We derive the order month (the ingestion timestamp truncated to the first day of its month) and a per-row order counter of 1, which the gold aggregation later sums into an order count.


In [None]:
# Bucket every row by calendar month: convert the ingestion timestamp to a
# monthly period, then back to a timestamp at the start of that period.
ingestion_period = silver_orders["ingestion_timestamp"].dt.to_period("M")
silver_orders["order_month"] = ingestion_period.dt.to_timestamp()

# Each row contributes exactly one order; summing this column later yields
# the number of orders per group.
silver_orders["order_count"] = 1

silver_orders.head()


## 3. Build Gold Metrics

Aggregate monthly KPIs aligned with growth tracking: revenue, active customers, order velocity, and marketing engagement.


In [None]:
# Monthly gold KPIs, one row per `order_month`.
#
# Named aggregation is used so every output column is tied to its source
# column and function by name rather than by position in a flattened MultiIndex.
#
# `marketing_opt_in_count` is the number of DISTINCT customers with
# marketing_opt_in == True in the month. Summing the flag over order rows would
# count a customer once per order and, divided by `unique_customers`, produce a
# "rate" above 1 for repeat buyers.
monthly = (
    silver_orders
    .assign(
        # Customer id where the row is opted in, missing otherwise; `.eq(True)`
        # treats missing opt-in values as "not opted in", and `nunique` below
        # ignores the missing entries.
        opted_in_customer=lambda df: df["customer_hash_id"].where(
            df["marketing_opt_in"].eq(True)
        )
    )
    .groupby("order_month")
    .agg(
        gross_revenue_eur=("purchase_value_eur", "sum"),
        average_order_value_eur=("purchase_value_eur", "mean"),
        orders=("order_count", "sum"),
        unique_customers=("customer_hash_id", "nunique"),
        marketing_opt_in_count=("opted_in_customer", "nunique"),
    )
    .reset_index()
)

# Share of the month's distinct customers who are opted in to marketing (0..1).
monthly["marketing_opt_in_rate"] = (
    monthly["marketing_opt_in_count"] / monthly["unique_customers"]
).round(3)

# Month-over-month relative change. groupby sorts by `order_month`, so rows are
# in chronological order; the first month has no predecessor and stays NaN.
# A previous value of 0 would give +/-inf, which is mapped to NaN.
monthly["revenue_growth"] = (
    monthly["gross_revenue_eur"].pct_change().replace([np.inf, -np.inf], np.nan).round(3)
)
monthly["orders_growth"] = (
    monthly["orders"].pct_change().replace([np.inf, -np.inf], np.nan).round(3)
)

monthly


## 4. Gold-Layer Quality Checks


In [None]:
# Schema guard: the silver frame must carry the pseudonymised customer key and
# the revenue column.
assert set(silver_orders.columns) >= {"customer_hash_id", "purchase_value_eur"}

# Privacy guard: a raw first-name column must not be present in the silver frame.
assert "first_name" not in set(silver_orders.columns), "PII leakage from previous tier"

# Sanity guard: monthly gross revenue must be non-negative in every row.
assert (monthly["gross_revenue_eur"] >= 0).all(), "Negative revenue detected"

print("Gold checks passed ✅")


## 5. Persist Gold Metrics (optional)

Execute inside Databricks to publish the gold table.


In [None]:
# Publishing requires a Spark session; without one the export is skipped.
if not spark:
    print("Spark session not available. Export step skipped.")
else:
    # Convert the pandas gold frame to a Spark DataFrame.
    spark_df_monthly = spark.createDataFrame(monthly)
    # Make sure the target database exists before writing into it.
    spark.sql("CREATE DATABASE IF NOT EXISTS erp_demo")
    # Overwrite mode replaces the table's existing contents with this run's output.
    (
        spark_df_monthly.write
        .mode("overwrite")
        .saveAsTable("erp_demo.gold_growth_metrics")
    )
    print("Gold table 'erp_demo.gold_growth_metrics' updated.")
