# Bronze ➜ Silver GDPR Compliance

This didactic notebook demonstrates how to enforce GDPR principles while transforming raw bronze-layer ERP data into a curated silver dataset.


In [None]:
import hashlib
import json
from datetime import datetime, timedelta, timezone

import numpy as np
from pyspark.sql import functions as F

# Seed NumPy's global random generator so the simulated data drawn later in the
# notebook is repeatable under Restart & Run All.
np.random.seed(7)

# Reuse the `spark` name if it is already defined in this namespace (presumably
# injected by the hosting platform, e.g. Databricks — confirm for your runtime).
# Evaluating the bare name raises NameError when it is absent, in which case a
# session is created here.
try:
    spark
except NameError:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName("bronze_to_silver_gdpr").getOrCreate()


## 1. Simulate Bronze Data

The bronze layer mirrors raw ingestion with personally identifiable information (PII) and consent metadata.


In [None]:
# Re-seed with the same value as the setup cell so that re-running this cell on
# its own regenerates exactly the same bronze rows.
np.random.seed(7)

# Customer master data. Every order looks its customer up here, so customer_id,
# name, email and country always describe the same person. The position in the
# list matches the `order_id % 5` rotation used to assign customers to orders.
CUSTOMERS = [
    {"customer_id": 1, "first_name": "Anna", "last_name": "Lee", "email": "anna.lee@example.com", "country": "DE"},
    {"customer_id": 2, "first_name": "Ben", "last_name": "Patel", "email": "ben.patel@example.com", "country": "FR"},
    {"customer_id": 3, "first_name": "Chris", "last_name": "Garcia", "email": "chris.garcia@example.com", "country": "ES"},
    {"customer_id": 4, "first_name": "Diana", "last_name": "Chen", "email": "diana.chen@example.com", "country": "US"},
    {"customer_id": 5, "first_name": "Eva", "last_name": "Smith", "email": "eva.smith@example.com", "country": "DE"},
]

bronze_rows = []
for order_id in range(1, 11):
    # Rotate through the customers so each one places two orders.
    customer = CUSTOMERS[order_id % len(CUSTOMERS)]

    purchase_value = float(np.round(np.random.uniform(49, 499), 2))
    # Cast to a builtin float (and round to cents like purchase_value): NumPy
    # scalars such as numpy.float64 are rejected by schema inference in older
    # PySpark releases ("not supported type: <class 'numpy.float64'>").
    service_cogs = float(np.round(purchase_value * np.random.uniform(0.25, 0.4), 2))

    bronze_rows.append({
        "order_id": order_id,
        "customer_id": customer["customer_id"],
        "first_name": customer["first_name"],
        "last_name": customer["last_name"],
        "email": customer["email"],
        "country": customer["country"],
        "purchase_value_eur": purchase_value,
        "service_cogs_eur": service_cogs,
        # bool()/int() below likewise convert NumPy scalars to builtin types.
        "consent_marketing": bool(np.random.choice([True, False], p=[0.7, 0.3])),
        "consent_data_processing": bool(np.random.choice([True, False], p=[0.9, 0.1])),
        "ingestion_timestamp": datetime(2025, 10, 1) + timedelta(seconds=int(np.random.randint(0, 3600))),
    })

bronze_orders_df = spark.createDataFrame(bronze_rows)
bronze_orders_df.show(truncate=False)


## 2. Define GDPR Controls

We enforce:
- **Lawful basis:** rows whose `consent_data_processing` is not `True` (consent declined or not recorded) are removed.
- **Data minimisation:** drop direct identifiers (`first_name`, `last_name`, `email`).
- **Pseudonymisation:** hash lower-cased emails with a salted SHA-256 and keep a truncated digest as a deterministic surrogate key (`customer_hash_id`).
- **Purpose limitation:** keep marketing consent only for explicitly opted-in EU residents.
- **Auditability:** log all actions in a governance manifest.


In [None]:
# ISO 3166-1 alpha-2 codes of all 27 EU member states (Greece is "GR" in ISO,
# although EU institutions often write "EL"). The marketing opt-in rule treats
# any country outside this set as non-EU, so the list must be complete.
EU_COUNTRIES = {
    "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI",
    "FR", "DE", "GR", "HU", "IE", "IT", "LV", "LT", "LU",
    "MT", "NL", "PL", "PT", "RO", "SK", "SI", "ES", "SE",
}

# Static salt mixed into the email hash. Demo value only: in a real pipeline the
# salt is a secret and should come from a secrets manager, not from source code.
SALT = "erp_demo_salt"

def record_gdpr_action(action, details):
    """Append one timestamped entry to the GDPR governance manifest (`gdpr_log`).

    Args:
        action: Short machine-readable name of the control that was applied.
        details: JSON-serialisable dict describing what the control did.
    """
    gdpr_log.append({
        # Timezone-aware UTC timestamp: the ISO string carries an explicit
        # "+00:00" offset, and the deprecated datetime.utcnow() is avoided.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "action": action,
        "details": details,
    })


# Start from an empty manifest so re-running this cell does not duplicate entries.
gdpr_log = []

# --- Lawful basis ------------------------------------------------------------
# Filtering on the boolean column keeps only rows where it is True; rows with
# False or NULL consent are both removed.
initial_count = bronze_orders_df.count()
filtered_df = bronze_orders_df.filter(F.col("consent_data_processing"))
filtered_count = filtered_df.count()

record_gdpr_action(
    "filter_consent",
    {
        "dropped_rows": int(initial_count - filtered_count),
        "policy": "Only keep records with consent_data_processing=True",
    },
)

# --- Pseudonymisation and purpose limitation ----------------------------------
silver_df = (
    filtered_df
    .withColumn(
        "customer_hash_id",
        # "cust_" + first 18 hex characters of SHA-256(salt + lower-cased email).
        F.concat(
            F.lit("cust_"),
            F.substring(F.sha2(F.concat(F.lit(SALT), F.lower(F.col("email"))), 256), 1, 18),
        ),
    )
    .withColumn(
        "marketing_opt_in",
        # True only for EU residents who opted in; everything else, including
        # NULL country or consent, falls through to False.
        F.when(
            # sorted() gives isin() a deterministic argument order.
            F.col("country").isin(sorted(EU_COUNTRIES)) & F.col("consent_marketing"),
            F.lit(True),
        ).otherwise(F.lit(False)),
    )
)

record_gdpr_action(
    "pseudonymise_email",
    {"method": "SHA256 with static salt", "column": "email"},
)
record_gdpr_action(
    "purpose_limited_marketing_opt_in",
    {"rule": "EU residents must opt-in"},
)

# --- Data minimisation ---------------------------------------------------------
# Direct identifiers and the raw consent flags do not travel to the silver layer.
columns_to_drop = ["first_name", "last_name", "email", "consent_marketing", "consent_data_processing"]
silver_orders_df = (
    silver_df
    .drop(*columns_to_drop)
    .select(
        "order_id",
        "customer_id",
        "customer_hash_id",
        "country",
        "purchase_value_eur",
        "service_cogs_eur",
        "marketing_opt_in",
        "ingestion_timestamp",
    )
    .orderBy("order_id")
    .cache()  # reused by the count below and by the later display/check/write cells
)

record_gdpr_action("drop_direct_identifiers", {"dropped_columns": columns_to_drop})
record_gdpr_action("finalise_silver", {"records": int(silver_orders_df.count())})

silver_orders_df.show(truncate=False)


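## 3. Transform & Audit

The transformation itself ran in the cell above; this section inspects its results: the curated silver rows, followed by the governance manifest (`gdpr_log`) that records every control applied.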


In [None]:
# Display the curated silver rows sorted by order_id, without truncating cell values.
silver_orders_df.sort("order_id").show(truncate=False)


In [None]:
# Render the governance manifest as indented JSON so each logged control is easy to read.
audit_manifest_json = json.dumps(gdpr_log, indent=2)
print(audit_manifest_json)


## 4. Compliance Checks


In [None]:
# Direct identifiers that must never appear in the silver layer.
PII_COLUMNS = {"first_name", "last_name", "email"}

assert PII_COLUMNS.isdisjoint(set(silver_orders_df.columns)), "PII columns leaked into silver dataset"

# A NULL key (e.g. produced from a NULL email) would slip past the prefix check
# below, because `~NULL` evaluates to NULL and the row is filtered out of the
# violation count. Check for NULLs explicitly first.
assert silver_orders_df.filter(F.col("customer_hash_id").isNull()).count() == 0, "Missing pseudonymous customer key"
assert silver_orders_df.filter(~F.col("customer_hash_id").startswith("cust_")).count() == 0, "Unexpected hash prefix"
assert silver_orders_df.filter(F.col("purchase_value_eur") < 0).count() == 0, "Negative purchase values"

print("GDPR checks passed ✅")


## 5. Persist Silver Layer (optional)

Run the next cell inside Databricks to materialize the silver table used downstream in the growth pipeline.


In [None]:
# Probe for a `spark` name in this namespace; the write only happens when it exists.
try:
    spark
except NameError:
    spark_available = False
else:
    spark_available = True

if spark_available:
    # Create the target database if needed, then replace the table contents
    # with the current silver rows.
    spark.sql("CREATE DATABASE IF NOT EXISTS erp_demo")
    silver_orders_df.write.mode("overwrite").saveAsTable("erp_demo.silver_orders")
    print("Silver table 'erp_demo.silver_orders' updated.")
else:
    print("Spark session not available. Skipping silver table write.")
