# Bronze ➜ Silver GDPR Compliance

This didactic notebook demonstrates how to enforce GDPR principles while transforming raw bronze-layer ERP data into a curated silver dataset.


In [None]:
import hashlib
import json
from datetime import datetime, timezone

import numpy as np
import pandas as pd

# Seed NumPy's global random generator so the np.random draws used to simulate
# data in this notebook produce the same values on every run.
np.random.seed(7)


## 1. Simulate Bronze Data

The bronze layer mirrors raw ingestion with personally identifiable information (PII) and consent metadata.


In [None]:
def _simulate_bronze_orders() -> pd.DataFrame:
    """Build the ten-row synthetic bronze order table (PII plus consent flags).

    The random draws are issued in column order, so a seeded NumPy global
    generator yields the same values as building the frame from one dict
    literal.
    """
    row_count = 10
    cycles = 2  # five synthetic customers, each appearing on two orders

    given_names = ["Anna", "Ben", "Chris", "Diana", "Eva"]
    family_names = ["Lee", "Patel", "Garcia", "Chen", "Smith"]
    addresses = [
        "anna.lee@example.com",
        "ben.patel@example.com",
        "chris.garcia@example.com",
        "diana.chen@example.com",
        "eva.smith@example.com",
    ]
    countries = ["DE", "FR", "ES", "US", "DE"]

    columns = {}
    columns["order_id"] = range(1, row_count + 1)
    columns["customer_id"] = np.random.randint(1, 6, size=row_count)
    columns["first_name"] = given_names * cycles
    columns["last_name"] = family_names * cycles
    columns["email"] = addresses * cycles
    columns["country"] = countries * cycles
    columns["purchase_value_eur"] = np.round(
        np.random.uniform(49, 499, size=row_count), 2
    )
    columns["consent_marketing"] = np.random.choice(
        [True, False], size=row_count, p=[0.7, 0.3]
    )
    columns["consent_data_processing"] = np.random.choice(
        [True, False], size=row_count, p=[0.9, 0.1]
    )

    # Ingestion times fall within the first hour of 2025-10-01.
    second_offsets = np.random.randint(0, 3600, size=row_count)
    columns["ingestion_timestamp"] = pd.Timestamp("2025-10-01") + pd.to_timedelta(
        second_offsets, unit="s"
    )

    return pd.DataFrame(columns)


bronze_orders = _simulate_bronze_orders()

# Last expression of the cell: renders the bronze table in the notebook.
bronze_orders


## 2. Define GDPR Controls

We enforce:
- **Lawful basis:** rows where `consent_data_processing` is not granted are removed.
- **Data minimisation:** drop direct identifiers (`first_name`, `last_name`, `email`).
- **Pseudonymisation:** hash emails to deterministic surrogate keys.
- **Purpose limitation:** keep marketing consent only for explicitly opted-in EU residents.
- **Auditability:** log all actions in a governance manifest.


In [None]:
# ISO 3166-1 alpha-2 codes of all 27 EU member states. The marketing opt-in
# rule treats every country outside this set as non-EU, so a partial list
# would silently discard valid opt-ins (for example from AT or PL).
EU_COUNTRIES = {
    "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI",
    "FR", "DE", "GR", "HU", "IE", "IT", "LV", "LT", "LU",
    "MT", "NL", "PL", "PT", "RO", "SK", "SI", "ES", "SE",
}


def hash_email(email: str, *, salt: str = "erp_demo_salt") -> str:
    """Return a deterministic pseudonymous customer key derived from an email.

    The address is normalised (surrounding whitespace removed, lower-cased) so
    that cosmetic differences in raw data map to the same key, then hashed
    with SHA-256 together with ``salt``.

    Args:
        email: Email address to pseudonymise.
        salt: String prepended to the normalised address before hashing.
            Keyword-only; the default is the fixed demo salt.

    Returns:
        ``"cust_"`` followed by the first 18 hex characters of the digest.
    """
    normalised = email.strip().lower()
    digest = hashlib.sha256((salt + normalised).encode("utf-8")).hexdigest()
    return f"cust_{digest[:18]}"


def _audit_entry(action: str, details: dict) -> dict:
    """Build one audit-log record stamped with the current UTC time."""
    return {
        # Timezone-aware timestamp: the ISO string carries an explicit +00:00
        # offset, and the deprecated datetime.utcnow() is avoided.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "action": action,
        "details": details,
    }


def _is_true(flags: pd.Series) -> pd.Series:
    """Return a plain boolean mask that is True only where ``flags`` is True."""
    # Missing values (None/NaN/pd.NA) compare as not-True, so absent consent
    # is treated as consent not given rather than breaking the boolean mask.
    return flags.eq(True).fillna(False).astype(bool)


def gdpr_transform(bronze_df: pd.DataFrame) -> tuple[pd.DataFrame, list[dict]]:
    """Apply the GDPR controls to bronze orders and return the silver result.

    Steps, each recorded in the audit log:
      1. Keep only rows whose ``consent_data_processing`` flag is True; rows
         with a False or missing flag are dropped.
      2. Derive ``customer_hash_id`` from ``email`` via ``hash_email``.
      3. Set ``marketing_opt_in`` to True only for rows whose ``country`` is
         in ``EU_COUNTRIES`` and whose ``consent_marketing`` flag is True.
      4. Drop the direct identifiers and the raw consent columns.

    Args:
        bronze_df: Raw orders containing at least the columns ``order_id``,
            ``customer_id``, ``first_name``, ``last_name``, ``email``,
            ``country``, ``purchase_value_eur``, ``consent_marketing``,
            ``consent_data_processing`` and ``ingestion_timestamp``. The
            frame is not modified.

    Returns:
        A tuple ``(silver_df, log)``: the curated frame sorted by
        ``order_id`` with a fresh integer index, and the list of audit
        records (``timestamp``, ``action``, ``details``) in execution order.

    Raises:
        KeyError: If a required column is absent from ``bronze_df``.
    """
    audit_log: list[dict] = []

    has_processing_consent = _is_true(bronze_df["consent_data_processing"])
    # Copy so the derived columns below never write into the caller's frame.
    filtered = bronze_df.loc[has_processing_consent].copy()
    audit_log.append(_audit_entry(
        "filter_consent",
        {
            "dropped_rows": int(len(bronze_df) - len(filtered)),
            "policy": "Only keep records with consent_data_processing=True",
        },
    ))

    filtered["customer_hash_id"] = filtered["email"].apply(hash_email)
    audit_log.append(_audit_entry(
        "pseudonymise_email",
        {"method": "SHA256 with static salt", "column": "email"},
    ))

    # The conjunction is already a boolean mask; no np.where wrapper needed.
    filtered["marketing_opt_in"] = (
        filtered["country"].isin(EU_COUNTRIES)
        & _is_true(filtered["consent_marketing"])
    )
    audit_log.append(_audit_entry(
        "purpose_limited_marketing_opt_in",
        {"rule": "EU residents must opt-in"},
    ))

    columns_to_drop = [
        "first_name",
        "last_name",
        "email",
        "consent_marketing",
        "consent_data_processing",
    ]
    silver_df = filtered.drop(columns=columns_to_drop)
    audit_log.append(_audit_entry(
        "drop_direct_identifiers",
        {"dropped_columns": columns_to_drop},
    ))

    ordered_cols = [
        "order_id",
        "customer_id",
        "customer_hash_id",
        "country",
        "purchase_value_eur",
        "marketing_opt_in",
        "ingestion_timestamp",
    ]
    silver_df = silver_df[ordered_cols].sort_values("order_id").reset_index(drop=True)

    return silver_df, audit_log



## 3. Transform & Audit


In [None]:
# Apply the GDPR pipeline to the bronze frame, keeping both the curated
# silver table and the audit trail it produced.
silver_orders, gdpr_log = gdpr_transform(bronze_df=bronze_orders)

# Last expression of the cell: renders the silver table in the notebook.
silver_orders


In [None]:
# Render the audit manifest as indented JSON for human review.
manifest_json = json.dumps(gdpr_log, indent=2)
print(manifest_json)


## 4. Compliance Checks


In [None]:
# Direct identifiers that must never appear in the silver dataset.
PII_COLUMNS = {"first_name", "last_name", "email"}

# Explicit raises instead of `assert` statements: asserts are removed when
# Python runs with -O, which would silently disable these compliance checks.
# The exception type and messages are unchanged.
if not PII_COLUMNS.isdisjoint(silver_orders.columns):
    raise AssertionError("PII columns leaked into silver dataset")

if not silver_orders["customer_hash_id"].str.startswith("cust_").all():
    raise AssertionError("Unexpected hash prefix")

if not silver_orders["purchase_value_eur"].ge(0).all():
    raise AssertionError("Negative purchase values")

print("GDPR checks passed ✅")


## 5. Persist Silver Layer (optional)

Run the next cell inside Databricks to materialize the silver table; when no `spark` session is defined, the cell prints a notice and skips the write.


In [None]:
# Referencing the bare name `spark` raises NameError when no such name is
# defined, which is how this cell detects that no Spark session is available.
try:
    spark
except NameError:
    print("Spark session not available. Skipping silver table write.")
else:
    # Convert the pandas silver frame into a Spark DataFrame.
    spark_df_silver_orders = spark.createDataFrame(silver_orders)
    # Make sure the target database exists before writing into it.
    spark.sql("CREATE DATABASE IF NOT EXISTS erp_demo")
    # Save as table erp_demo.silver_orders using write mode "overwrite".
    spark_df_silver_orders.write.mode("overwrite").saveAsTable("erp_demo.silver_orders")
    print("Silver table 'erp_demo.silver_orders' updated.")
