### GOLD TRANSFORMATION PIPELINE  
**SILVER → GOLD LAYER (DELTA TABLES)**

---

#### Purpose
This script creates views for the Gold layer in the data warehouse.  
The Gold layer represents the final dimension and fact tables (Star Schema)  
  
Each view performs transformations and combines data from the Silver layer to produce a clean, enriched, and business-ready dataset.

---

#### Actions Performed
- Joins Silver-layer conformed tables to build **dimensions** and **fact** tables
- Applies **surrogate keys** using row numbering
- Filters and cleans data (e.g., removes historical records)
- Writes final **analytics-ready Delta tables**
- Records **audit logs** for load tracking (table name, record counts, status, path)

---

#### Characteristics of GOLD Layer
- Materialized **fact** and **dimension** tables (Star Schema style)
- Denormalized, flattened, **business-friendly** structure
- **Optimized for BI** tools and reporting workloads
- Batch-updated for **consistent snapshots**
- Supports **low-latency queries** for dashboards

---

#### Audit / Lineage
- **Load metadata** is logged as a Delta table (`_gold_audit_logs`)
- Includes table name, target path, load timestamp, record counts, status, and errors (if any)
- Enables end-to-end traceability across Bronze → Silver → Gold

---

#### Parameters
- Paths to **Silver-layer Delta tables** (input)
- Target **GOLD Delta location**

---

#### Example Usage
- Databricks interactive notebook


In [0]:
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType
from pyspark.sql.functions import to_date, to_timestamp
from pyspark.sql import functions as f
from pyspark.sql.window import Window

In [0]:
# Path
silver_path = 'gs://my-bucket-deep/Medallion/silver/'
gold_path = 'gs://my-bucket-deep/Medallion/gold/'

In [0]:
silver_tables = [
    "crm_cust_info",
    "crm_prd_info",
    "crm_sales_details",
    "erp_cust_az12",
    "erp_loc_a101",
    "erp_px_cat_g1v2"
]

silver_dfs = {
    table: spark.read.format('delta').load(f'{silver_path}{table}')
    for table in silver_tables
}

In [0]:
silver_dfs

{'crm_cust_info': DataFrame[cst_id: int, cst_key: string, cst_firstname: string, cst_lastname: string, cst_marital_status: string, cst_gndr: string, cst_create_date: date],
 'crm_prd_info': DataFrame[prd_id: int, cat_id: string, prd_key: string, prd_nm: string, prd_cost: int, prd_line: string, prd_start_dt: date, prd_end_dt: date],
 'crm_sales_details': DataFrame[sls_ord_num: string, sls_prd_key: string, sls_cust_id: int, sls_order_dt: date, sls_ship_dt: date, sls_due_dt: date, sls_sales: int, sls_quantity: int, sls_price: double],
 'erp_cust_az12': DataFrame[cid: string, bdate: date, gen: string],
 'erp_loc_a101': DataFrame[cid: string, cntry: string],
 'erp_px_cat_g1v2': DataFrame[id: string, cat: string, subcat: string, maintenance: string]}

In [0]:
# Gold Source
gold_sources = []

In [0]:
# Utility logger
def log(msg):
    print(f'[{datetime.now().isoformat()}] {msg}')

#### Create Dimension: `gold.dim_customers`

In [0]:
def build_dim_customer():
  log('Building dim_customer')

  df = silver_dfs['crm_cust_info'].alias('ci') \
      .join(silver_dfs['erp_cust_az12'].alias('ca'), f.col('ci.cst_key') == f.col('ca.cid'), 'left') \
      .join(silver_dfs['erp_loc_a101'].alias('cl'), f.col('ci.cst_key') == f.col('cl.cid'), 'left') \
      .select(
            f.col('ci.cst_id').alias('customer_id'),
            f.col('ci.cst_key').alias('customer_number'),
            f.col('ci.cst_firstname').alias('first_name'),
            f.col('ci.cst_lastname').alias('last_name'),
            f.col('ci.cst_marital_status').alias('marital_status'),
            f.col('cl.cntry').alias('country'),
            f.when(f.col('ci.cst_gndr') != 'n/a', f.col('ci.cst_gndr'))
             .otherwise(f.coalesce(f.col('ca.gen'), f.lit('n/a'))).alias('gender'),
            f.col('ca.bdate').alias('birthdate'),
            f.col('ci.cst_create_date').alias('create_date')
      )

  window_customer = Window.orderBy('customer_id')
  df = df.withColumn('customer_key', f.row_number().over(window_customer))    # Surrogate key

  # Reorder columns
  df = df.select(
      'customer_key',      # surrogate key first
      'customer_id',
      'customer_number',
      'first_name',
      'last_name',
      'marital_status',
      'country',
      'gender',
      'birthdate',
      'create_date'
  )

  gold_sources.append({
        "name": "dim_customer",
        "path": f"{gold_path}dim_customer",
        "df": df
  })


#### Create Dimension: `gold.dim_products` 

In [0]:
def build_dim_product():
    log("Building dim_product")
    
    prd_info_active = silver_dfs['crm_prd_info'].filter(f.col('prd_end_dt').isNull())
    
    df = prd_info_active.alias('pi') \
        .join(silver_dfs['erp_px_cat_g1v2'].alias('pc'), f.col('pi.cat_id') == f.col('pc.id'), 'left') \
        .select(
            f.col('pi.prd_id').alias('product_id'),
            f.col('pi.prd_key').alias('product_number'),
            f.col('pi.prd_nm').alias('product_name'),
            f.col('pi.cat_id').alias('category_id'),
            f.col('pc.cat').alias('category'),
            f.col('pc.subcat').alias('sub_category'),
            f.col('pc.maintenance'),
            f.col('pi.prd_cost').alias('product_cost'),
            f.col('pi.prd_line').alias('product_line'),
            f.col('pi.prd_start_dt').alias('product_start_date')
        )
    
    window_product = Window.orderBy('product_start_date', 'product_number')
    df = df.withColumn('product_key', f.row_number().over(window_product))  # Surrogate key

    df = df.select(
        'product_key',  # surrogate key first
        'product_id',
        'product_number',
        'product_name',
        'category_id',
        'category',
        'sub_category',
        'maintenance',
        'product_cost',
        'product_line',
        'product_start_date'
    )
    
    gold_sources.append({
        "name": "dim_product",
        "path": f"{gold_path}dim_product",
        "df": df
    })


#### Create Fact: `gold.fact_sales`

In [0]:
def build_fact_sales():
    log("Reading freshly written dimensions")
    
    dim_customer = spark.read.format('delta').load(f"{gold_path}dim_customer")
    dim_product = spark.read.format('delta').load(f"{gold_path}dim_product")
    
    log("Building fact_sales")
    
    df = silver_dfs['crm_sales_details'].alias('sd') \
        .join(dim_customer.alias('dc'), f.col('sd.sls_cust_id') == f.col('dc.customer_id'), 'left') \
        .join(dim_product.alias('dp'), f.col('sd.sls_prd_key') == f.col('dp.product_number'), 'left') \
        .select(
            f.col('sd.sls_ord_num').alias('order_number'),
            f.col('dc.customer_key'),
            f.col('dp.product_key'),
            f.col('sd.sls_sales').alias('sales_amount'),
            f.col('sd.sls_quantity').alias('quantity'),
            f.col('sd.sls_price').alias('price'),
            f.col('sd.sls_order_dt').alias('order_date'),
            f.col('sd.sls_ship_dt').alias('shipping_date'),
            f.col('sd.sls_due_dt').alias('due_date')
        )
    
    gold_sources.append({
        "name": "fact_sales",
        "path": f"{gold_path}/fact_sales",
        "df": df
    })

##### Add All Transformations

In [0]:
# Step 1: Build Dimensions
build_dim_customer()
build_dim_product()

# Step 2: Write Dimensions immediately (so they can be joined by fact)
for source in gold_sources:
    log(f"Writing dimension {source['name']}")
    source['df'].write.format('delta').mode('overwrite').option('overwriteSchema', 'true').save(source['path'])


[2025-07-05T15:17:07.520859] Building dim_customer
[2025-07-05T15:17:08.304375] Building dim_product
[2025-07-05T15:17:08.579866] Writing dimension dim_customer
[2025-07-05T15:17:38.929810] Writing dimension dim_product


In [0]:
gold_sources.clear()

In [0]:
# Build Fact
build_fact_sales()

[2025-07-05T15:18:38.869203] Reading freshly written dimensions
[2025-07-05T15:18:40.478793] Building fact_sales


In [0]:
def load_to_gold_table(source):
    try:
        log(f"START: Writing {source['name']} to {source['path']}")
        source['df'].write.format('delta').mode('overwrite').option('overwriteSchema', 'true').save(source['path'])

        # Basic check
        count = source['df'].count()
        log(f"SUCCESS: Written {source['name']} with {count} records")
        
        return {
            "table_name": source['name'],
            "status": "SUCCESS",
            "records": count,
            "error": "",
            "loaded_at": datetime.now().isoformat(),
            "target_path": source['path']
        }
    except Exception as e:
        log(f"ERROR writing {source['name']}: {str(e)}")
        return {
            "table_name": source['name'],
            "status": "FAILED",
            "records": 0,
            "error": str(e),
            "loaded_at": datetime.now().isoformat(),
            "target_path": source['path']
        }


In [0]:
# Define Audit Schema
audit_schema = StructType([
    StructField("table_name", StringType(), True),
    StructField("status", StringType(), True),
    StructField("records", IntegerType(), True),
    StructField("error", StringType(), True),
    StructField("loaded_at", StringType(), True),
    StructField("target_path", StringType(), True)
])

In [0]:
# Process all sources and collect audit records
audit_records = []
for source in gold_sources:
    result = load_to_gold_table(source)
    audit_records.append(result)


audit_df = spark.createDataFrame(audit_records, schema=audit_schema)

# Write audit logs
audit_log_path = f"{gold_path}_gold_audit_logs"
audit_df.write.format('delta').mode('append').save(audit_log_path)

log(f"SUCCESS: Gold layer audit logs written to {audit_log_path}")

[2025-07-05T15:22:51.196683] START: Writing fact_sales to gs://my-bucket-deep/Medallion/gold//fact_sales
[2025-07-05T15:23:05.253155] SUCCESS: Written fact_sales with 60398 records
[2025-07-05T15:23:11.942043] SUCCESS: Gold layer audit logs written to gs://my-bucket-deep/Medallion/gold/_gold_audit_logs


In [0]:
audit_df.display()

table_name,status,records,error,loaded_at,target_path
fact_sales,SUCCESS,60398,,2025-07-05T15:23:05.253298,gs://my-bucket-deep/Medallion/gold//fact_sales
