# Silver Layer Transformation - Enterprise Data Platform

## Overview
This notebook transforms Bronze Delta tables into the Silver layer, applying:
- Data quality improvements
- Data type standardization
- Deduplication
- Conformance to business rules

**Prerequisites:**
- Bronze Delta tables created (run 01_ingest_to_bronze.ipynb first)

**Output:**
- Silver Delta tables with cleaned, conformed data

In [None]:
# Import required libraries
# NOTE: the wildcard import of pyspark.sql.functions supplies the unqualified
# helpers used by the cells below (col, desc, row_number, trim, upper, when, ...).
# It also rebinds several Python builtins (e.g. sum, min, max, round) to their
# Spark column-function counterparts for the rest of the session.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from datetime import datetime

# Log when the run started; datetime.now() without a tz argument is a naive
# local timestamp.
print(f"Silver Transformation Started: {datetime.now()}")

## Configuration

In [None]:
# Configuration
# Prefix prepended to the name of every table written to the Silver layer.
SILVER_PREFIX = "Silver_"

# Bronze tables in scope for this notebook: dimensions first, then facts.
TABLES_TO_TRANSFORM = [
    "DimCustomer", "DimProduct", "DimEmployee",
    "FactSales", "FactSupport", "FactAttrition",
]

print(f"Tables to transform: {len(TABLES_TO_TRANSFORM)}")

## Transformation Functions

In [None]:
def transform_dim_customer(bronze_df):
    """Build the Silver version of DimCustomer from its Bronze DataFrame.

    Steps, in order:
      1. Keep one row per ``customer_id``: the row with the greatest
         ``_ingestion_timestamp``.
      2. Trim ``customer_name``; upper-case ``country`` and ``region``.
      3. Add ``customer_age_days``, the number of days from
         ``customer_since`` to the current date.

    Args:
        bronze_df: Bronze DimCustomer DataFrame. The code reads the columns
            ``customer_id``, ``_ingestion_timestamp``, ``customer_name``,
            ``country``, ``region`` and ``customer_since``.

    Returns:
        The deduplicated, standardized DataFrame with ``customer_age_days``
        appended.
    """
    # Number each customer's rows newest-first; row 1 is the one to keep.
    newest_first = Window.partitionBy("customer_id").orderBy(
        desc("_ingestion_timestamp")
    )
    ranked = bronze_df.withColumn("row_num", row_number().over(newest_first))
    result = ranked.where(col("row_num") == 1).drop("row_num")

    # Text clean-up: strip surrounding whitespace from the name and
    # upper-case the geography columns.
    result = result.withColumn("customer_name", trim(col("customer_name")))
    for text_col in ("country", "region"):
        result = result.withColumn(text_col, upper(col(text_col)))

    # Derived field: days elapsed between customer_since and today.
    result = result.withColumn(
        "customer_age_days",
        datediff(current_date(), col("customer_since")),
    )

    # Inactive customers are kept. Optional filter, currently disabled:
    # result = result.filter(col("is_active") == True)

    return result

def transform_fact_sales(bronze_df):
    """Build the Silver version of FactSales from its Bronze DataFrame.

    Rows are kept only when ``net_amount >= 0`` and ``quantity > 0``. Rows
    where either column is NULL are dropped as well, because the comparison
    does not evaluate to true for them. Surviving rows get:

      * ``status`` rewritten with ``initcap``;
      * ``margin_percent`` = ``gross_margin / net_amount * 100`` when
        ``net_amount > 0``, otherwise ``0``;
      * ``is_same_day_delivery`` = ``order_date_id == delivery_date_id``.

    Args:
        bronze_df: Bronze FactSales DataFrame. The code reads the columns
            ``net_amount``, ``quantity``, ``status``, ``gross_margin``,
            ``order_date_id`` and ``delivery_date_id``.

    Returns:
        The filtered DataFrame with the standardized ``status`` and the two
        derived columns.
    """
    # Data-quality gate: no negative amounts, strictly positive quantities.
    passes_quality_gate = (col("net_amount") >= 0) & (col("quantity") > 0)
    valid_sales = bronze_df.where(passes_quality_gate)

    # Guard the division: after the gate above, the only non-positive
    # net_amount left is exactly 0, which maps to a margin of 0.
    margin_percent = when(
        col("net_amount") > 0,
        col("gross_margin") / col("net_amount") * 100,
    ).otherwise(0)

    # True when the order and delivery date keys are identical.
    delivered_same_day = col("order_date_id") == col("delivery_date_id")

    return (
        valid_sales
        .withColumn("status", initcap(col("status")))
        .withColumn("margin_percent", margin_percent)
        .withColumn("is_same_day_delivery", delivered_same_day)
    )

def transform_generic(bronze_df, table_name):
    """Deduplicate a Bronze DataFrame on its first ``*_id`` column.

    The key is the first column, in DataFrame column order, whose name ends
    with ``_id``. For each key value only the row with the greatest
    ``_ingestion_timestamp`` is kept. When no column name ends with ``_id``
    the input is returned untouched.

    Args:
        bronze_df: Bronze DataFrame to clean. When a key column is found,
            the ``_ingestion_timestamp`` column is also read.
        table_name: Name of the table being processed. Accepted but not
            used by the function body.

    Returns:
        The deduplicated DataFrame, or ``bronze_df`` itself when there is no
        ``*_id`` column.
    """
    # Heuristic key detection: first column name with the "_id" suffix.
    key_col = next(
        (name for name in bronze_df.columns if name.endswith("_id")),
        None,
    )
    if key_col is None:
        return bronze_df

    # Number rows newest-first within each key and keep only row 1.
    newest_first = Window.partitionBy(key_col).orderBy(desc("_ingestion_timestamp"))
    ranked = bronze_df.withColumn("row_num", row_number().over(newest_first))
    return ranked.filter(col("row_num") == 1).drop("row_num")

## Transform Dimensions

In [None]:
print("\n" + "="*80)
print("STEP 1: Transforming Dimension Tables")
print("="*80)

# Transform DimCustomer with its dedicated transformation.
# Any failure is reported and the notebook continues with the next table.
try:
    print("\nTransforming DimCustomer...")
    bronze_customer = spark.table("DimCustomer")
    silver_customer = transform_dim_customer(bronze_customer)

    # Row counts before and after, used to report how many rows were removed.
    bronze_count = bronze_customer.count()
    silver_count = silver_customer.count()

    # transform_dim_customer adds a derived column, so the Silver schema can
    # differ from a table left behind by an earlier run. overwriteSchema lets
    # the overwrite replace that schema instead of failing on a mismatch
    # (same option the FactSales write uses).
    silver_customer.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(f"{SILVER_PREFIX}DimCustomer")

    print(f"✅ Silver_DimCustomer created: {bronze_count:,} → {silver_count:,} rows")
    if bronze_count > silver_count:
        print(f"   Removed {bronze_count - silver_count:,} duplicate records")
except Exception as e:
    print(f"❌ Error transforming DimCustomer: {str(e)}")

# Transform the remaining dimensions with the generic transformation
# (deduplication only). A table that cannot be read or written is skipped.
for table in ["DimProduct", "DimEmployee"]:
    try:
        print(f"\nTransforming {table}...")
        bronze_df = spark.table(table)
        silver_df = transform_generic(bronze_df, table)

        bronze_count = bronze_df.count()
        silver_count = silver_df.count()

        silver_df.write.format("delta") \
            .mode("overwrite") \
            .saveAsTable(f"{SILVER_PREFIX}{table}")

        print(f"✅ {SILVER_PREFIX}{table} created: {bronze_count:,} → {silver_count:,} rows")
    except Exception as e:
        print(f"⏭️  Skipping {table}: {str(e)}")

## Transform Fact Tables

In [None]:
print("\n" + "=" * 80)
print("STEP 2: Transforming Fact Tables")
print("=" * 80)

# FactSales goes through its dedicated transformation. A failure is reported
# and the notebook moves on to the remaining fact tables.
try:
    print("\nTransforming FactSales...")
    bronze_sales = spark.table("FactSales")
    silver_sales = transform_fact_sales(bronze_sales)

    # Counts before and after the quality filter, for the report below.
    bronze_count = bronze_sales.count()
    silver_count = silver_sales.count()

    # Replace the Silver table, schema included.
    (
        silver_sales.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{SILVER_PREFIX}FactSales")
    )

    print(f"✅ Silver_FactSales created: {bronze_count:,} → {silver_count:,} rows")
    if silver_count < bronze_count:
        print(f"   Filtered out {bronze_count - silver_count:,} invalid records")
except Exception as e:
    print(f"⏭️  Skipping FactSales: {str(e)}")

# The other fact tables only get the generic deduplication. A table that
# cannot be read or written is skipped.
for table in ("FactSupport", "FactAttrition", "FactInventory"):
    try:
        print(f"\nTransforming {table}...")
        bronze_df = spark.table(table)
        silver_df = transform_generic(bronze_df, table)

        silver_count = silver_df.count()

        (
            silver_df.write.format("delta")
            .mode("overwrite")
            .saveAsTable(f"{SILVER_PREFIX}{table}")
        )

        print(f"✅ {SILVER_PREFIX}{table} created: {silver_count:,} rows")
    except Exception as e:
        print(f"⏭️  Skipping {table}: {str(e)}")

## Data Quality Validation

In [None]:
print("\n" + "=" * 80)
print("STEP 3: Data Quality Validation")
print("=" * 80)

# --- Silver_DimCustomer checks ---
# Any failure (for example the table was never written) skips this section.
try:
    df = spark.table(f"{SILVER_PREFIX}DimCustomer")

    # NULL count for each key column.
    null_checks = {
        field: df.filter(col(field).isNull()).count()
        for field in ("customer_id", "customer_name")
    }

    print("\nSilver_DimCustomer Quality Checks:")
    for field, null_count in null_checks.items():
        if null_count == 0:
            status = "✅"
        else:
            status = "⚠️"
        print(f"  {status} {field}: {null_count} nulls")

    # customer_id is unique when the distinct count equals the row count.
    distinct_count = df.select("customer_id").distinct().count()
    total_count = df.count()
    if distinct_count != total_count:
        print(f"  ⚠️  Duplicate customer_ids found: {total_count - distinct_count}")
    else:
        print(f"  ✅ Primary key uniqueness: OK ({distinct_count:,} unique customers)")

except Exception as e:
    print(f"⏭️  Skipping validation: {str(e)}")

# --- Silver_FactSales checks ---
try:
    df = spark.table(f"{SILVER_PREFIX}FactSales")

    print("\nSilver_FactSales Quality Checks:")

    # No negative net_amount values should remain.
    negative_count = df.filter(col("net_amount") < 0).count()
    if negative_count == 0:
        status = "✅"
    else:
        status = "⚠️"
    print(f"  {status} Negative amounts: {negative_count}")

    # No zero or negative quantities should remain.
    zero_qty = df.filter(col("quantity") <= 0).count()
    if zero_qty == 0:
        status = "✅"
    else:
        status = "⚠️"
    print(f"  {status} Zero/negative quantities: {zero_qty}")

    # Show the ten most frequent status values for a visual check.
    print(f"\n  Status distribution:")
    status_dist = (
        df.groupBy("status")
        .count()
        .orderBy(desc("count"))
    )
    status_dist.show(10, truncate=False)

except Exception as e:
    print(f"⏭️  Skipping validation: {str(e)}")

## Summary

In [None]:
# List all Silver tables
print("\n" + "="*80)
print("SILVER TRANSFORMATION COMPLETE")
print("="*80)

# Match the configured prefix case-insensitively. SILVER_PREFIX is "Silver_",
# and a catalog may report table names lower-cased or with their original
# casing, so comparing against a hard-coded lowercase literal can miss them.
prefix_lower = SILVER_PREFIX.lower()
silver_tables = [
    t for t in spark.catalog.listTables()
    if t.name.lower().startswith(prefix_lower)
]

print(f"\nSilver tables created: {len(silver_tables)}\n")

# One line per Silver table, sorted by name: row count and column count.
for table in sorted(silver_tables, key=lambda x: x.name):
    df = spark.table(table.name)
    row_count = df.count()
    col_count = len(df.columns)
    print(f"  {table.name:35s} | {row_count:>10,} rows | {col_count:>3} columns")

print(f"\nCompletion Time: {datetime.now()}")
print("\nNext Step: Run notebook 03_build_gold_star_schema.ipynb")
print("="*80)