# Data Quality Checks - Enterprise Data Platform

## Overview
This notebook performs comprehensive data quality checks on Gold layer tables.

**Checks Performed:**
- Referential integrity (FK validation)
- Null value analysis
- Business rule compliance
- Data distribution analysis
- Anomaly detection

**Prerequisites:**
- Gold star schema created (run 03_build_gold_star_schema.ipynb first)

In [None]:
# Import required libraries
#
# NOTE: the star import from pyspark.sql.functions rebinds names that are also
# Python builtins (e.g. sum, round) to Spark column functions for the rest of
# the notebook. Later cells rely on that (sum("net_amount"), round(col(...), 2)),
# so use builtins.sum / builtins.round wherever plain-Python behaviour is needed.
#
# NOTE(review): SparkSession is imported but no session is built in this
# notebook; later cells use a global `spark`, presumably injected by the
# notebook runtime -- confirm for the target environment.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Imported after the star imports, so these bindings take precedence over any
# same-named symbol the star imports might have brought in.
from datetime import datetime
import json

# Start banner; the summary cell prints the matching completion time.
print(f"Data Quality Checks Started: {datetime.now()}")

## Check 1: Referential Integrity

In [None]:
# `from pyspark.sql.functions import *` rebinds sum() to the Spark aggregate,
# so the plain-Python sum needed for the summary is taken from builtins.
import builtins

print("\n" + "="*80)
print("CHECK 1: Referential Integrity Validation")
print("="*80)

# Foreign-key relationships to validate, as
# (fact table, FK column, dimension table, PK column).
relationships = [
    ("FactSales", "customer_id", "DimCustomer", "customer_id"),
    ("FactSales", "product_id", "DimProduct", "product_id"),
    ("FactSales", "employee_id", "DimEmployee", "employee_id"),
    ("FactSales", "order_date_id", "DimDate", "date_id"),
]

# One dict per relationship that could be evaluated. Relationships that raise
# (e.g. missing table or column) are skipped and therefore not recorded here.
integrity_results = []

for fact_table, fk_col, dim_table, pk_col in relationships:
    try:
        fact = spark.table(fact_table)
        dim = spark.table(dim_table)

        # Orphans are distinct FK values with no matching PK: "left_anti" keeps
        # only the unmatched rows of the left side. A NULL FK never matches, so
        # it is filtered out explicitly instead of being reported as an orphan.
        orphaned = fact.select(fk_col).distinct() \
            .join(dim.select(pk_col), fact[fk_col] == dim[pk_col], "left_anti") \
            .filter(col(fk_col).isNotNull())

        orphan_count = orphaned.count()
        total_distinct = fact.select(fk_col).filter(col(fk_col).isNotNull()).distinct().count()

        relationship = f"{fact_table}.{fk_col} → {dim_table}.{pk_col}"
        check_passed = orphan_count == 0
        status = "✅ PASS" if check_passed else "❌ FAIL"

        print(f"\n{status} {relationship}")
        print(f"   Orphaned: {orphan_count:,} / {total_distinct:,} distinct values")

        integrity_results.append({
            "relationship": relationship,
            "orphan_count": orphan_count,
            "passed": check_passed
        })

        if not check_passed:
            print("   Sample orphaned values:")
            orphaned.show(5, truncate=False)

    except Exception as e:
        # Best effort: a relationship that cannot be evaluated is reported and
        # skipped so the remaining relationships are still checked.
        print(f"⏭️  Skipping {fact_table} → {dim_table}: {str(e)}")

# Summary
passed = builtins.sum(1 for r in integrity_results if r["passed"])
total = len(integrity_results)
print(f"\n{'='*80}")
print(f"Referential Integrity: {passed}/{total} checks passed")

## Check 2: Null Value Analysis

In [None]:
print("\n" + "="*80)
print("CHECK 2: Null Value Analysis")
print("="*80)

# Tables to check
tables_to_check = ["DimCustomer", "DimProduct", "FactSales"]

for table_name in tables_to_check:
    try:
        print(f"\n{table_name}:")
        print("-" * 40)

        df = spark.table(table_name)

        # Single pass over the table: the total row count followed by one null
        # count per column. count() ignores NULLs and when() without otherwise()
        # yields NULL for non-matching rows, so count(when(is_null, 1)) counts
        # exactly the rows where the column is NULL. Aliases are positional so
        # unusual column names cannot clash with them.
        null_counters = [
            count(when(col(col_name).isNull(), lit(1))).alias(f"nulls_{position}")
            for position, col_name in enumerate(df.columns)
        ]
        counts = df.agg(count(lit(1)).alias("total_rows"), *null_counters).collect()[0]
        total_rows = counts[0]

        # Keep only columns that actually contain nulls. A non-zero null count
        # implies total_rows > 0, so the percentage division is always safe.
        null_stats = []
        for position, col_name in enumerate(df.columns, start=1):
            null_count = counts[position]
            if null_count > 0:
                null_stats.append({
                    "column": col_name,
                    "null_count": null_count,
                    "null_percentage": null_count / total_rows * 100
                })

        if null_stats:
            # Worst columns first; more than 10% nulls is flagged as a warning.
            for stat in sorted(null_stats, key=lambda x: x["null_percentage"], reverse=True):
                status = "⚠️" if stat["null_percentage"] > 10 else "ℹ️"
                print(f"  {status} {stat['column']:30s} | {stat['null_count']:>8,} nulls ({stat['null_percentage']:>5.2f}%)")
        else:
            print("  ✅ No null values found")

    except Exception as e:
        # Best effort: a table that cannot be read is reported and skipped.
        print(f"  ⏭️  Skipping: {str(e)}")

## Check 3: Business Rule Validation

In [None]:
print("\n" + "="*80)
print("CHECK 3: Business Rule Validation")
print("="*80)

# Each rule: (table, rule label, label used in the skip message, violation
# predicate). The predicate is wrapped in a lambda so the Column expression is
# built inside the try block below, like every other Spark call for the rule.
business_rules = [
    # Rule 1: FactSales - amounts should be non-negative (and quantity positive)
    (
        "FactSales",
        "Non-negative amounts rule",
        "FactSales amount check",
        lambda: (col("net_amount") < 0) | (col("total_amount") < 0) | (col("quantity") <= 0),
    ),
    # Rule 2: FactSales - ship_date >= order_date
    (
        "FactSales",
        "Ship date >= Order date rule",
        "FactSales date check",
        lambda: col("ship_date_id") < col("order_date_id"),
    ),
    # Rule 3: DimCustomer - credit_limit should be positive
    (
        "DimCustomer",
        "Positive credit limit rule",
        "DimCustomer credit check",
        lambda: col("credit_limit") <= 0,
    ),
]

for table_name, rule_name, skip_label, violation_condition in business_rules:
    try:
        table_df = spark.table(table_name)

        # One pass per rule: total rows plus the rows where the predicate is
        # true. As with DataFrame.filter, rows where the predicate evaluates to
        # NULL (e.g. a NULL amount) are not counted as violations.
        rule_counts = table_df.agg(
            count(lit(1)).alias("total_rows"),
            count(when(violation_condition(), lit(1))).alias("violations"),
        ).collect()[0]

        violations = rule_counts["violations"]
        total_rows = rule_counts["total_rows"]
        status = "✅ PASS" if violations == 0 else "❌ FAIL"

        print(f"\n{status} {table_name}: {rule_name}")
        print(f"   Violations: {violations:,} / {total_rows:,} rows")

    except Exception as e:
        # Best effort: a rule that cannot be evaluated (missing table/column)
        # is reported and skipped so the remaining rules still run.
        print(f"⏭️  Skipping {skip_label}: {str(e)}")

## Check 4: Data Distribution Analysis

In [None]:
print("\n" + "="*80)
print("CHECK 4: Data Distribution Analysis")
print("="*80)

# NOTE: count, sum, avg, round, desc and substring below are the Spark column
# functions brought in by the notebook's star import, not Python builtins.

# FactSales - Revenue by status
try:
    print("\nFactSales - Revenue Distribution by Status:")
    print("-" * 40)

    fact_sales = spark.table("FactSales")

    # Order count, total and average net_amount per status, highest revenue first.
    status_dist = (
        fact_sales.groupBy("status")
        .agg(
            count("*").alias("order_count"),
            sum("net_amount").alias("total_revenue"),
            avg("net_amount").alias("avg_order_value"),
        )
        .orderBy(desc("total_revenue"))
    )

    status_dist.show(10, truncate=False)

except Exception as e:
    print(f"⏭️  Skipping: {str(e)}")

# DimCustomer - Distribution by segment
try:
    print("\nDimCustomer - Distribution by Segment:")
    print("-" * 40)

    dim_customer = spark.table("DimCustomer")

    customers_per_segment = dim_customer.groupBy("segment") \
        .agg(count("*").alias("customer_count"))

    # The table-wide row count is computed eagerly on the driver and embedded
    # as a constant in the percentage expression.
    customer_total = dim_customer.count()

    segment_dist = (
        customers_per_segment
        .withColumn("percentage", round(col("customer_count") / customer_total * 100, 2))
        .orderBy(desc("customer_count"))
    )

    segment_dist.show(truncate=False)

except Exception as e:
    print(f"⏭️  Skipping: {str(e)}")

# FactSales - Monthly revenue trend
try:
    print("\nFactSales - Monthly Revenue Trend:")
    print("-" * 40)

    fact_sales = spark.table("FactSales")

    # The month key is the first six characters of order_date_id rendered as a
    # string. NOTE(review): this assumes order_date_id is a yyyyMMdd-style key
    # -- confirm against the DimDate definition.
    year_month = substring(col("order_date_id").cast("string"), 1, 6)

    # Only orders with status Delivered or Shipped contribute to the trend.
    monthly_revenue = (
        fact_sales
        .withColumn("year_month", year_month)
        .filter(col("status").isin(["Delivered", "Shipped"]))
        .groupBy("year_month")
        .agg(
            sum("net_amount").alias("revenue"),
            count("*").alias("order_count"),
        )
        .orderBy("year_month")
    )

    # Ascending order: this prints the first 12 months present in the data.
    monthly_revenue.show(12, truncate=False)

except Exception as e:
    print(f"⏭️  Skipping: {str(e)}")

## Check 5: Anomaly Detection

In [None]:
import math

print("\n" + "="*80)
print("CHECK 5: Anomaly Detection")
print("="*80)

# Detect unusually high order amounts (>3 std dev from mean)
try:
    print("\nFactSales - Outlier Detection (Unusually High Order Amounts):")
    print("-" * 40)

    fact_sales = spark.table("FactSales")

    # Mean, standard deviation and total row count in a single Spark job.
    stats = fact_sales.select(
        mean("net_amount").alias("mean_amount"),
        stddev("net_amount").alias("stddev_amount"),
        count(lit(1)).alias("total_count")
    ).collect()[0]

    raw_mean = stats["mean_amount"]
    raw_stddev = stats["stddev_amount"]
    total_count = stats["total_count"]

    # The aggregates are NULL (or NaN for the standard deviation, depending on
    # Spark settings) when there are too few non-null values. Report that
    # explicitly instead of failing on None arithmetic below.
    if raw_mean is None or raw_stddev is None or math.isnan(float(raw_stddev)):
        print("   Not enough non-null net_amount values to compute outlier statistics")
    else:
        # float() guards against a Decimal mean (returned when net_amount is a
        # DecimalType column), which cannot be added to the float stddev.
        mean_val = float(raw_mean)
        stddev_val = float(raw_stddev)
        threshold = mean_val + (3 * stddev_val)

        outliers = fact_sales.filter(col("net_amount") > threshold) \
            .select("order_id", "customer_id", "net_amount", "quantity", "status") \
            .orderBy(desc("net_amount"))

        outlier_count = outliers.count()

        print(f"   Mean order amount: ${mean_val:,.2f}")
        print(f"   Std dev: ${stddev_val:,.2f}")
        print(f"   Outlier threshold (>3σ): ${threshold:,.2f}")
        # total_count > 0 here: a non-NULL mean implies at least one row.
        print(f"   Outliers found: {outlier_count:,} / {total_count:,} orders ({outlier_count/total_count*100:.2f}%)")

        if outlier_count > 0:
            print(f"\n   Top 10 outliers:")
            outliers.show(10, truncate=False)

except Exception as e:
    # Best effort: a missing table or column is reported and the check skipped.
    print(f"⏭️  Skipping: {str(e)}")

## Quality Report Summary

In [None]:
# `from pyspark.sql.functions import *` rebinds sum() to the Spark aggregate,
# so the plain-Python sum used for counting is taken from builtins.
import builtins

print("\n" + "="*80)
print("DATA QUALITY REPORT - SUMMARY")
print("="*80)

# Compile overall score. Only the referential-integrity results are recorded
# in a machine-readable form (integrity_results), so the score is based on
# those; relationships that were skipped are not part of the list.
checks_passed = 0
total_checks = 0

# Count integrity checks
if integrity_results:
    total_checks += len(integrity_results)
    checks_passed += builtins.sum(1 for r in integrity_results if r["passed"])

print(f"\n✅ Checks Passed: {checks_passed}")
print(f"⚠️  Checks Failed: {total_checks - checks_passed}")
print(f"📊 Total Checks: {total_checks}")

if total_checks > 0:
    quality_score = (checks_passed / total_checks) * 100
    print(f"\n🎯 Data Quality Score: {quality_score:.1f}%")

    # Score bands: >= 90 excellent, >= 75 good, otherwise needs improvement.
    if quality_score >= 90:
        print("   ✅ EXCELLENT - Data is production-ready")
    elif quality_score >= 75:
        print("   ⚠️  GOOD - Minor issues to address")
    else:
        print("   ❌ NEEDS IMPROVEMENT - Review failed checks")
else:
    # Zero recorded checks means every relationship was skipped; say so rather
    # than letting "0 failed" read as a clean result.
    print("\n⚠️  No checks were executed - quality score unavailable")

print(f"\nCompletion Time: {datetime.now()}")
print("="*80)