# Summary Pipeline v4.0 - Data Validation Notebook

This notebook allows you to explore and validate the summary pipeline results.

**Tables:**
- `default.default.accounts_all` - Source data
- `default.summary` - Summary with rolling history arrays
- `default.latest_summary` - Latest state per account

In [1]:
# Initialize Spark Session
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Build (or reuse) the session with the Iceberg SQL extensions configured.
# getOrCreate() returns an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("SummaryValidation")
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .getOrCreate()
)

print("Spark session ready!")

Spark session ready!


26/01/21 13:03:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## 1. Table Overview

In [2]:
# Row counts for the source table and the two derived tables
banner = "=" * 60
print(banner)
print("TABLE OVERVIEW")
print(banner)

# The generator is consumed left to right, so the tables are counted in the listed order.
accounts_count, summary_count, latest_count = (
    spark.table(table_name).count()
    for table_name in (
        "default.default.accounts_all",
        "default.summary",
        "default.latest_summary",
    )
)

print(f"Source (accounts_all):  {accounts_count:,} rows")
print(f"Summary:                {summary_count:,} rows")
print(f"Latest Summary:         {latest_count:,} rows")

TABLE OVERVIEW
Source (accounts_all):  5,950 rows
Summary:                5,950 rows
Latest Summary:         1,000 rows


In [None]:
# Record count per reporting month in the source table
print("\nMonths in accounts_all:")
source_months_sql = """
    SELECT rpt_as_of_mo, COUNT(*) as record_count 
    FROM default.default.accounts_all 
    GROUP BY rpt_as_of_mo 
    ORDER BY rpt_as_of_mo
"""
spark.sql(source_months_sql).show()

In [None]:
# Record count per reporting month in the summary table
print("\nMonths in summary:")
summary_months_sql = """
    SELECT rpt_as_of_mo, COUNT(*) as record_count 
    FROM default.summary 
    GROUP BY rpt_as_of_mo 
    ORDER BY rpt_as_of_mo
"""
spark.sql(summary_months_sql).show()

## 2. Schema Inspection

In [None]:
# Print the summary table's column tree under a banner
print("Summary Table Schema:", "=" * 60, sep="\n")
spark.table("default.summary").printSchema()

In [None]:
# Print the latest_summary table's column tree under a banner
print("Latest Summary Schema:", "=" * 60, sep="\n")
spark.table("default.latest_summary").printSchema()

## 3. Sample Data Exploration

In [None]:
# Pick a sample account to explore.
# The account must have exactly SAMPLE_MONTHS rows in default.summary.
# ORDER BY makes the pick deterministic across runs; a bare LIMIT 1 may
# return a different account each time the notebook is executed.
SAMPLE_MONTHS = 6

sample_rows = spark.sql(f"""
    SELECT cons_acct_key
    FROM default.summary
    GROUP BY cons_acct_key
    HAVING COUNT(*) = {SAMPLE_MONTHS}
    ORDER BY cons_acct_key
    LIMIT 1
""").take(1)

# Fail with an actionable message instead of an IndexError when nothing qualifies.
if not sample_rows:
    raise ValueError(
        f"No account in default.summary has exactly {SAMPLE_MONTHS} rows; "
        "adjust SAMPLE_MONTHS or check that the pipeline has run."
    )

sample_account = sample_rows[0][0]

print(f"Sample account with {SAMPLE_MONTHS} months of data: {sample_account}")

In [None]:
# Show the raw source rows for the sample account, oldest month first
print(f"\nSource data for account {sample_account}:")
sample_source_sql = f"""
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        acct_bal_am as balance,
        days_past_due_ct_4in as dpd,
        actual_pymt_am as payment,
        asset_class_cd_4in as asset_class,
        base_ts
    FROM default.default.accounts_all
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo
"""
spark.sql(sample_source_sql).show(truncate=False)

In [None]:
# Show the summary rows for the sample account, oldest month first
print(f"\nSummary data for account {sample_account}:")
sample_summary_sql = f"""
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        balance_am,
        days_past_due,
        payment_history_grid
    FROM default.summary
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo
"""
spark.sql(sample_summary_sql).show(truncate=False)

In [None]:
# Fetch the most recent summary row for the sample account and print the
# first 12 entries of each rolling history array.
print(f"\nRolling history arrays for account {sample_account} (latest month):")
latest_history_sql = f"""
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        balance_am_history,
        days_past_due_history,
        payment_rating_cd_history
    FROM default.summary
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo DESC
    LIMIT 1
"""
df = spark.sql(latest_history_sql)

row = df.collect()[0]
print(f"Account: {row['cons_acct_key']}")
print(f"Month: {row['rpt_as_of_mo']}")

# (line prefix, array column); prefixes are padded so the values line up
history_lines = (
    ("\nBalance History (first 12): ", "balance_am_history"),
    ("DPD History (first 12):     ", "days_past_due_history"),
    ("Rating History (first 12):  ", "payment_rating_cd_history"),
)
for prefix, history_column in history_lines:
    print(f"{prefix}{row[history_column][:12]}")

## 4. Validation Checks

In [None]:
# Check 1: every (account, month) pair must appear at most once in summary
print("CHECK 1: Duplicate Check")
print("=" * 60)

duplicate_keys_sql = """
    SELECT cons_acct_key, rpt_as_of_mo, COUNT(*) as cnt
    FROM default.summary
    GROUP BY cons_acct_key, rpt_as_of_mo
    HAVING COUNT(*) > 1
"""
duplicates = spark.sql(duplicate_keys_sql)

dup_count = duplicates.count()
if dup_count != 0:
    # Report how many keys are duplicated and show up to 10 of them
    print(f"FAILED: Found {dup_count} duplicates")
    duplicates.show(10)
else:
    print("PASSED: No duplicate (account, month) combinations")

In [None]:
# Check 2: Array lengths
# Every rolling history array must hold exactly EXPECTED_HISTORY_LENGTH entries.
print("\nCHECK 2: Array Length Check")
print("=" * 60)

EXPECTED_HISTORY_LENGTH = 36
HISTORY_COLUMNS = (
    "balance_am_history",
    "days_past_due_history",
    "payment_rating_cd_history",
)

# One SELECT per history column, stitched together with UNION ALL.
# wrong_length counts everything that is NOT exactly the expected size, so a
# NULL array (for which SIZE() may return NULL rather than -1, depending on the
# session's ANSI / sizeOfNull settings) is reported as wrong instead of being
# silently skipped. COALESCE keeps the sums at 0 when the table is empty.
array_check_sql = "\n    UNION ALL\n".join(
    f"""
    SELECT
        '{history_column}' as column_name,
        COUNT(*) as total_rows,
        COALESCE(SUM(CASE WHEN SIZE({history_column}) = {EXPECTED_HISTORY_LENGTH} THEN 1 ELSE 0 END), 0) as correct_length,
        COALESCE(SUM(CASE WHEN SIZE({history_column}) = {EXPECTED_HISTORY_LENGTH} THEN 0 ELSE 1 END), 0) as wrong_length
    FROM default.summary
    """
    for history_column in HISTORY_COLUMNS
)

array_check = spark.sql(array_check_sql)

array_check.show(truncate=False)

# Total of wrong_length across the three checked columns
wrong = array_check.agg(F.sum("wrong_length")).collect()[0][0]
if wrong == 0:
    print("PASSED: All arrays have correct length (36)")
else:
    print(f"FAILED: {wrong} rows have incorrect array lengths")

In [None]:
# Check 3: every account present in summary must also appear in latest_summary
print("\nCHECK 3: Latest Summary Consistency")
print("=" * 60)

# NOTE(review): max_month is computed per account but never compared with
# anything in latest_summary; only presence of the account key is checked.
# Confirm whether the month should be verified as well.
consistency_sql = """
    WITH max_months AS (
        SELECT cons_acct_key, MAX(rpt_as_of_mo) as max_month
        FROM default.summary
        GROUP BY cons_acct_key
    )
    SELECT 
        COUNT(*) as total_accounts,
        SUM(CASE WHEN l.cons_acct_key IS NOT NULL THEN 1 ELSE 0 END) as in_latest,
        SUM(CASE WHEN l.cons_acct_key IS NULL THEN 1 ELSE 0 END) as missing_from_latest
    FROM max_months m
    LEFT JOIN default.latest_summary l ON m.cons_acct_key = l.cons_acct_key
"""
consistency = spark.sql(consistency_sql)

consistency.show()

missing = consistency.collect()[0]['missing_from_latest']
if missing != 0:
    print(f"FAILED: {missing} accounts missing from latest_summary")
else:
    print("PASSED: All accounts have entries in latest_summary")

In [None]:
# Check 4: payment_history_grid strings are expected to be 36 characters long
print("\nCHECK 4: Payment History Grid Length")
print("=" * 60)

grid_length_sql = """
    SELECT 
        LENGTH(payment_history_grid) as grid_length,
        COUNT(*) as row_count
    FROM default.summary
    GROUP BY LENGTH(payment_history_grid)
    ORDER BY grid_length
"""
grid_check = spark.sql(grid_length_sql)

grid_check.show()

# The check passes only when the length-36 group is the sole group present
groups_at_36 = grid_check.filter("grid_length = 36").count()
groups_total = grid_check.count()
all_36 = groups_at_36 == groups_total
if not all_36:
    print("WARNING: Some grids have unexpected lengths")
else:
    print("PASSED: All payment_history_grid values have length 36")

## 5. History Array Deep Dive

In [None]:
# Show the first six balance-history slots per month for the sample account,
# so the month-over-month shift of the array can be inspected by eye.
print(
    "History Array Shift Verification",
    "=" * 60,
    "For each month, index 0 should be current value, index 1 should be previous month, etc.",
    "",
    sep="\n",
)

# Uses the sample account chosen earlier in the notebook
history_shift_sql = f"""
    SELECT 
        rpt_as_of_mo,
        balance_am_history[0] as current_balance,
        balance_am_history[1] as prev_1_month,
        balance_am_history[2] as prev_2_month,
        balance_am_history[3] as prev_3_month,
        balance_am_history[4] as prev_4_month,
        balance_am_history[5] as prev_5_month
    FROM default.summary
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo
"""
history_df = spark.sql(history_shift_sql)

print(f"Balance history for account {sample_account}:")
history_df.show(truncate=False)

In [None]:
# Compare with source values
# For the sample account, each month's source balance should equal element 0
# of that month's balance history array in the summary table.
print("\nComparison with source values:")
print("=" * 60)

comparison = spark.sql(f"""
    SELECT
        a.rpt_as_of_mo,
        a.acct_bal_am as source_balance,
        s.balance_am_history[0] as history_current
    FROM default.default.accounts_all a
    JOIN default.summary s
        ON a.cons_acct_key = s.cons_acct_key
        AND a.rpt_as_of_mo = s.rpt_as_of_mo
    WHERE a.cons_acct_key = {sample_account}
    ORDER BY a.rpt_as_of_mo
""")

comparison.show(truncate=False)

# Verify match.
# Use the null-safe equality operator: with a plain `!=`, a row where exactly
# one side is NULL evaluates to NULL, is dropped by the filter, and would be
# reported as a match. `<=>` treats NULL vs value as different and
# NULL vs NULL as equal.
mismatches = comparison.filter("NOT (source_balance <=> history_current)").count()
if mismatches == 0:
    print("PASSED: Source values match history[0] values")
else:
    print(f"FAILED: {mismatches} mismatches found")

## 6. DPD and Payment Rating Analysis

In [None]:
# DPD distribution
# Buckets the latest month's days_past_due values and shows the count and
# percentage share of each bucket, ordered from least to most delinquent.
#
# The bucket label and its sort key are both derived in the first CTE and then
# grouped together. Ordering by a CASE over the raw days_past_due column after
# GROUP BY is invalid, because that column is neither grouped nor aggregated.
# 'Unknown' (NULL or negative values) gets its own sort key so it always sorts
# after '180+ DPD'.
print("DPD Distribution (current month values):")
spark.sql("""
    WITH bucketed AS (
        SELECT
            CASE
                WHEN days_past_due = 0 THEN '0 (Current)'
                WHEN days_past_due BETWEEN 1 AND 29 THEN '1-29 DPD'
                WHEN days_past_due BETWEEN 30 AND 59 THEN '30-59 DPD'
                WHEN days_past_due BETWEEN 60 AND 89 THEN '60-89 DPD'
                WHEN days_past_due BETWEEN 90 AND 119 THEN '90-119 DPD'
                WHEN days_past_due BETWEEN 120 AND 179 THEN '120-179 DPD'
                WHEN days_past_due >= 180 THEN '180+ DPD'
                ELSE 'Unknown'
            END as dpd_bucket,
            CASE
                WHEN days_past_due = 0 THEN 0
                WHEN days_past_due BETWEEN 1 AND 29 THEN 1
                WHEN days_past_due BETWEEN 30 AND 59 THEN 2
                WHEN days_past_due BETWEEN 60 AND 89 THEN 3
                WHEN days_past_due BETWEEN 90 AND 119 THEN 4
                WHEN days_past_due BETWEEN 120 AND 179 THEN 5
                WHEN days_past_due >= 180 THEN 6
                ELSE 7
            END as bucket_order
        FROM default.summary
        WHERE rpt_as_of_mo = (SELECT MAX(rpt_as_of_mo) FROM default.summary)
    ),
    bucket_counts AS (
        SELECT
            dpd_bucket,
            bucket_order,
            COUNT(*) as record_count,
            ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as pct
        FROM bucketed
        GROUP BY dpd_bucket, bucket_order
    )
    SELECT
        dpd_bucket,
        record_count as count,
        pct
    FROM bucket_counts
    ORDER BY bucket_order
""").show(truncate=False)

In [None]:
# Distribution of the first payment_history_grid character for the latest
# month, with a readable description per code, most frequent first.
print("\nPayment Rating Distribution (from grid, first character):")
rating_distribution_sql = """
    SELECT 
        SUBSTRING(payment_history_grid, 1, 1) as rating,
        CASE SUBSTRING(payment_history_grid, 1, 1)
            WHEN '0' THEN 'Current (0-29 DPD)'
            WHEN '1' THEN '30-59 DPD'
            WHEN '2' THEN '60-89 DPD'
            WHEN '3' THEN '90-119 DPD'
            WHEN '4' THEN '120-149 DPD'
            WHEN '5' THEN '150-179 DPD'
            WHEN '6' THEN '180+ DPD'
            WHEN 'S' THEN 'Standard'
            WHEN 'B' THEN 'Sub-Standard'
            WHEN 'D' THEN 'Doubtful'
            WHEN 'L' THEN 'Loss'
            WHEN '?' THEN 'Unknown'
            ELSE 'Other'
        END as description,
        COUNT(*) as count
    FROM default.summary
    WHERE rpt_as_of_mo = (SELECT MAX(rpt_as_of_mo) FROM default.summary)
    GROUP BY 1, 2
    ORDER BY count DESC
"""
spark.sql(rating_distribution_sql).show(truncate=False)

## 7. Interactive Query

In [None]:
# Account lookup: edit ACCOUNT_TO_EXPLORE to inspect a different account.
# The value is interpolated into the queries of this cell and the next two.
ACCOUNT_TO_EXPLORE = 100

section_rule = "=" * 60
print("\n" + section_rule)
print(f"Account {ACCOUNT_TO_EXPLORE} Details")
print(section_rule)

# Raw source rows for the chosen account, oldest month first
print("\nSource Data:")
account_source_sql = f"""
    SELECT 
        rpt_as_of_mo, 
        acct_bal_am as balance,
        days_past_due_ct_4in as dpd,
        actual_pymt_am as payment,
        acct_stat_cd as status
    FROM default.default.accounts_all 
    WHERE cons_acct_key = {ACCOUNT_TO_EXPLORE}
    ORDER BY rpt_as_of_mo
"""
spark.sql(account_source_sql).show()

In [None]:
# Summary rows for the chosen account, with the first six entries of the
# balance and DPD history arrays
print(f"\nSummary Data for Account {ACCOUNT_TO_EXPLORE}:")
account_summary_sql = f"""
    SELECT 
        rpt_as_of_mo,
        balance_am,
        days_past_due,
        payment_history_grid,
        SLICE(balance_am_history, 1, 6) as balance_hist_first6,
        SLICE(days_past_due_history, 1, 6) as dpd_hist_first6
    FROM default.summary
    WHERE cons_acct_key = {ACCOUNT_TO_EXPLORE}
    ORDER BY rpt_as_of_mo
"""
spark.sql(account_summary_sql).show(truncate=False)

In [None]:
# All latest_summary columns for the chosen account, one field per line
print(f"\nLatest Summary for Account {ACCOUNT_TO_EXPLORE}:")
account_latest_sql = f"""
    SELECT *
    FROM default.latest_summary
    WHERE cons_acct_key = {ACCOUNT_TO_EXPLORE}
"""
spark.sql(account_latest_sql).show(vertical=True, truncate=False)

## 8. Summary Statistics

In [None]:
# Headline aggregates over the whole summary table: row, account and month
# counts, the month range, and average balance and DPD
print("=" * 60, "FINAL SUMMARY STATISTICS", "=" * 60, sep="\n")

summary_stats_sql = """
    SELECT
        COUNT(*) as total_rows,
        COUNT(DISTINCT cons_acct_key) as unique_accounts,
        COUNT(DISTINCT rpt_as_of_mo) as months,
        MIN(rpt_as_of_mo) as first_month,
        MAX(rpt_as_of_mo) as last_month,
        AVG(balance_am) as avg_balance,
        AVG(days_past_due) as avg_dpd
    FROM default.summary
"""
stats = spark.sql(summary_stats_sql)

stats.show(vertical=True, truncate=False)

In [None]:
# How many accounts have each distinct-month count, highest count first.
# NOTE(review): first_month / last_month are computed but unused, and the query
# does not compare month_count with the span between them. It shows the
# distribution of month counts rather than locating actual gaps; confirm intent.
print("\nAccounts with Missing Months (gaps):")
month_count_distribution_sql = """
    WITH account_months AS (
        SELECT 
            cons_acct_key,
            COUNT(DISTINCT rpt_as_of_mo) as month_count,
            MIN(rpt_as_of_mo) as first_month,
            MAX(rpt_as_of_mo) as last_month
        FROM default.summary
        GROUP BY cons_acct_key
    )
    SELECT 
        month_count,
        COUNT(*) as account_count
    FROM account_months
    GROUP BY month_count
    ORDER BY month_count DESC
"""
spark.sql(month_count_distribution_sql).show()

## 9. Custom Query Cell

Use this cell to run your own queries:

In [None]:
# Scratch cell: replace the SQL below with any query of your own.
# The default lists the 10 summary rows with the highest DPD above 90.

your_query = """
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        days_past_due,
        balance_am,
        payment_history_grid
    FROM default.summary
    WHERE days_past_due > 90
    ORDER BY days_past_due DESC
    LIMIT 10
"""

custom_result = spark.sql(your_query)
custom_result.show(truncate=False)

In [None]:
# The session is left running on purpose; uncomment the next line to stop it.
# spark.stop()
completion_message = "Notebook complete! Uncomment spark.stop() to close the session."
print(completion_message)