# Summary Pipeline v4.0 - Data Validation Notebook

This notebook allows you to explore and validate the summary pipeline results.

**Tables:**
- `default.default.accounts_all` - Source data
- `default.summary` - Summary with rolling history arrays
- `default.latest_summary` - Latest state per account

In [30]:
# Initialize Spark Session
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder \
    .appName("SummaryValidation") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .getOrCreate()

print("Spark session ready!")

Spark session ready!


26/01/21 16:35:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## 1. Table Overview

In [2]:
# Check table counts
print("=" * 60)
print("TABLE OVERVIEW")
print("=" * 60)

accounts_count = spark.table("default.default.accounts_all").count()
summary_count = spark.table("default.summary").count()
latest_count = spark.table("default.latest_summary").count()

print(f"Source (accounts_all):  {accounts_count:,} rows")
print(f"Summary:                {summary_count:,} rows")
print(f"Latest Summary:         {latest_count:,} rows")

TABLE OVERVIEW
Source (accounts_all):  5,950 rows
Summary:                5,950 rows
Latest Summary:         1,000 rows


In [5]:
%%sql
select cons_acct_key,rpt_as_of_mo from default.default.accounts_all order by cons_acct_key, rpt_as_of_mo;

cons_acct_key,rpt_as_of_mo
1,2024-01
1,2024-02
1,2024-03
1,2024-04
1,2024-05
1,2024-06
2,2024-01
2,2024-02
2,2024-03
2,2024-04


In [7]:
%%sql
select count(distinct rpt_as_of_mo) from default.default.accounts_all 

count(DISTINCT rpt_as_of_mo)
6


In [8]:
%%sql
with data as ( select cons_acct_key, count(1) as count from default.default.accounts_all group by cons_acct_key) select * from data where count<6

cons_acct_key,count
299,5
736,5
198,5
50,5
629,5
766,5
686,5
443,5
193,5
72,5


In [10]:
%%sql
select * from default.default.accounts_all where cons_acct_key = 387	order by cons_acct_key, rpt_as_of_mo;

cons_acct_key,bureau_mbr_id,port_type_cd,acct_type_dtl_cd,pymt_terms_cd,pymt_terms_dtl_cd,acct_open_dt,acct_closed_dt,acct_dt,last_pymt_dt,schd_pymt_dt,orig_pymt_due_dt,write_off_dt,acct_stat_cd,acct_pymt_stat_cd,acct_pymt_stat_dtl_cd,acct_credit_ext_am,acct_bal_am,past_due_am,actual_pymt_am,next_schd_pymt_am,write_off_am,asset_class_cd_4in,days_past_due_ct_4in,high_credit_am_4in,cash_limit_am_4in,collateral_am_4in,total_write_off_am_4in,principal_write_off_am_4in,settled_am_4in,interest_rate_4in,suit_filed_wilful_def_stat_cd_4in,wo_settled_stat_cd_4in,collateral_cd,rpt_as_of_mo,base_ts
387,BMB00387,R,220,M,2,2023-07-21,,2024-01-01,2023-12-15,2024-01-16,2023-08-20,,A,0,0,48232,28873,0,2513,19770,0,1,0,48232,14469,0,0,0,0,8.7814,,,,2024-01,2025-07-25 12:41:34.009371
387,BMB00387,R,220,M,3,2023-07-21,,2024-02-01,2024-01-23,2024-02-16,2023-08-20,,A,0,0,48232,34159,2391,1167,19770,0,1,7,48232,14469,0,0,0,0,15.7872,,,,2024-02,2025-08-24 12:41:34.009371
387,BMB00387,R,220,Q,2,2023-07-21,,2024-04-01,2024-03-28,2024-04-16,2023-08-20,,A,0,0,48232,18848,4523,594,19770,0,2,24,48232,14469,0,0,0,0,19.8012,,,,2024-04,2025-10-23 12:41:34.009371
387,BMB00387,R,220,W,3,2023-07-21,,2024-05-01,2024-04-16,2024-05-16,2023-08-20,,A,1,0,48232,36735,16530,1637,19770,0,2,45,48232,14469,0,0,0,0,11.963,,,,2024-05,2025-11-22 12:41:34.009371
387,BMB00387,R,220,W,3,2023-07-21,,2024-06-01,,2024-06-16,2023-08-20,,A,2,0,48232,35864,17932,0,19770,0,3,66,48232,14469,0,0,0,0,21.3902,,,,2024-06,2025-12-22 12:41:34.009371


In [11]:
# Check months in data
print("\nMonths in accounts_all:")
spark.sql("""
    SELECT rpt_as_of_mo, COUNT(*) as record_count 
    FROM default.default.accounts_all 
    GROUP BY rpt_as_of_mo 
    ORDER BY rpt_as_of_mo
""").show()


Months in accounts_all:
+------------+------------+
|rpt_as_of_mo|record_count|
+------------+------------+
|     2024-01|        1000|
|     2024-02|        1000|
|     2024-03|         990|
|     2024-04|         981|
|     2024-05|         979|
|     2024-06|        1000|
+------------+------------+



In [12]:
# Check months in summary
print("\nMonths in summary:")
spark.sql("""
    SELECT rpt_as_of_mo, COUNT(*) as record_count 
    FROM default.summary 
    GROUP BY rpt_as_of_mo 
    ORDER BY rpt_as_of_mo
""").show()


Months in summary:
+------------+------------+
|rpt_as_of_mo|record_count|
+------------+------------+
|     2024-01|        1000|
|     2024-02|        1000|
|     2024-03|         990|
|     2024-04|         981|
|     2024-05|         979|
|     2024-06|        1000|
+------------+------------+



## 2. Schema Inspection

In [13]:
# Summary table schema
print("Summary Table Schema:")
print("=" * 60)
spark.table("default.summary").printSchema()

Summary Table Schema:
root
 |-- cons_acct_key: long (nullable = true)
 |-- bureau_member_id: string (nullable = true)
 |-- portfolio_rating_type_cd: string (nullable = true)
 |-- acct_type_dtl_cd: string (nullable = true)
 |-- pymt_terms_cd: string (nullable = true)
 |-- pymt_terms_dtl_cd: string (nullable = true)
 |-- acct_stat_cd: string (nullable = true)
 |-- acct_pymt_stat_cd: string (nullable = true)
 |-- acct_pymt_stat_dtl_cd: string (nullable = true)
 |-- collateral_cd: string (nullable = true)
 |-- asset_class_cd: string (nullable = true)
 |-- suit_filed_willful_dflt: string (nullable = true)
 |-- written_off_and_settled_status: string (nullable = true)
 |-- open_dt: date (nullable = true)
 |-- closed_dt: date (nullable = true)
 |-- acct_dt: date (nullable = true)
 |-- last_payment_dt: date (nullable = true)
 |-- schd_pymt_dt: date (nullable = true)
 |-- orig_pymt_due_dt: date (nullable = true)
 |-- dflt_status_dt: date (nullable = true)
 |-- credit_limit_am: integer (nullable 

In [14]:
# Latest summary schema
print("Latest Summary Schema:")
print("=" * 60)
spark.table("default.latest_summary").printSchema()

Latest Summary Schema:
root
 |-- cons_acct_key: long (nullable = true)
 |-- acct_dt: date (nullable = true)
 |-- rpt_as_of_mo: string (nullable = true)
 |-- base_ts: timestamp (nullable = true)
 |-- bureau_member_id: string (nullable = true)
 |-- acct_pymt_stat_cd: string (nullable = true)
 |-- acct_type_dtl_cd: string (nullable = true)
 |-- acct_stat_cd: string (nullable = true)
 |-- pymt_terms_cd: string (nullable = true)
 |-- asset_class_cd: string (nullable = true)
 |-- suit_filed_willful_dflt: string (nullable = true)
 |-- written_off_and_settled_status: string (nullable = true)
 |-- portfolio_rating_type_cd: string (nullable = true)
 |-- open_dt: date (nullable = true)
 |-- closed_dt: date (nullable = true)
 |-- dflt_status_dt: date (nullable = true)
 |-- last_payment_dt: date (nullable = true)
 |-- emi_amt: integer (nullable = true)
 |-- balance_am: integer (nullable = true)
 |-- days_past_due: integer (nullable = true)
 |-- past_due_am: integer (nullable = true)
 |-- charge_off

## 3. Sample Data Exploration

In [15]:
# Pick a sample account to explore
sample_account = spark.sql("""
    SELECT cons_acct_key 
    FROM default.summary 
    GROUP BY cons_acct_key 
    HAVING COUNT(*) = 6 
    LIMIT 1
""").collect()[0][0]

print(f"Sample account with 6 months of data: {sample_account}")

Sample account with 6 months of data: 29


In [16]:
# View source data for this account
print(f"\nSource data for account {sample_account}:")
spark.sql(f"""
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        acct_bal_am as balance,
        days_past_due_ct_4in as dpd,
        actual_pymt_am as payment,
        asset_class_cd_4in as asset_class,
        base_ts
    FROM default.default.accounts_all
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo
""").show(truncate=False)


Source data for account 29:
+-------------+------------+-------+---+-------+-----------+--------------------------+
|cons_acct_key|rpt_as_of_mo|balance|dpd|payment|asset_class|base_ts                   |
+-------------+------------+-------+---+-------+-----------+--------------------------+
|29           |2024-01     |225492 |50 |9047   |B          |2025-07-25 12:41:34.009371|
|29           |2024-02     |277158 |69 |0      |B          |2025-08-24 12:41:34.009371|
|29           |2024-03     |353340 |98 |0      |4          |2025-09-23 12:41:34.009371|
|29           |2024-04     |306299 |93 |0      |L          |2025-10-23 12:41:34.009371|
|29           |2024-05     |202747 |83 |0      |03         |2025-11-22 12:41:34.009371|
|29           |2024-06     |223818 |78 |0      |03         |2025-12-22 12:41:34.009371|
+-------------+------------+-------+---+-------+-----------+--------------------------+



In [17]:
# View summary data for this account
print(f"\nSummary data for account {sample_account}:")
spark.sql(f"""
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        balance_am,
        days_past_due,
        payment_history_grid
    FROM default.summary
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo
""").show(truncate=False)


Summary data for account 29:
+-------------+------------+----------+-------------+------------------------------------+
|cons_acct_key|rpt_as_of_mo|balance_am|days_past_due|payment_history_grid                |
+-------------+------------+----------+-------------+------------------------------------+
|29           |2024-01     |225492    |50           |1???????????????????????????????????|
|29           |2024-02     |277158    |69           |21??????????????????????????????????|
|29           |2024-03     |353340    |98           |321?????????????????????????????????|
|29           |2024-04     |306299    |93           |3321????????????????????????????????|
|29           |2024-05     |202747    |83           |23321???????????????????????????????|
|29           |2024-06     |223818    |78           |223321??????????????????????????????|
+-------------+------------+----------+-------------+------------------------------------+



In [18]:
# View rolling history arrays for this account
print(f"\nRolling history arrays for account {sample_account} (latest month):")
df = spark.sql(f"""
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        balance_am_history,
        days_past_due_history,
        payment_rating_cd_history
    FROM default.summary
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo DESC
    LIMIT 1
""")

row = df.collect()[0]
print(f"Account: {row['cons_acct_key']}")
print(f"Month: {row['rpt_as_of_mo']}")
print(f"\nBalance History (first 12): {row['balance_am_history'][:12]}")
print(f"DPD History (first 12):     {row['days_past_due_history'][:12]}")
print(f"Rating History (first 12):  {row['payment_rating_cd_history'][:12]}")


Rolling history arrays for account 29 (latest month):
Account: 29
Month: 2024-06

Balance History (first 12): [223818, 202747, 306299, 353340, 277158, 225492, None, None, None, None, None, None]
DPD History (first 12):     [78, 83, 93, 98, 69, 50, None, None, None, None, None, None]
Rating History (first 12):  ['2', '2', '3', '3', '2', '1', None, None, None, None, None, None]


## 4. Validation Checks

In [19]:
# Check 1: No duplicates
print("CHECK 1: Duplicate Check")
print("=" * 60)

duplicates = spark.sql("""
    SELECT cons_acct_key, rpt_as_of_mo, COUNT(*) as cnt
    FROM default.summary
    GROUP BY cons_acct_key, rpt_as_of_mo
    HAVING COUNT(*) > 1
""")

dup_count = duplicates.count()
if dup_count == 0:
    print("PASSED: No duplicate (account, month) combinations")
else:
    print(f"FAILED: Found {dup_count} duplicates")
    duplicates.show(10)

CHECK 1: Duplicate Check
PASSED: No duplicate (account, month) combinations


In [20]:
# Check 2: Array lengths
print("\nCHECK 2: Array Length Check")
print("=" * 60)

array_check = spark.sql("""
    SELECT 
        'balance_am_history' as column_name,
        COUNT(*) as total_rows,
        SUM(CASE WHEN SIZE(balance_am_history) = 36 THEN 1 ELSE 0 END) as correct_length,
        SUM(CASE WHEN SIZE(balance_am_history) != 36 THEN 1 ELSE 0 END) as wrong_length
    FROM default.summary
    
    UNION ALL
    
    SELECT 
        'days_past_due_history',
        COUNT(*),
        SUM(CASE WHEN SIZE(days_past_due_history) = 36 THEN 1 ELSE 0 END),
        SUM(CASE WHEN SIZE(days_past_due_history) != 36 THEN 1 ELSE 0 END)
    FROM default.summary
    
    UNION ALL
    
    SELECT 
        'payment_rating_cd_history',
        COUNT(*),
        SUM(CASE WHEN SIZE(payment_rating_cd_history) = 36 THEN 1 ELSE 0 END),
        SUM(CASE WHEN SIZE(payment_rating_cd_history) != 36 THEN 1 ELSE 0 END)
    FROM default.summary
""")

array_check.show(truncate=False)

wrong = array_check.agg(F.sum("wrong_length")).collect()[0][0]
if wrong == 0:
    print("PASSED: All arrays have correct length (36)")
else:
    print(f"FAILED: {wrong} rows have incorrect array lengths")


CHECK 2: Array Length Check
+-------------------------+----------+--------------+------------+
|column_name              |total_rows|correct_length|wrong_length|
+-------------------------+----------+--------------+------------+
|balance_am_history       |5950      |5950          |0           |
|days_past_due_history    |5950      |5950          |0           |
|payment_rating_cd_history|5950      |5950          |0           |
+-------------------------+----------+--------------+------------+

PASSED: All arrays have correct length (36)


In [21]:
# Check 3: Latest summary consistency
print("\nCHECK 3: Latest Summary Consistency")
print("=" * 60)

consistency = spark.sql("""
    WITH max_months AS (
        SELECT cons_acct_key, MAX(rpt_as_of_mo) as max_month
        FROM default.summary
        GROUP BY cons_acct_key
    )
    SELECT 
        COUNT(*) as total_accounts,
        SUM(CASE WHEN l.cons_acct_key IS NOT NULL THEN 1 ELSE 0 END) as in_latest,
        SUM(CASE WHEN l.cons_acct_key IS NULL THEN 1 ELSE 0 END) as missing_from_latest
    FROM max_months m
    LEFT JOIN default.latest_summary l ON m.cons_acct_key = l.cons_acct_key
""")

consistency.show()

missing = consistency.collect()[0]['missing_from_latest']
if missing == 0:
    print("PASSED: All accounts have entries in latest_summary")
else:
    print(f"FAILED: {missing} accounts missing from latest_summary")


CHECK 3: Latest Summary Consistency
+--------------+---------+-------------------+
|total_accounts|in_latest|missing_from_latest|
+--------------+---------+-------------------+
|          1000|     1000|                  0|
+--------------+---------+-------------------+

PASSED: All accounts have entries in latest_summary


In [22]:
# Check 4: Grid length
print("\nCHECK 4: Payment History Grid Length")
print("=" * 60)

grid_check = spark.sql("""
    SELECT 
        LENGTH(payment_history_grid) as grid_length,
        COUNT(*) as row_count
    FROM default.summary
    GROUP BY LENGTH(payment_history_grid)
    ORDER BY grid_length
""")

grid_check.show()

all_36 = grid_check.filter("grid_length = 36").count() == grid_check.count()
if all_36:
    print("PASSED: All payment_history_grid values have length 36")
else:
    print("WARNING: Some grids have unexpected lengths")


CHECK 4: Payment History Grid Length
+-----------+---------+
|grid_length|row_count|
+-----------+---------+
|         36|     5950|
+-----------+---------+

PASSED: All payment_history_grid values have length 36


## 5. History Array Deep Dive

In [23]:
# Verify history shifts correctly over months
print("History Array Shift Verification")
print("=" * 60)
print("For each month, index 0 should be current value, index 1 should be previous month, etc.")
print()

# Get an account with multiple months
history_df = spark.sql(f"""
    SELECT 
        rpt_as_of_mo,
        balance_am_history[0] as current_balance,
        balance_am_history[1] as prev_1_month,
        balance_am_history[2] as prev_2_month,
        balance_am_history[3] as prev_3_month,
        balance_am_history[4] as prev_4_month,
        balance_am_history[5] as prev_5_month
    FROM default.summary
    WHERE cons_acct_key = {sample_account}
    ORDER BY rpt_as_of_mo
""")

print(f"Balance history for account {sample_account}:")
history_df.show(truncate=False)

History Array Shift Verification
For each month, index 0 should be current value, index 1 should be previous month, etc.

Balance history for account 29:
+------------+---------------+------------+------------+------------+------------+------------+
|rpt_as_of_mo|current_balance|prev_1_month|prev_2_month|prev_3_month|prev_4_month|prev_5_month|
+------------+---------------+------------+------------+------------+------------+------------+
|2024-01     |225492         |NULL        |NULL        |NULL        |NULL        |NULL        |
|2024-02     |277158         |225492      |NULL        |NULL        |NULL        |NULL        |
|2024-03     |353340         |277158      |225492      |NULL        |NULL        |NULL        |
|2024-04     |306299         |353340      |277158      |225492      |NULL        |NULL        |
|2024-05     |202747         |306299      |353340      |277158      |225492      |NULL        |
|2024-06     |223818         |202747      |306299      |353340      |277158   

In [24]:
# Compare with source values
print("\nComparison with source values:")
print("=" * 60)

comparison = spark.sql(f"""
    SELECT 
        a.rpt_as_of_mo,
        a.acct_bal_am as source_balance,
        s.balance_am_history[0] as history_current
    FROM default.default.accounts_all a
    JOIN default.summary s 
        ON a.cons_acct_key = s.cons_acct_key 
        AND a.rpt_as_of_mo = s.rpt_as_of_mo
    WHERE a.cons_acct_key = {sample_account}
    ORDER BY a.rpt_as_of_mo
""")

comparison.show(truncate=False)

# Verify match
mismatches = comparison.filter("source_balance != history_current").count()
if mismatches == 0:
    print("PASSED: Source values match history[0] values")
else:
    print(f"FAILED: {mismatches} mismatches found")


Comparison with source values:
+------------+--------------+---------------+
|rpt_as_of_mo|source_balance|history_current|
+------------+--------------+---------------+
|2024-01     |225492        |225492         |
|2024-02     |277158        |277158         |
|2024-03     |353340        |353340         |
|2024-04     |306299        |306299         |
|2024-05     |202747        |202747         |
|2024-06     |223818        |223818         |
+------------+--------------+---------------+

PASSED: Source values match history[0] values


## 6. DPD and Payment Rating Analysis

In [32]:
# DPD distribution
print("DPD Distribution (current month values):")
spark.sql("""
    SELECT 
        CASE 
            WHEN days_past_due = 0 THEN '0 (Current)'
            WHEN days_past_due BETWEEN 1 AND 29 THEN '1-29 DPD'
            WHEN days_past_due BETWEEN 30 AND 59 THEN '30-59 DPD'
            WHEN days_past_due BETWEEN 60 AND 89 THEN '60-89 DPD'
            WHEN days_past_due BETWEEN 90 AND 119 THEN '90-119 DPD'
            WHEN days_past_due BETWEEN 120 AND 179 THEN '120-179 DPD'
            WHEN days_past_due >= 180 THEN '180+ DPD'
            ELSE 'Unknown'
        END as dpd_bucket,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as pct
    FROM default.summary
    WHERE rpt_as_of_mo = (SELECT MAX(rpt_as_of_mo) FROM default.summary)
    GROUP BY 1
    ORDER BY 
        1
""").show(truncate=False)

DPD Distribution (current month values):


26/01/21 16:36:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:36:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:36:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:36:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:36:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:36:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 1

+-----------+-----+-----+
|dpd_bucket |count|pct  |
+-----------+-----+-----+
|0 (Current)|268  |26.80|
|1-29 DPD   |258  |25.80|
|120-179 DPD|80   |8.00 |
|180+ DPD   |30   |3.00 |
|30-59 DPD  |184  |18.40|
|60-89 DPD  |114  |11.40|
|90-119 DPD |66   |6.60 |
+-----------+-----+-----+



In [33]:
# Payment rating distribution
print("\nPayment Rating Distribution (from grid, first character):")
spark.sql("""
    SELECT 
        SUBSTRING(payment_history_grid, 1, 1) as rating,
        CASE SUBSTRING(payment_history_grid, 1, 1)
            WHEN '0' THEN 'Current (0-29 DPD)'
            WHEN '1' THEN '30-59 DPD'
            WHEN '2' THEN '60-89 DPD'
            WHEN '3' THEN '90-119 DPD'
            WHEN '4' THEN '120-149 DPD'
            WHEN '5' THEN '150-179 DPD'
            WHEN '6' THEN '180+ DPD'
            WHEN 'S' THEN 'Standard'
            WHEN 'B' THEN 'Sub-Standard'
            WHEN 'D' THEN 'Doubtful'
            WHEN 'L' THEN 'Loss'
            WHEN '?' THEN 'Unknown'
            ELSE 'Other'
        END as description,
        COUNT(*) as count
    FROM default.summary
    WHERE rpt_as_of_mo = (SELECT MAX(rpt_as_of_mo) FROM default.summary)
    GROUP BY 1, 2
    ORDER BY count DESC
""").show(truncate=False)


Payment Rating Distribution (from grid, first character):
+------+------------------+-----+
|rating|description       |count|
+------+------------------+-----+
|0     |Current (0-29 DPD)|526  |
|1     |30-59 DPD         |184  |
|2     |60-89 DPD         |114  |
|3     |90-119 DPD        |66   |
|4     |120-149 DPD       |42   |
|5     |150-179 DPD       |38   |
|6     |180+ DPD          |30   |
+------+------------------+-----+



## 7. Interactive Query

In [35]:
# Look up any specific account
# Change this value to explore different accounts
ACCOUNT_TO_EXPLORE = 100

print(f"\n" + "=" * 60)
print(f"Account {ACCOUNT_TO_EXPLORE} Details")
print("=" * 60)

# Source data
print("\nSource Data:")
spark.sql(f"""
    SELECT 
        rpt_as_of_mo, 
        acct_bal_am as balance,
        days_past_due_ct_4in as dpd,
        actual_pymt_am as payment,
        acct_stat_cd as status
    FROM default.default.accounts_all 
    WHERE cons_acct_key = {ACCOUNT_TO_EXPLORE}
    ORDER BY rpt_as_of_mo
""").show()


Account 100 Details

Source Data:
+------------+-------+---+-------+------+
|rpt_as_of_mo|balance|dpd|payment|status|
+------------+-------+---+-------+------+
|     2024-01| 122363| 33|   2846|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 132222| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-02| 888888| 62|      0|     A|
|     2024-03|  91874| 56|   1906|     A|
|     2024-04| 158580|106|      0|     A|
|     2024-05| 141480|130|   1414|     A|

In [36]:
# Summary data for same account
print(f"\nSummary Data for Account {ACCOUNT_TO_EXPLORE}:")
spark.sql(f"""
    SELECT 
        rpt_as_of_mo,
        balance_am,
        days_past_due,
        payment_history_grid,
        SLICE(balance_am_history, 1, 6) as balance_hist_first6,
        SLICE(days_past_due_history, 1, 6) as dpd_hist_first6
    FROM default.summary
    WHERE cons_acct_key = {ACCOUNT_TO_EXPLORE}
    ORDER BY rpt_as_of_mo
""").show(truncate=False)


Summary Data for Account 100:
+------------+----------+-------------+------------------------------------+-----------------------------------------------+----------------------------------+
|rpt_as_of_mo|balance_am|days_past_due|payment_history_grid                |balance_hist_first6                            |dpd_hist_first6                   |
+------------+----------+-------------+------------------------------------+-----------------------------------------------+----------------------------------+
|2024-01     |122363    |33           |1???????????????????????????????????|[122363, NULL, NULL, NULL, NULL, NULL]         |[33, NULL, NULL, NULL, NULL, NULL]|
|2024-02     |888888    |62           |21??????????????????????????????????|[888888, 122363, NULL, NULL, NULL, NULL]       |[62, 33, NULL, NULL, NULL, NULL]  |
|2024-03     |91874     |56           |121?????????????????????????????????|[91874, 888888, 122363, NULL, NULL, NULL]      |[56, 62, 33, NULL, NULL, NULL]    |
|2024-04 

In [28]:
# Latest summary for same account
print(f"\nLatest Summary for Account {ACCOUNT_TO_EXPLORE}:")
spark.sql(f"""
    SELECT *
    FROM default.latest_summary
    WHERE cons_acct_key = {ACCOUNT_TO_EXPLORE}
""").show(vertical=True, truncate=False)


Latest Summary for Account 100:
-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 cons_acct_key                  | 100                                                                                                                                                                                                                                  
 acct_dt                        | 2024-06-01                                                                                                                                                                                                                           
 rpt_as_of_mo                   | 2024-06                                                                                                                                      

## 8. Summary Statistics

In [29]:
# Final summary stats
print("=" * 60)
print("FINAL SUMMARY STATISTICS")
print("=" * 60)

stats = spark.sql("""
    SELECT
        COUNT(*) as total_rows,
        COUNT(DISTINCT cons_acct_key) as unique_accounts,
        COUNT(DISTINCT rpt_as_of_mo) as months,
        MIN(rpt_as_of_mo) as first_month,
        MAX(rpt_as_of_mo) as last_month,
        AVG(balance_am) as avg_balance,
        AVG(days_past_due) as avg_dpd
    FROM default.summary
""")

stats.show(vertical=True, truncate=False)

FINAL SUMMARY STATISTICS
-RECORD 0----------------------------
 total_rows      | 5950              
 unique_accounts | 1000              
 months          | 6                 
 first_month     | 2024-01           
 last_month      | 2024-06           
 avg_balance     | 162689.4080672269 
 avg_dpd         | 37.53243697478992 



In [None]:
# Accounts with gaps
print("\nAccounts with Missing Months (gaps):")
spark.sql("""
    WITH account_months AS (
        SELECT 
            cons_acct_key,
            COUNT(DISTINCT rpt_as_of_mo) as month_count,
            MIN(rpt_as_of_mo) as first_month,
            MAX(rpt_as_of_mo) as last_month
        FROM default.summary
        GROUP BY cons_acct_key
    )
    SELECT 
        month_count,
        COUNT(*) as account_count
    FROM account_months
    GROUP BY month_count
    ORDER BY month_count DESC
""").show()

## 9. Custom Query Cell

Use this cell to run your own queries:

In [None]:
# Run your own query here
# Example: Find accounts with DPD > 90

your_query = """
    SELECT 
        cons_acct_key,
        rpt_as_of_mo,
        days_past_due,
        balance_am,
        payment_history_grid
    FROM default.summary
    WHERE days_past_due > 90
    ORDER BY days_past_due DESC
    LIMIT 10
"""

spark.sql(your_query).show(truncate=False)

In [None]:
# Stop Spark session when done
# spark.stop()
print("Notebook complete! Uncomment spark.stop() to close the session.")