# Data Validation System - Main Notebook

This notebook orchestrates the multi-step data validation workflow.

## Workflow Steps:
1. **Load Compare List** - Load table pairs from XLSX
2. **Row Meta Check** - Validate row counts by date partition
3. **Column Meta Check** - Validate column mappings and types
4. **Column Statistics** - Analyze column-level statistics
5. **Row Hash Check** - Compare row-level hashes

## Setup and Imports

In [1]:
import sys
from pathlib import Path

import pandas as pd
from loguru import logger

# Make the project-local packages (db, services) importable from the
# current working directory.
project_root = Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Import modules (these must follow the sys.path tweak above, hence E402)
import db  # noqa: E402
import services as svc  # noqa: E402

# Names the output directory used for the log file below and the database file.
run_name = 'demo'

# Configure logger.
# logger.add() registers a NEW sink on every call, so re-running this cell
# would write every event to events.log several times. Drop the sink that a
# previous execution of this cell registered before adding it again.
if "_log_sink_id" in globals():
    try:
        logger.remove(_log_sink_id)
    except ValueError:
        # The sink is already gone (e.g. someone called logger.remove()).
        pass
_log_sink_id = logger.add(f"files/outputs/{run_name}/events.log", rotation="1 day")

print("✓ Imports successful")

✓ Imports successful


## Initialize Database

In [2]:
# Open the database for this run; init_db is asked not to reset existing data
# (the schema is created if it does not exist yet).
database = db.init_db(
    db_path=f'files/outputs/{run_name}/data.db',
    reset=False,
)

# Report how many rows each table currently holds
stats = database.get_table_stats()
print("\nDatabase Table Stats:")
for item in stats.items():
    table, count = item
    print(f"  {table}: {count} rows")

[32m2025-11-19 06:49:00.479[0m | [1mINFO    [0m | [36mdb.database[0m:[36m__init__[0m:[36m29[0m - [1mDatabase URL: sqlite:///files/outputs/demo/data.db[0m
[32m2025-11-19 06:49:00.491[0m | [1mINFO    [0m | [36mdb.database[0m:[36minit_db[0m:[36m47[0m - [1mInitializing database schema...[0m
[32m2025-11-19 06:49:00.499[0m | [1mINFO    [0m | [36mdb.database[0m:[36minit_db[0m:[36m69[0m - [1mDatabase initialized successfully[0m



Database Table Stats:
  validation_runs_tbl: 0 rows
  compare_list_tbl: 0 rows
  row_meta_tbl: 0 rows
  cross_walk_tbl: 0 rows
  col_stat_tbl: 0 rows
  row_hash_tbl: 0 rows
  step_logs_tbl: 0 rows


## Create New Validation Run

In [3]:
# Create a new validation run, or reuse the one that already has this name.
session = db.get_session()

# NOTE: deliberately NOT called `run_name`. The setup and database cells use
# `run_name` to build the output/database paths; reassigning it here would
# make a later re-run of those cells silently point at a different directory.
validation_run_name = "validation"  # Change this for each run
category = "dpst"  # or "loan"

# Check if run already exists
existing_run = (
    session.query(db.ValidationRun)
    .filter_by(run_name=validation_run_name)
    .first()
)

if existing_run:
    print(f"Using existing run: {validation_run_name} (ID: {existing_run.run_id})")
    run = existing_run
else:
    run = db.ValidationRun(
        run_name=validation_run_name,
        category=category,
        status='pending',
        notes='Test validation run'
    )
    session.add(run)
    session.commit()
    print(f"Created new run: {validation_run_name} (ID: {run.run_id})")

# Later cells address the run through `run` and `run_id`.
run_id = run.run_id

Created new run: validation (ID: 1)


## Initialize Orchestrator

In [4]:
# Bind an orchestrator to the open session and the selected run
orchestrator = svc.ValidationOrchestrator(session, run_id)

active_run_name = orchestrator.run.run_name
print(f"Orchestrator initialized for: {active_run_name}")

current_status = orchestrator.get_status()
print(f"Status: {current_status}")

[32m2025-11-19 06:49:45.237[0m | [1mINFO    [0m | [36mservices.orchestrator[0m:[36m__init__[0m:[36m29[0m - [1mOrchestrator initialized for run: validation[0m


Orchestrator initialized for: validation
Status: {'run_id': 1, 'run_name': 'validation', 'status': 'pending', 'current_step': 0, 'updated_at': datetime.datetime(2025, 11, 19, 6, 49, 3, 701800)}


## Step 1: Load Compare List from XLSX

In [None]:
# Location of the input workbook
# TODO: Update this path to your actual input file
xlsx_path = "files/inputs/compare_list.xlsx"

# Run Step 1 through the orchestrator
result = orchestrator.execute_step(1, xlsx_path=xlsx_path)

# Report the counters returned by the step, one per line
print("\nStep 1 Results:")
total_rows = result['total_rows']
print(f"  Total rows: {total_rows}")
loaded = result['loaded']
print(f"  Loaded: {loaded}")
skipped = result['skipped']
print(f"  Skipped: {skipped}")

In [None]:
# View loaded compare list
compares = session.query(db.CompareList).filter_by(run_id=run_id).all()

# Explicit column list: when no table pairs were loaded the frame still has
# the expected columns instead of being a column-less DataFrame.
compare_columns = ['id', 'group', 'pcds_tbl', 'aws_tbl', 'partition', 'enabled']

df_compares = pd.DataFrame(
    [
        {
            'id': c.id,
            'group': c.group_name,
            'pcds_tbl': c.pcds_tbl,
            'aws_tbl': c.aws_tbl,
            'partition': c.partition,
            'enabled': c.enabled
        }
        for c in compares
    ],
    columns=compare_columns,
)

print(f"\nLoaded {len(df_compares)} table pairs:")
df_compares

## Step 2: Row Meta Check

In [None]:
# Step 2 scope: None = all tables, or [1, 2, 3] for specific tables
compare_ids = None

# Run Step 2 through the orchestrator
result = orchestrator.execute_step(2, compare_ids=compare_ids)

print("\nStep 2 Results:")
tables_processed = result['total_tables']
print(f"  Tables processed: {tables_processed}")

In [None]:
# Read this run's rows from the v_row_meta_comparison SQL view
query = "SELECT * FROM v_row_meta_comparison WHERE run_name = :run_name"
df_row_meta = pd.read_sql(
    query,
    db.engine,
    params={'run_name': run.run_name},
)

n_partitions = len(df_row_meta)
print(f"\nRow Meta Comparison ({n_partitions} partitions):")
df_row_meta

## Step 3: Column Meta Check (CrossWalk)

In [None]:
# Step 3 scope: None = all tables
compare_ids = None

# Run Step 3 through the orchestrator
result = orchestrator.execute_step(3, compare_ids=compare_ids)

print("\nStep 3 Results:")
tables_processed = result['total_tables']
print(f"  Tables processed: {tables_processed}")

In [None]:
# View crosswalk summary using SQL view
query = "SELECT * FROM v_crosswalk_summary WHERE run_name = :run_name"
df_crosswalk = pd.read_sql(query, db.engine, params={'run_name': run.run_name})

print("\nCrossWalk Summary:")
# pivot_table aggregates with the MEAN by default. `column_count` holds
# counts, so if the view ever returns more than one row for the same
# (pcds_tbl, aws_tbl, mapping_status) the default would average them.
# Summing keeps the totals correct and the values integral.
df_crosswalk.pivot_table(
    index=['pcds_tbl', 'aws_tbl'],
    columns='mapping_status',
    values='column_count',
    aggfunc='sum',
    fill_value=0
)

## Step 4: Column Statistics Analysis

### 4A: Generate SAS Code for PCDS (Manual Execution Required)

In [None]:
# TODO: Generate SAS code for PCDS column statistics
# This will create .sas files that you need to run in SAS

# Until generation exists, just remind the user of the manual procedure
sas_notes = (
    "SAS code generation not yet implemented",
    "After generating SAS code:",
    "  1. Run .sas files in SAS",
    "  2. Upload resulting CSV files using Step 4C below",
)
for note in sas_notes:
    print(note)

### 4B: Execute AWS Column Statistics (Automated)

In [None]:
# Step 4 runs for AWS only here (PCDS requires manual SAS execution)
compare_ids = None  # None = all tables
vintages = None  # None = all vintages from row_meta
columns = None  # None = all comparable columns

# Collect the filters once, then hand them to the orchestrator as keywords
step4_options = {
    'compare_ids': compare_ids,
    'vintages': vintages,
    'columns': columns,
}
result = orchestrator.execute_step(4, **step4_options)

print("\nStep 4 Results:")
tables_processed = result['total_tables']
print(f"  Tables processed: {tables_processed}")

### 4C: Upload PCDS Statistics from SAS CSV

In [None]:
# Once the SAS jobs have produced their CSV files, load them into the run
from services.step4_col_stat import ColStatAnalyzer

analyzer = ColStatAnalyzer(session, run_id)

# TODO: Update these paths to your actual CSV files from SAS
# Each entry is passed as keyword arguments to upload_pcds_stats_csv.
pcds_csv_uploads = [
    {
        'compare_id': 1,
        'vintage': '2024',
        'csv_path': 'output/pcds_stats_compare1_vintage2024.csv',
    },
    # Add more as needed
]

for upload in pcds_csv_uploads:
    result = analyzer.upload_pcds_stats_csv(**upload)
    rows_inserted = result['rows_inserted']
    print(f"Uploaded {rows_inserted} stats for compare_id={upload['compare_id']}, vintage={upload['vintage']}")

### 4D: View Column Statistics Comparison

In [None]:
# Read this run's rows from the v_col_stat_mismatch SQL view
query = "SELECT * FROM v_col_stat_mismatch WHERE run_name = :run_name"
df_stat_mismatch = pd.read_sql(
    query,
    db.engine,
    params={'run_name': run.run_name},
)

n_mismatches = len(df_stat_mismatch)
print(f"\nColumn Statistics Mismatches ({n_mismatches} found):")
df_stat_mismatch

## Step 5: Row Hash Check

In [None]:
# Step 5 filters
compare_ids = None  # None = all tables
vintages = None  # None = all vintages
sample_size = 10000  # Limit rows to check (None = all rows)

# Collect the filters once, then hand them to the orchestrator as keywords
step5_options = {
    'compare_ids': compare_ids,
    'vintages': vintages,
    'sample_size': sample_size,
}
result = orchestrator.execute_step(5, **step5_options)

print("\nStep 5 Results:")
tables_processed = result['total_tables']
print(f"  Tables processed: {tables_processed}")

In [None]:
# View hash mismatches
from db.models import RowHash

# Single source of truth for the preview size: used both by the query and by
# the heading printed below, so the two cannot drift apart.
mismatch_limit = 100

mismatches = (
    session.query(RowHash)
    .filter_by(run_id=run_id, has_match=False)
    .limit(mismatch_limit)
    .all()
)

# Explicit column list: with zero mismatches the frame still shows the
# expected columns instead of being a column-less DataFrame.
mismatch_columns = ['compare_id', 'vintage', 'platform', 'unique_ids', 'row_hash']

df_mismatches = pd.DataFrame(
    [
        {
            'compare_id': m.compare_id,
            'vintage': m.vintage,
            'platform': m.platform,
            'unique_ids': m.unique_id_json,
            'row_hash': m.row_hash
        }
        for m in mismatches
    ],
    columns=mismatch_columns,
)

print(f"\nRow Hash Mismatches (first {mismatch_limit}):")
df_mismatches

## Export Results

In [None]:
# Export all results to Excel
output_path = f"output/validation_results_{run.run_name}.xlsx"

# Nothing earlier in this notebook creates the "output/" directory, so make
# sure it exists before the export tries to write into it.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

db.export_to_excel(output_path, run_id=run_id)

print(f"Results exported to: {output_path}")

In [None]:
# Take a database backup; the value returned by backup_db() is shown as its location
backup_path = db.backup_db()

print(f"Database backed up to: {backup_path}")

## Summary

In [None]:
# Get final status
status = orchestrator.get_status()

banner = "="*60
print("\n" + banner)
print("VALIDATION RUN SUMMARY")
print(banner)
print(f"Run Name: {status['run_name']}")
print(f"Run ID: {status['run_id']}")
print(f"Status: {status['status']}")
print(f"Current Step: {status['current_step']}")
print(f"Last Updated: {status['updated_at']}")
print(banner)

# Table counts.
# Use the `database` object returned by db.init_db(), exactly as the
# "Initialize Database" cell does; that call is the one the recorded output
# shows working, whereas a module-level db.get_table_stats is not used
# anywhere else in this notebook.
stats = database.get_table_stats()
print("\nDatabase Stats:")
for table, count in stats.items():
    print(f"  {table}: {count} rows")

In [None]:
# Release the session opened in the "Create New Validation Run" cell
session.close()

print("Session closed")