# Step 2: Row Meta Check

This notebook:
- Loads configuration from Step 1
- Queries row counts for each partition/vintage
- Compares PCDS vs AWS row counts
- Updates JSON with results

**Input**: `data/config.json`  
**Output**: Updated `data/config.json` with row_meta results

In [None]:
import json
import pandas as pd
from datetime import datetime
from collections import defaultdict

# Banner so the start of this step is easy to spot in the notebook output.
print("Step 2: Row Meta Check", "=" * 50, sep="\n")

## 2.1 Load Configuration

In [None]:
# Location of the shared configuration file; this notebook reads it here and
# writes the updated version back to the same path at the end.
CONFIG_JSON = "data/config.json"

# Read the file in one go and parse the JSON text into a dict.
with open(CONFIG_JSON) as config_file:
    config = json.loads(config_file.read())

# Echo a few fields so the reader can confirm the expected run was loaded.
print(f"Loaded configuration: {config['run_name']}")
print(f"Total tables: {config['metadata']['total_tables']}")
print(f"Step 1 completed: {config['status']['step1_completed']}")

## 2.2 Define Row Count Query Functions

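These functions need to be implemented with actual database connections. Until then, the row count functions return fixed placeholder values, so the comparison results below are not meaningful.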

In [None]:
def get_pcds_row_count(table_name, date_col, vintage, partition_type, where_clause=None):
    """
    Return the PCDS (Oracle) row count for one table and vintage.

    This is currently a stub: it logs the request, waits briefly to mimic a
    query, and returns a fixed count. Only ``table_name`` and ``vintage`` are
    used (for the log line); the other arguments are accepted but ignored.

    Args:
        table_name: PCDS table name
        date_col: Date column name
        vintage: Date partition value (e.g., '2024-01-01') or 'whole'
        partition_type: 'month', 'year', 'whole'
        where_clause: Additional WHERE conditions

    Returns:
        tuple: (row_count, query_time_seconds)
    """
    import time

    # TODO: Implement actual Oracle query
    # Example SQL template:
    # SELECT COUNT(*) FROM {table_name}
    # WHERE {date_col} = TO_DATE('{vintage}', 'YYYY-MM-DD')
    # AND {where_clause}

    # Placeholder values - replace with the real query result and timing.
    placeholder_count = 1000
    simulated_seconds = 0.1

    print(f"  [PCDS] Querying {table_name} for {vintage}...")
    time.sleep(simulated_seconds)  # Simulate query time

    return placeholder_count, simulated_seconds


def get_aws_row_count(table_name, date_col, vintage, partition_type, where_clause=None):
    """
    Return the AWS Athena row count for one table and vintage.

    This is currently a stub: it logs the request, waits briefly to mimic a
    query, and returns a fixed count. Only ``table_name`` and ``vintage`` are
    used (for the log line); the other arguments are accepted but ignored.

    Args:
        table_name: AWS table name
        date_col: Date column name
        vintage: Date partition value or 'whole'
        partition_type: 'month', 'year', 'whole'
        where_clause: Additional WHERE conditions

    Returns:
        tuple: (row_count, query_time_seconds)
    """
    import time

    # TODO: Implement actual Athena query
    # Example SQL template:
    # SELECT COUNT(*) FROM {table_name}
    # WHERE {date_col} = '{vintage}'
    # AND {where_clause}

    # Placeholder values - replace with the real query result and timing.
    placeholder_count = 1000
    simulated_seconds = 0.1

    print(f"  [AWS]  Querying {table_name} for {vintage}...")
    time.sleep(simulated_seconds)  # Simulate query time

    return placeholder_count, simulated_seconds


def _parse_vintage_bound(value):
    """Parse a start/end bound into a datetime; return None when it is missing or unusable."""
    if value is None:
        return None
    try:
        # Only the leading YYYY-MM-DD part is used; any time-of-day suffix is ignored.
        return datetime.strptime(str(value).strip()[:10], '%Y-%m-%d')
    except ValueError:
        return None


def get_vintages(table_pair, partition_type):
    """
    Determine which vintages/partitions to check

    For 'month' and 'year' partitions the vintages are generated from the
    table pair's ``start_dt`` and ``end_dt`` (both inclusive). When either
    bound is missing or cannot be parsed, or the partition type is not one of
    the known values, the previous placeholder list is returned so existing
    runs keep working.

    NOTE(review): assumes ``start_dt``/``end_dt`` begin with 'YYYY-MM-DD' and
    that a yearly vintage is represented by January 1st of that year - confirm
    against the Step 1 output and the real query templates.

    Args:
        table_pair: Dict with start_dt, end_dt, partition info
        partition_type: 'month', 'year', 'whole'

    Returns:
        list: List of vintage values to check. Each generated value is a
        'YYYY-MM-01' string; the list is empty when end_dt precedes start_dt.
    """
    if partition_type == 'whole':
        return ['whole']

    # Fallback used whenever a real date range cannot be derived.
    placeholder_vintages = ['2024-01-01', '2024-02-01', '2024-03-01']

    if partition_type == 'month':
        months_per_step = 1
    elif partition_type == 'year':
        months_per_step = 12
    else:
        return placeholder_vintages

    bounds = table_pair if isinstance(table_pair, dict) else {}
    range_start = _parse_vintage_bound(bounds.get('start_dt'))
    range_end = _parse_vintage_bound(bounds.get('end_dt'))
    if range_start is None or range_end is None:
        return placeholder_vintages

    # Snap the start to the beginning of its period: first of the month for
    # monthly partitions, January for yearly ones.
    year = range_start.year
    month = range_start.month if partition_type == 'month' else 1

    vintages = []
    while (year, month) <= (range_end.year, range_end.month):
        vintages.append(f"{year:04d}-{month:02d}-01")
        month += months_per_step
        if month > 12:
            month -= 12
            year += 1
    return vintages


# Confirms the cell ran; the message is a reminder that the query functions
# defined above are still stubs.
print("Query functions defined (need implementation)")

## 2.3 Run Row Count Checks

In [None]:
# One record is appended here for every (table pair, vintage) comparison.
row_meta_results = []

table_pairs = config['table_pairs']

for table_index, table_pair in enumerate(table_pairs):
    # Progress header uses a 1-based position; the stored index stays 0-based.
    print(f"\n[{table_index + 1}/{len(table_pairs)}] Processing:")
    pcds_tbl = table_pair['pcds_tbl']
    print(f"  PCDS: {pcds_tbl}")
    aws_tbl = table_pair['aws_tbl']
    print(f"  AWS:  {aws_tbl}")

    # A pair without an explicit partition setting is checked as a whole table.
    partition_type = table_pair.get('partition', 'whole')
    vintages = get_vintages(table_pair, partition_type)

    print(f"  Vintages: {len(vintages)}")

    for vintage in vintages:
        # PCDS side first, then AWS, each with its own date column and filter.
        pcds_count, pcds_time = get_pcds_row_count(
            pcds_tbl,
            table_pair.get('pcds_dt'),
            vintage,
            partition_type,
            table_pair.get('pcds_where'),
        )
        aws_count, aws_time = get_aws_row_count(
            aws_tbl,
            table_pair.get('aws_dt'),
            vintage,
            partition_type,
            table_pair.get('aws_where'),
        )

        # diff is positive when AWS holds more rows than PCDS.
        match = pcds_count == aws_count
        diff = aws_count - pcds_count

        row_meta_results.append({
            'table_index': table_index,
            'pcds_tbl': pcds_tbl,
            'aws_tbl': aws_tbl,
            'vintage': vintage,
            'pcds_count': pcds_count,
            'aws_count': aws_count,
            'match': match,
            'diff': diff,
            'pcds_query_time': pcds_time,
            'aws_query_time': aws_time,
        })

        if not match:
            print(f"    ⚠ Mismatch at {vintage}: PCDS={pcds_count}, AWS={aws_count}, diff={diff}")

print(f"\n✓ Row meta check complete: {len(row_meta_results)} checks performed")

## 2.4 Summary Report

In [None]:
# Convert to DataFrame for analysis
df_results = pd.DataFrame(row_meta_results)

print("Row Meta Check Summary:")
print("=" * 50)
print(f"Total checks: {len(df_results)}")

if df_results.empty:
    # With no records the frame has no columns at all, so the column lookups
    # in the other branch would raise KeyError. Report the situation instead.
    print("No row count checks were performed - nothing to summarize.")
else:
    # Boolean mask of the rows where the two counts differ; computed once and
    # reused for the count, the any() test, and the row selection.
    mismatch_mask = ~df_results['match']

    print(f"Matches: {df_results['match'].sum()}")
    print(f"Mismatches: {mismatch_mask.sum()}")
    print(f"\nTotal PCDS rows: {df_results['pcds_count'].sum():,}")
    print(f"Total AWS rows: {df_results['aws_count'].sum():,}")
    print(f"Total diff: {df_results['diff'].sum():,}")

    # Show mismatches if any
    if mismatch_mask.any():
        print("\nMismatches:")
        display(df_results.loc[mismatch_mask, ['pcds_tbl', 'aws_tbl', 'vintage', 'pcds_count', 'aws_count', 'diff']])
    else:
        print("\n✓ All row counts match!")

## 2.5 Update Configuration JSON

In [None]:
# Add row_meta results to config
config['row_meta'] = row_meta_results
config['status']['step2_completed'] = True
config['status']['step2_completed_at'] = datetime.now().isoformat()

# Serialize before opening the file for writing. Opening with 'w' truncates
# the file immediately, so a serialization error raised part-way through a
# streaming json.dump would leave the shared config file incomplete.
# default=str stringifies any value the JSON encoder cannot handle natively.
serialized_config = json.dumps(config, indent=2, default=str)

# Save updated config
with open(CONFIG_JSON, 'w') as f:
    f.write(serialized_config)

print(f"\n✓ Configuration updated and saved to: {CONFIG_JSON}")
print(f"\n✓ Step 2 Complete - Ready for Step 3")