# Step 5: Value-to-Value Check (Hash Comparison)

This notebook:
- Loads configuration with column mappings
- Generates normalized hash of comparable columns
- Includes unique ID columns for backtracking
- Allows filtering by partition and columns for performance
- **PCDS**: Generates SAS code to compute row hashes
- **AWS**: Generates Athena SQL to compute row hashes
- Compares hashes to find mismatched rows

**SQL Format** (Athena): `SELECT MD5(TO_UTF8(normalized_col1 || '|' || normalized_col2 || ...)) AS _hash, unique_id1, unique_id2, ... FROM table WHERE ...`

**Input**: `data/config.json`  
**Output**: SAS file, Athena SQL, mismatch report

In [None]:
import json
import pandas as pd
from datetime import datetime
from pathlib import Path

# Import normalization utilities
from utils import (
    normalize_oracle_column,
    normalize_athena_column,
    build_oracle_hash_expr,
    build_athena_hash_expr
)

# Step banner: title line followed by a 50-character rule.
print("Step 5: Value-to-Value Check (Hash Comparison)", "=" * 50, sep="\n")

## 5.1 Load Configuration

In [None]:
# Location of the run configuration this step reads (and later rewrites).
CONFIG_JSON = "data/config.json"

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(CONFIG_JSON, 'r', encoding='utf-8') as f:
    config = json.load(f)

print(f"Loaded configuration: {config['run_name']}")
print(f"Total tables: {config['metadata']['total_tables']}")
print(f"Step 4 completed: {config['status']['step4_completed']}")

# Output paths: one SAS program and one Athena SQL file, named after the run
SAS_HASH_FILE = f"output/pcds_hash_{config['run_name']}.sas"
ATHENA_HASH_FILE = f"output/aws_hash_{config['run_name']}.sql"

# Make sure the output directory exists before anything is written into it.
Path("output").mkdir(exist_ok=True)

## 5.2 Configuration - Select Tables, Columns, Partitions

Optionally filter to specific tables/columns/partitions for performance.

In [None]:
# Optional filters; leave a setting as None to process everything.
FILTER_TABLE_INDICES = None  # positions in config['table_pairs'], e.g. [0, 1, 2]
FILTER_VINTAGES = None       # vintage strings, e.g. ['2024-01-01', '2024-02-01']
FILTER_COLUMNS = None        # PCDS column names, e.g. ['CUSTOMER_ID', 'AMOUNT']
MAX_ROWS_PER_VINTAGE = None  # row cap per vintage, e.g. 100000 for a quick test

# Summarize the active settings; a falsy setting is shown as its "everything" label.
print("\n".join([
    "Hash check configuration:",
    f"  Table filter: {FILTER_TABLE_INDICES or 'All tables'}",
    f"  Vintage filter: {FILTER_VINTAGES or 'All vintages'}",
    f"  Column filter: {FILTER_COLUMNS or 'All comparable columns'}",
    f"  Max rows per vintage: {MAX_ROWS_PER_VINTAGE or 'Unlimited'}",
]))

## 5.3 Generate SAS Code for PCDS Row Hashes

In [None]:
def generate_sas_hash_code(table_pair, column_mapping, vintage='whole', filter_cols=None, max_rows=None):
    """
    Generate SAS code to compute row hashes

    The generated program is a DATA step that concatenates the normalized
    comparable columns with a '|' separator, hashes the result with MD5 and
    keeps only the hash plus the unique ID columns, followed by a
    PROC EXPORT of that dataset to CSV.

    Args:
        table_pair: Dict with table info. Reads 'pcds_tbl' (required) and the
            optional 'unique_id_cols' (comma-separated string), 'pcds_dt'
            (partition column) and 'pcds_where' (extra filter expression).
        column_mapping: Dict with column mappings. Reads
            'comparable_columns', a list of dicts carrying 'pcds_col' and
            'pcds_type'.
        vintage: Date partition or 'whole'
        filter_cols: List of PCDS columns to include (None or empty = all)
        max_rows: Maximum rows to process (None or 0 = all)

    Returns:
        str: SAS code, or a one-line SAS comment explaining why the table
        was skipped (no unique ID columns, or no comparable columns left
        after filtering).
    """
    pcds_tbl = table_pair['pcds_tbl']

    # Get comparable columns
    comparable_cols = [
        c for c in column_mapping['comparable_columns']
        if not filter_cols or c['pcds_col'] in filter_cols
    ]

    # Get unique ID columns; blank entries (e.g. from a trailing comma) are
    # dropped so they cannot end up as empty names in the KEEP statement.
    unique_id_cols = [
        name.strip()
        for name in (table_pair.get('unique_id_cols') or '').split(',')
        if name.strip()
    ]

    if not unique_id_cols:
        return f"/* Skipping {pcds_tbl} - No unique ID columns specified */\n"

    # Without any column to hash the assignment below would read
    # "_concat_str = ;", which is not valid SAS, so skip the table instead.
    if not comparable_cols:
        return f"/* Skipping {pcds_tbl} - No comparable columns selected */\n"

    # Build WHERE clause
    where_parts = []
    if vintage != 'whole' and table_pair.get('pcds_dt'):
        where_parts.append(f"{table_pair['pcds_dt']} = '{vintage}'")
    if table_pair.get('pcds_where'):
        where_parts.append(table_pair['pcds_where'])

    where_clause = " AND ".join(where_parts) if where_parts else "1=1"

    # Build normalized concatenation for hash (normalization depends on type)
    normalized_exprs = [
        normalize_oracle_column(col_info['pcds_col'], col_info['pcds_type'])
        for col_info in comparable_cols
    ]

    # Concatenate with separator
    concat_expr = " || '|' || ".join(normalized_exprs)

    # Shared suffix for the work dataset and the exported CSV file name
    dataset_suffix = f"{pcds_tbl.replace('.', '_')}_{vintage.replace('-', '')}"

    # Build SAS code
    limit_clause = f"(OBS={max_rows})" if max_rows else ""

    # NOTE(review): SAS MD5() returns a 16-byte binary value; confirm that the
    # exported CSV encodes it the same way as the Athena side (a hex format
    # such as $HEX32. may be needed before the two can be compared).
    # NOTE(review): the OUTFILE directory below is a placeholder path.
    sas_code = f"""
/* Hash computation for {pcds_tbl} - Vintage: {vintage} */
DATA work.hash_{dataset_suffix};
  SET {pcds_tbl}{limit_clause};
  WHERE {where_clause};
  
  /* Compute normalized hash */
  _concat_str = {concat_expr};
  _hash = MD5(_concat_str);
  
  /* Keep only hash and unique IDs */
  KEEP _hash {' '.join(unique_id_cols)};
RUN;

PROC EXPORT DATA=work.hash_{dataset_suffix}
  OUTFILE='/path/to/output/pcds_hash_{dataset_suffix}.csv'
  DBMS=CSV REPLACE;
RUN;
"""

    return sas_code


print("SAS hash code generation function defined")

In [None]:
# Generate SAS hash code for all tables
sas_hash_parts = [
    "/* PCDS Row Hash Computation */",
    "/* Generated: " + datetime.now().isoformat() + " */\n",
]

for idx, (table_pair, col_mapping) in enumerate(zip(config['table_pairs'], config['column_mappings'])):
    # Apply table filter
    if FILTER_TABLE_INDICES and idx not in FILTER_TABLE_INDICES:
        continue

    # Get vintages. They are de-duplicated and sorted so the generated file
    # is identical from run to run (a plain set has no stable order).
    # A table with no entry in row_meta gets no vintages and is skipped.
    if 'row_meta' in config:
        vintages = sorted(
            {
                r['vintage'] for r in config['row_meta']
                if r['pcds_tbl'] == table_pair['pcds_tbl']
            },
            key=str,
        )
    else:
        vintages = ['whole']

    # Apply vintage filter
    if FILTER_VINTAGES:
        vintages = [v for v in vintages if v in FILTER_VINTAGES]

    for vintage in vintages:
        sas_code = generate_sas_hash_code(
            table_pair,
            col_mapping,
            vintage,
            FILTER_COLUMNS,
            MAX_ROWS_PER_VINTAGE
        )
        sas_hash_parts.append(sas_code)

full_sas_hash = "\n".join(sas_hash_parts)

# Save SAS hash file (explicit encoding keeps the output platform-independent)
with open(SAS_HASH_FILE, 'w', encoding='utf-8') as f:
    f.write(full_sas_hash)

print(f"✓ SAS hash code generated: {SAS_HASH_FILE}")
print(f"  Total lines: {len(full_sas_hash.splitlines())}")
print(f"\n⚠ ACTION REQUIRED:")
print(f"  1. Copy {SAS_HASH_FILE} to your SAS server")
print(f"  2. Run the SAS program")
print(f"  3. Collect the generated CSV files")

## 5.4 Generate Athena SQL for AWS Row Hashes

In [None]:
def generate_athena_hash_sql(table_pair, column_mapping, vintage='whole', filter_cols=None, max_rows=None):
    """
    Generate Athena SQL to compute row hashes

    The generated statement selects MD5(TO_UTF8(...)) over the normalized
    comparable columns joined with a '|' separator, together with the
    unique ID columns translated to their AWS names.

    Args:
        table_pair: Dict with table info. Reads 'aws_tbl' (required) and the
            optional 'unique_id_cols' (comma-separated PCDS names), 'aws_dt'
            (partition column) and 'aws_where' (extra filter expression).
        column_mapping: Dict with column mappings. Reads
            'comparable_columns' (list of dicts carrying 'pcds_col',
            'aws_col' and 'aws_type') and, when unique IDs are configured,
            'pcds_to_aws' (PCDS name -> AWS name).
        vintage: Date partition or 'whole'
        filter_cols: List of PCDS columns to include (None or empty = all).
            These are PCDS names, the same list the SAS generator receives,
            so both sides hash the same set of columns.
        max_rows: Maximum rows to process (None or 0 = all)

    Returns:
        str: Athena SQL, or a one-line SQL comment explaining why the table
        was skipped (no unique ID columns, or no comparable columns left
        after filtering).
    """
    aws_tbl = table_pair['aws_tbl']

    # Get comparable columns. The filter is matched on the PCDS name of each
    # entry, exactly like the SAS generator, so the two hashes always cover
    # the same columns in the same order.
    comparable_cols = [
        c for c in column_mapping['comparable_columns']
        if not filter_cols or c['pcds_col'] in filter_cols
    ]

    # Get unique ID columns (map to AWS); blank entries are dropped. A PCDS
    # name without a mapping falls back to its lower-cased form.
    pcds_ids = [
        name.strip()
        for name in (table_pair.get('unique_id_cols') or '').split(',')
        if name.strip()
    ]
    unique_id_cols = []
    if pcds_ids:
        pcds_to_aws = column_mapping['pcds_to_aws']
        unique_id_cols = [pcds_to_aws.get(pid, pid.lower()) for pid in pcds_ids]

    if not unique_id_cols:
        return f"-- Skipping {aws_tbl} - No unique ID columns specified\n"

    # Without any column to hash the query would contain "MD5(TO_UTF8())",
    # which is not valid SQL, so skip the table instead.
    if not comparable_cols:
        return f"-- Skipping {aws_tbl} - No comparable columns selected\n"

    # Build WHERE clause
    where_parts = []
    if vintage != 'whole' and table_pair.get('aws_dt'):
        where_parts.append(f"{table_pair['aws_dt']} = '{vintage}'")
    if table_pair.get('aws_where'):
        where_parts.append(table_pair['aws_where'])

    where_clause = " AND ".join(where_parts) if where_parts else "1=1"

    # Build normalized concatenation for hash (normalization depends on type)
    normalized_exprs = [
        normalize_athena_column(col_info['aws_col'], col_info['aws_type'])
        for col_info in comparable_cols
    ]

    # Concatenate with separator
    concat_expr = " || '|' || ".join(normalized_exprs)

    # Build unique ID select
    unique_id_select = ",\n  ".join(unique_id_cols)

    # Build SQL
    # NOTE(review): LIMIT without ORDER BY may not pick the same rows as the
    # SAS OBS= option; confirm row-capped runs are compared only as a smoke test.
    # NOTE(review): MD5() yields a varbinary here; confirm its CSV rendering
    # matches the encoding of the SAS-side hash before comparing.
    limit_clause = f"LIMIT {max_rows}" if max_rows else ""

    sql = f"""
-- Hash computation for {aws_tbl} - Vintage: {vintage}
SELECT
  MD5(TO_UTF8({concat_expr})) AS _hash,
  {unique_id_select}
FROM {aws_tbl}
WHERE {where_clause}
{limit_clause};
"""

    return sql


print("Athena hash SQL generation function defined")

In [None]:
# Generate Athena hash SQL for all tables
athena_hash_parts = [
    "-- AWS Athena Row Hash Computation",
    "-- Generated: " + datetime.now().isoformat(),
    "",
]

# One record per generated query (table, vintage, SQL text)
all_hash_queries = []

for idx, (table_pair, col_mapping) in enumerate(zip(config['table_pairs'], config['column_mappings'])):
    # Apply table filter
    if FILTER_TABLE_INDICES and idx not in FILTER_TABLE_INDICES:
        continue

    # Get vintages. They are de-duplicated and sorted so the generated file
    # is identical from run to run (a plain set has no stable order).
    # A table with no entry in row_meta gets no vintages and is skipped.
    if 'row_meta' in config:
        vintages = sorted(
            {
                r['vintage'] for r in config['row_meta']
                if r['aws_tbl'] == table_pair['aws_tbl']
            },
            key=str,
        )
    else:
        vintages = ['whole']

    # Apply vintage filter
    if FILTER_VINTAGES:
        vintages = [v for v in vintages if v in FILTER_VINTAGES]

    for vintage in vintages:
        sql = generate_athena_hash_sql(
            table_pair,
            col_mapping,
            vintage,
            FILTER_COLUMNS,
            MAX_ROWS_PER_VINTAGE
        )
        athena_hash_parts.append(sql)
        athena_hash_parts.append("")

        all_hash_queries.append({
            'table': table_pair['aws_tbl'],
            'vintage': vintage,
            'sql': sql
        })

full_athena_hash = "\n".join(athena_hash_parts)

# Save Athena hash SQL file (explicit encoding keeps the output platform-independent)
with open(ATHENA_HASH_FILE, 'w', encoding='utf-8') as f:
    f.write(full_athena_hash)

print(f"✓ Athena hash SQL generated: {ATHENA_HASH_FILE}")
print(f"  Total queries: {len(all_hash_queries)}")
print(f"  Total lines: {len(full_athena_hash.splitlines())}")

## 5.5 Execute Athena Hash Queries (Optional)

In [None]:
# Athena execution is not implemented in this cell. As in Step 4, either
# implement execute_athena_query or run the SQL from the AWS console.

# List the available ways of running the generated queries.
print("\n".join([
    "\n⚠ Athena hash queries can be executed:",
    "  1. Programmatically (implement boto3 integration)",
    "  2. Copy/paste from SQL file to Athena console",
    "  3. Review and approve SQL file first",
]))

## 5.6 Load and Compare Hash Results

After collecting the PCDS and AWS hash CSVs, compare them to find mismatched rows.

In [None]:
def compare_hashes(pcds_csv, aws_csv, unique_id_cols):
    """
    Compare PCDS and AWS hash CSVs to find mismatches

    Both files are outer-joined on the unique ID columns. Rows present on
    both sides are then split into matching and mismatching hashes. The row
    counts of both inputs are printed as a side effect.

    Args:
        pcds_csv: Path to PCDS hash CSV
        aws_csv: Path to AWS hash CSV
        unique_id_cols: Unique ID column name(s) present in both CSVs; a
            list/tuple of names, or a single name as a string

    Returns:
        dict: Comparison results with the keys 'total_pcds', 'total_aws',
        'in_both', 'pcds_only', 'aws_only', 'hash_match', 'hash_mismatch'
        (all plain ints) and 'mismatch_rows' (list of dicts holding the
        unique IDs plus '_hash_pcds' and '_hash_aws').

    Raises:
        KeyError: If a unique ID column or '_hash' is missing from a CSV.
    """
    # Normalize to a list: merge() treats a tuple as one single label, and
    # the column selection further down needs list concatenation.
    if isinstance(unique_id_cols, str):
        id_cols = [unique_id_cols]
    else:
        id_cols = list(unique_id_cols)

    # Read the hash as text. With type inference an all-digit digest column
    # would be parsed as a number, dropping leading zeros and making
    # different hashes compare equal.
    df_pcds = pd.read_csv(pcds_csv, dtype={'_hash': str})
    df_aws = pd.read_csv(aws_csv, dtype={'_hash': str})

    # Fail early with a message that names the offending file
    for label, frame in (('PCDS', df_pcds), ('AWS', df_aws)):
        missing = [col for col in id_cols + ['_hash'] if col not in frame.columns]
        if missing:
            raise KeyError(f"{label} hash CSV is missing required column(s): {missing}")

    print(f"  PCDS rows: {len(df_pcds)}")
    print(f"  AWS rows: {len(df_aws)}")

    # Merge on unique IDs; the indicator column records which side(s) had the row
    df_merged = df_pcds.merge(
        df_aws,
        on=id_cols,
        how='outer',
        suffixes=('_pcds', '_aws'),
        indicator=True
    )

    # Analyze results
    both = df_merged[df_merged['_merge'] == 'both']
    pcds_only = df_merged[df_merged['_merge'] == 'left_only']
    aws_only = df_merged[df_merged['_merge'] == 'right_only']

    # Among rows in both, find hash mismatches
    hash_match = both['_hash_pcds'] == both['_hash_aws']
    hash_mismatch = both[~hash_match]

    results = {
        'total_pcds': len(df_pcds),
        'total_aws': len(df_aws),
        'in_both': len(both),
        'pcds_only': len(pcds_only),
        'aws_only': len(aws_only),
        # int() turns the numpy integer into a plain int so the results can
        # be serialized with json
        'hash_match': int(hash_match.sum()),
        'hash_mismatch': len(hash_mismatch),
        'mismatch_rows': hash_mismatch[id_cols + ['_hash_pcds', '_hash_aws']].to_dict('records')
    }

    return results


# Example usage (uncomment when CSVs are ready)
# results = compare_hashes(
#     'output/pcds_hash_table1_whole.csv',
#     'output/aws_hash_table1_whole.csv',
#     ['customer_id', 'transaction_id']
# )
# print(f"Hash matches: {results['hash_match']}")
# print(f"Hash mismatches: {results['hash_mismatch']}")

print("Hash comparison function defined")
print("\nLoad CSV files after SAS and Athena complete to compare hashes")

## 5.7 Update Configuration JSON

In [None]:
# Update config: record what was generated and with which filters
config['row_hash_check'] = {
    'sas_file': SAS_HASH_FILE,
    'athena_sql_file': ATHENA_HASH_FILE,
    'filter_tables': FILTER_TABLE_INDICES,
    'filter_vintages': FILTER_VINTAGES,
    'filter_columns': FILTER_COLUMNS,
    'max_rows_per_vintage': MAX_ROWS_PER_VINTAGE,
    'status': 'code_generated_awaiting_execution'
}
config['status']['step5_completed'] = True
config['status']['step5_completed_at'] = datetime.now().isoformat()

# Save updated config. Serialize before opening the file: opening in 'w'
# mode truncates it, so a serialization error raised afterwards would leave
# a broken config on disk.
_config_text = json.dumps(config, indent=2, default=str)
with open(CONFIG_JSON, 'w', encoding='utf-8') as f:
    f.write(_config_text)

print(f"\n✓ Configuration updated and saved to: {CONFIG_JSON}")
print(f"\n✓ Step 5 Complete - All validation steps finished!")
print(f"\n⚠ Next steps:")
print(f"  1. Execute SAS hash code on SAS server")
print(f"  2. Execute Athena hash queries")
print(f"  3. Collect CSV results")
print(f"  4. Run hash comparison using compare_hashes() function")
print(f"  5. Investigate any mismatched rows using unique IDs")