# Step 4: Column Statistics Comparison

This notebook:
- Loads configuration with column mappings from Step 3
- **PCDS**: Generates SAS code to compute statistics for all comparable columns
  - You run the generated SAS file on the SAS server (as a background job)
  - SAS emails back a CSV file with the statistics
  - You then load the CSV back into this notebook
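- **AWS**: Generates Athena SQL queries for the same statistics and, optionally, executes them
  - Queries can be submitted directly from the notebook once `execute_athena_query` is implemented (it is currently a mock)
  - Results are fetched per query (the commented-out loop in section 4.4 runs them sequentially, not in parallel)
- Compares PCDS vs AWS statistics (currently a placeholder until both result sets are loaded)
- Updates the JSON configuration with the generated file paths and the step status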

**Input**: `data/config.json`  
**Output**: SAS file, Athena SQL, updated `data/config.json`

In [None]:
import json
import pandas as pd
from datetime import datetime
from pathlib import Path

# Announce the step, followed by a 50-character rule, as two output lines.
print("Step 4: Column Statistics Comparison", "=" * 50, sep="\n")

## 4.1 Load Configuration

In [None]:
CONFIG_JSON = "data/config.json"
OUTPUT_DIR = "output"  # single place to change where generated files are written

# Read the shared pipeline configuration. The encoding is pinned so the JSON
# file is decoded the same way on every platform instead of depending on the
# locale default.
with open(CONFIG_JSON, 'r', encoding='utf-8') as f:
    config = json.load(f)

print(f"Loaded configuration: {config['run_name']}")
print(f"Total tables: {config['metadata']['total_tables']}")
print(f"Step 3 completed: {config['status']['step3_completed']}")

# Output paths: both generated files carry the run name in their file name.
SAS_OUTPUT_FILE = f"{OUTPUT_DIR}/pcds_stats_{config['run_name']}.sas"
ATHENA_SQL_FILE = f"{OUTPUT_DIR}/aws_stats_{config['run_name']}.sql"

# Create the output directory up front so the later write cells cannot fail on it.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

## 4.2 Generate SAS Code for PCDS Statistics

Generate a single SAS file that computes statistics for all comparable columns across all tables.

In [None]:
def _sas_identifier(text):
    """Replace every character that is not an ASCII letter, digit or '_' with '_'."""
    allowed = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_"
    return "".join(ch if ch in allowed else "_" for ch in text)


def _sas_literal(text):
    """Escape text for use inside a single-quoted SAS string literal."""
    return text.replace("'", "''")


def generate_sas_statistics_code(table_pair, column_mapping, vintage='whole'):
    """
    Generate SAS PROC SQL code that computes per-column statistics for a table.

    One SELECT is emitted per comparable column and the SELECTs are combined
    with UNION ALL into a single ``work.stats_<table>_<vintage>`` dataset.

    Args:
        table_pair: Dict with table info. Reads 'pcds_tbl' (required) and the
            optional 'pcds_dt' (date column used for the vintage filter) and
            'pcds_where' (extra filter, inserted verbatim).
        column_mapping: Dict whose 'comparable_columns' entry is a list of
            dicts, each providing a 'pcds_col' name.
        vintage: Date partition value, or 'whole' for no date filter.

    Returns:
        str: SAS code. When there are no comparable columns, a SAS comment
        noting that the table was skipped (an empty UNION would not be valid
        PROC SQL).
    """
    pcds_tbl = table_pair['pcds_tbl']
    comparable_cols = [c['pcds_col'] for c in column_mapping['comparable_columns']]

    # Nothing to compute: emit a comment instead of "FROM ( );", which SAS rejects.
    if not comparable_cols:
        return (
            f"\n/* Statistics for {pcds_tbl} - Vintage: {vintage}: "
            "skipped, no comparable columns */\n"
        )

    tbl_literal = _sas_literal(pcds_tbl)
    vintage_literal = _sas_literal(vintage)

    # Build WHERE clause
    where_parts = []
    if vintage != 'whole' and table_pair.get('pcds_dt'):
        where_parts.append(f"{table_pair['pcds_dt']} = '{vintage_literal}'")
    if table_pair.get('pcds_where'):
        where_parts.append(table_pair['pcds_where'])

    where_clause = " AND ".join(where_parts) if where_parts else "1=1"

    # Dataset name: dots become underscores, dashes are dropped (as before), and
    # any other character that is illegal in a SAS name is replaced by '_'.
    # NOTE(review): SAS dataset names are presumably limited to 32 characters;
    # long table names may still need shortening - confirm on the SAS server.
    dataset_name = "stats_{}_{}".format(
        _sas_identifier(pcds_tbl),
        _sas_identifier(vintage.replace('-', '')),
    )

    # Generate SAS code
    sas_code = f"""
/* Statistics for {pcds_tbl} - Vintage: {vintage} */
PROC SQL;
  CREATE TABLE work.{dataset_name} AS
  SELECT
    '{tbl_literal}' AS table_name,
    '{vintage_literal}' AS vintage,
    variable,
    data_type,
    count,
    distinct_count,
    max_val,
    min_val,
    mean,
    std,
    missing,
    sum_val,
    sum_sq
  FROM (
"""

    # Generate statistics for each column.
    # NOTE(review): MEAN/STD/SUM are emitted for every column regardless of its
    # type; character columns presumably need separate handling - confirm.
    col_stats = []
    for col in comparable_cols:
        col_stat = f"""
    SELECT
      '{_sas_literal(col)}' AS variable,
      'TBD' AS data_type,
      COUNT({col}) AS count,
      COUNT(DISTINCT {col}) AS distinct_count,
      MAX({col}) AS max_val,
      MIN({col}) AS min_val,
      MEAN({col}) AS mean,
      STD({col}) AS std,
      SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) AS missing,
      SUM({col}) AS sum_val,
      SUM({col} * {col}) AS sum_sq
    FROM {pcds_tbl}
    WHERE {where_clause}
"""
        col_stats.append(col_stat)

    sas_code += "\n    UNION ALL\n".join(col_stats)
    sas_code += "\n  );\nQUIT;\n"

    return sas_code


print("SAS code generation function defined")

In [None]:
# Generate SAS code for all tables
sas_code_parts = [
    "/* PCDS Column Statistics */",
    "/* Generated: " + datetime.now().isoformat() + " */\n",
]

for table_pair, col_mapping in zip(config['table_pairs'], config['column_mappings']):
    # Vintages come from config['row_meta'] when it has rows for this table.
    # They are de-duplicated and sorted so the generated file is identical
    # from run to run (a plain set has no stable order).
    vintages = sorted({
        r['vintage'] for r in (config.get('row_meta') or [])
        if r['pcds_tbl'] == table_pair['pcds_tbl']
    })
    # Fall back to a single unfiltered pass so a table without row_meta
    # entries is not silently left out of the statistics.
    if not vintages:
        vintages = ['whole']

    for vintage in vintages:
        sas_code_parts.append(
            generate_sas_statistics_code(table_pair, col_mapping, vintage)
        )

# Combine all results and export.
# The e-mail subject embeds &sysdate inside the double-quoted string: macro
# variables resolve there, whereas "||" is not valid in a FILENAME statement.
sas_code_parts.append("""
/* Combine all results */
DATA work.all_stats;
  SET work.stats_:;
RUN;

/* Export to CSV */
PROC EXPORT DATA=work.all_stats
  OUTFILE='/path/to/output/pcds_column_stats.csv'
  DBMS=CSV REPLACE;
RUN;

/* Email results */
filename mailout email
  to="your.email@company.com"
  subject="PCDS Column Statistics - &sysdate"
  attach="/path/to/output/pcds_column_stats.csv";

data _null_;
  file mailout;
  put "PCDS column statistics completed.";
  put "See attached CSV file.";
run;
""")

full_sas_code = "\n".join(sas_code_parts)

# Save SAS file
with open(SAS_OUTPUT_FILE, 'w') as f:
    f.write(full_sas_code)

print(f"✓ SAS code generated: {SAS_OUTPUT_FILE}")
print(f"  Total lines: {len(full_sas_code.splitlines())}")
print("\n⚠ ACTION REQUIRED:")
print(f"  1. Copy {SAS_OUTPUT_FILE} to your SAS server")
print("  2. Run the SAS program")
print("  3. Wait for email with CSV attachment")
print("  4. Continue to next cell to load AWS statistics")

## 4.3 Generate and Execute Athena SQL for AWS Statistics

Generate the Athena SQL queries and save them to a file. Executing them from the notebook is covered in section 4.4.

In [None]:
def generate_athena_statistics_sql(table_pair, column_mapping, vintage='whole'):
    """
    Generate Athena SQL that computes per-column statistics for a table.

    One SELECT is emitted per comparable column and the SELECTs are joined
    with UNION ALL, preceded by a one-line header comment.

    Args:
        table_pair: Dict with table info. Reads 'aws_tbl' (required) and the
            optional 'aws_dt' (date column used for the vintage filter) and
            'aws_where' (extra filter, inserted verbatim).
        column_mapping: Dict whose 'comparable_columns' entry is a list of
            dicts, each providing 'aws_col' and 'aws_type'.
        vintage: Date partition value, or 'whole' for no date filter.

    Returns:
        str: Athena SQL. Only the header comment when there are no
        comparable columns.
    """
    def _lit(text):
        # Double any single quote so the value stays one SQL string literal.
        return str(text).replace("'", "''")

    aws_tbl = table_pair['aws_tbl']
    comparable_cols = column_mapping['comparable_columns']

    # Build WHERE clause
    where_parts = []
    if vintage != 'whole' and table_pair.get('aws_dt'):
        where_parts.append(f"{table_pair['aws_dt']} = '{_lit(vintage)}'")
    if table_pair.get('aws_where'):
        where_parts.append(table_pair['aws_where'])

    where_clause = " AND ".join(where_parts) if where_parts else "1=1"

    # Generate SQL for each column
    col_selects = []
    for col_info in comparable_cols:
        aws_col = col_info['aws_col']
        aws_type = col_info['aws_type']

        # Numeric view of the column used by mean/std/sum:
        # - date/time types cannot be cast to DOUBLE at all, so they get NULL
        #   rather than a query that fails for the whole table;
        # - everything else uses TRY_CAST, which yields NULL for values that
        #   are not numeric instead of aborting the whole UNION ALL query.
        if str(aws_type).strip().lower().startswith(('date', 'time')):
            numeric = "CAST(NULL AS DOUBLE)"
        else:
            numeric = f"TRY_CAST({aws_col} AS DOUBLE)"

        col_select = f"""
  SELECT
    '{_lit(aws_tbl)}' AS table_name,
    '{_lit(vintage)}' AS vintage,
    '{_lit(aws_col)}' AS variable,
    '{_lit(aws_type)}' AS data_type,
    COUNT({aws_col}) AS count,
    COUNT(DISTINCT {aws_col}) AS distinct_count,
    CAST(MAX({aws_col}) AS VARCHAR) AS max_val,
    CAST(MIN({aws_col}) AS VARCHAR) AS min_val,
    AVG({numeric}) AS mean,
    STDDEV({numeric}) AS std,
    SUM(CASE WHEN {aws_col} IS NULL THEN 1 ELSE 0 END) AS missing,
    SUM({numeric}) AS sum_val,
    SUM({numeric} * {numeric}) AS sum_sq
  FROM {aws_tbl}
  WHERE {where_clause}
"""
        col_selects.append(col_select)

    sql = "\nUNION ALL\n".join(col_selects)
    sql = f"-- Statistics for {aws_tbl} - Vintage: {vintage}\n" + sql

    return sql


print("Athena SQL generation function defined")

In [None]:
# Generate Athena SQL for all tables
athena_sql_parts = [
    "-- AWS Athena Column Statistics",
    "-- Generated: " + datetime.now().isoformat(),
    "",
]

all_queries = []

for table_pair, col_mapping in zip(config['table_pairs'], config['column_mappings']):
    # A table without comparable columns would produce a comment-only
    # statement; skip it rather than queue a query with nothing to run.
    if not col_mapping['comparable_columns']:
        print(f"  ⚠ Skipping {table_pair['aws_tbl']}: no comparable columns")
        continue

    # Vintages come from config['row_meta'] when it has rows for this table.
    # They are de-duplicated and sorted so the generated file and the query
    # order are identical from run to run (a plain set has no stable order).
    vintages = sorted({
        r['vintage'] for r in (config.get('row_meta') or [])
        if r['aws_tbl'] == table_pair['aws_tbl']
    })
    # Fall back to a single unfiltered pass so a table without row_meta
    # entries is not silently left out of the statistics.
    if not vintages:
        vintages = ['whole']

    for vintage in vintages:
        sql = generate_athena_statistics_sql(table_pair, col_mapping, vintage)
        athena_sql_parts.append(sql)
        athena_sql_parts.append(";\n")

        # Store for execution
        all_queries.append({
            'table': table_pair['aws_tbl'],
            'vintage': vintage,
            'sql': sql
        })

full_athena_sql = "\n".join(athena_sql_parts)

# Save Athena SQL file; the encoding is pinned so the file content does not
# depend on the platform's locale default.
with open(ATHENA_SQL_FILE, 'w', encoding='utf-8') as f:
    f.write(full_athena_sql)

print(f"✓ Athena SQL generated: {ATHENA_SQL_FILE}")
print(f"  Total queries: {len(all_queries)}")
print(f"  Total lines: {len(full_athena_sql.splitlines())}")

## 4.4 Execute Athena Queries (Optional - Requires AWS Setup)

Execute Athena queries programmatically. You can also review the SQL file first.

In [None]:
# Athena execution function (requires boto3)
def execute_athena_query(sql, database='default', s3_output='s3://your-bucket/athena-results/'):
    """
    Placeholder for running one Athena query.

    The current body does not contact AWS: it prints a mock notice and hands
    back an empty result frame with the statistics columns.

    Args:
        sql: SQL query string (unused by the mock)
        database: Athena database name (unused by the mock)
        s3_output: S3 path for query results (unused by the mock)

    Returns:
        DataFrame with the 13 statistics columns and no rows
    """
    # TODO: Implement with boto3
    # import boto3
    # client = boto3.client('athena')
    # response = client.start_query_execution(...)
    # Wait for completion and fetch results

    print("  [MOCK] Executing Athena query...")

    # Same column layout the generated statistics queries select.
    stat_columns = (
        'table_name', 'vintage', 'variable', 'data_type',
        'count', 'distinct_count', 'max_val', 'min_val',
        'mean', 'std', 'missing', 'sum_val', 'sum_sq',
    )
    return pd.DataFrame({name: [] for name in stat_columns})


# Execute all queries (uncomment when ready)
# aws_stats_results = []
# for query in all_queries:
#     print(f"Executing: {query['table']} - {query['vintage']}")
#     df = execute_athena_query(query['sql'])
#     aws_stats_results.append(df)
# 
# df_aws_stats = pd.concat(aws_stats_results, ignore_index=True)
# print(f"✓ AWS statistics collected: {len(df_aws_stats)} rows")

# Remind the reader of the ways the generated SQL can be run.
print("\n".join([
    "\n⚠ Athena execution is optional here.",
    "  You can either:",
    "  1. Execute queries programmatically (implement execute_athena_query)",
    "  2. Copy/paste SQL from file to Athena console",
    "  3. Use AWS CLI or other tools",
]))

## 4.5 Load PCDS Statistics from CSV

After receiving the CSV from SAS, load it here.

In [None]:
# Load PCDS CSV (after receiving from SAS)
PCDS_STATS_CSV = "output/pcds_column_stats.csv"  # Update path as needed

# A missing file is an expected state (SAS may not have finished yet), so it
# is reported and recorded as None instead of stopping the notebook.
try:
    df_pcds_stats = pd.read_csv(PCDS_STATS_CSV)
except FileNotFoundError:
    df_pcds_stats = None
    print(f"⚠ PCDS stats CSV not found: {PCDS_STATS_CSV}")
    print("  Waiting for SAS to complete and send CSV...")
else:
    print(f"✓ Loaded PCDS statistics: {len(df_pcds_stats)} rows")
    print(f"  Columns: {list(df_pcds_stats.columns)}")
    display(df_pcds_stats.head())

## 4.6 Compare Statistics (When Both Available)

In [None]:
# Placeholder for comparison
# When both df_pcds_stats and df_aws_stats are available:
# - Merge on (table_name, vintage, variable)
# - Compare count, distinct_count, min, max, mean, std
# - Flag significant differences

print("Statistics comparison will be performed once both PCDS and AWS data are loaded.")
print("\nComparison metrics:")
# One bullet line per metric that the comparison is meant to cover.
for metric in (
    "Row count",
    "Distinct count",
    "Min/Max values",
    "Mean/Std (for numeric columns)",
    "Missing count",
):
    print(f"  - {metric}")

## 4.7 Update Configuration JSON

In [None]:
# Record the CSV path only when the load cell ran and actually produced a frame.
pcds_stats_loaded = globals().get('df_pcds_stats') is not None

# Update config with file paths
config['column_statistics'] = {
    'sas_file': SAS_OUTPUT_FILE,
    'athena_sql_file': ATHENA_SQL_FILE,
    'pcds_csv': PCDS_STATS_CSV if pcds_stats_loaded else None,
    'status': 'sas_generated_awaiting_results'
}

# Mark this step as done, with a timestamp, in the shared status section.
config['status'].update({
    'step4_completed': True,
    'step4_completed_at': datetime.now().isoformat(),
})

# Save updated config (default=str stringifies values json cannot encode)
with open(CONFIG_JSON, 'w') as f:
    json.dump(config, f, indent=2, default=str)

print(f"\n✓ Configuration updated and saved to: {CONFIG_JSON}")
print("\n✓ Step 4 Complete - Ready for Step 5")
print("\n⚠ Remember to complete SAS execution and load results before final comparison")