# Step 3: Column Mapping and Crosswalk

This notebook:
- Loads configuration from Step 2
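- Loads column crosswalk mappings from XLSX (optional; skipped if the file is not found)
- Queries PCDS and AWS column metadata (the query functions in Section 3.3 are placeholders until the real queries are implemented)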
- Identifies comparable columns (PCDS ↔ AWS)
- Updates JSON with comparable column mappings

**Input**: `data/config.json`, crosswalk XLSX  
**Output**: Updated `data/config.json` with column_mappings

In [None]:
import json
import pandas as pd
from datetime import datetime
from collections import defaultdict

# Banner: step title followed by a 50-character rule, one per line
print("Step 3: Column Mapping and Crosswalk", "=" * 50, sep="\n")

## 3.1 Load Configuration

In [None]:
# Path of the shared pipeline configuration produced by the earlier steps
CONFIG_JSON = "data/config.json"

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's locale default (e.g. cp1252 on Windows).
with open(CONFIG_JSON, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Echo the key fields so a stale or incomplete configuration is easy to spot
print(f"Loaded configuration: {config['run_name']}")
print(f"Total tables: {config['metadata']['total_tables']}")
print(f"Step 2 completed: {config['status']['step2_completed']}")

## 3.2 Load Crosswalk Mapping (Optional)

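If you have a predefined crosswalk XLSX, load it here. Otherwise, mappings are inferred in Section 3.4 by case-insensitive column-name matching. Note that the cells in this notebook only load and preview the crosswalk; `df_crosswalk` is not yet applied when building the mappings.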

In [None]:
# Location of the optional crosswalk workbook
CROSSWALK_XLSX = "../files/inputs/crosswalk.xlsx"  # Update path as needed

try:
    df_crosswalk = pd.read_excel(CROSSWALK_XLSX)
except FileNotFoundError:
    # A missing workbook is not an error: record its absence and carry on
    df_crosswalk = None
    print(f"Crosswalk file not found: {CROSSWALK_XLSX}")
    print("Will infer column mappings from schema metadata")
else:
    # Workbook found: report its size and columns, then preview the first rows
    print(f"Loaded crosswalk with {len(df_crosswalk)} mappings")
    print(f"Columns: {list(df_crosswalk.columns)}")
    display(df_crosswalk.head())

## 3.3 Define Column Metadata Query Functions

In [None]:
def get_pcds_columns(table_name):
    """
    Return column metadata for a PCDS (Oracle) table.

    This is currently a stub: it prints a progress line and returns the same
    three hard-coded columns whatever ``table_name`` is.

    Args:
        table_name: PCDS table name (only used in the progress message for now)

    Returns:
        dict: {column_name: {'data_type': str, 'nullable': bool}}
    """
    # TODO: Implement actual Oracle query, e.g.
    #   SELECT column_name, data_type, nullable
    #   FROM all_tab_columns
    #   WHERE table_name = :table_name
    #   ORDER BY column_id
    # (bind the table name as a parameter rather than formatting it into the SQL)

    print(f"  [PCDS] Querying schema for {table_name}...")

    # Placeholder rows - replace with the real query result
    placeholder_schema = (
        ('CUSTOMER_ID', 'NUMBER', False),
        ('CUSTOMER_NAME', 'VARCHAR2', True),
        ('CREATE_DT', 'DATE', True),
    )
    return {
        col_name: {'data_type': col_type, 'nullable': is_nullable}
        for col_name, col_type, is_nullable in placeholder_schema
    }


def get_aws_columns(table_name):
    """
    Return column metadata for an AWS Athena table.

    This is currently a stub: it prints a progress line and returns the same
    three hard-coded columns whatever ``table_name`` is.

    Args:
        table_name: AWS table name (only used in the progress message for now)

    Returns:
        dict: {column_name: {'data_type': str, 'nullable': bool}}
    """
    # TODO: Implement actual Athena query, e.g.
    #   DESCRIBE {table_name}
    # or
    #   SELECT * FROM information_schema.columns
    #   WHERE table_name = '{table_name}'

    print(f"  [AWS]  Querying schema for {table_name}...")

    # Placeholder columns - replace with the real query result
    placeholder_names = ['customer_id', 'customer_name', 'create_dt']
    placeholder_types = ['bigint', 'string', 'timestamp']
    placeholder_nullable = [False, True, True]

    columns = {}
    for col_name, col_type, is_nullable in zip(
        placeholder_names, placeholder_types, placeholder_nullable
    ):
        columns[col_name] = {'data_type': col_type, 'nullable': is_nullable}
    return columns


def map_column_types(pcds_type, aws_type):
    """
    Check if PCDS and AWS column types are compatible

    The PCDS type is matched by prefix (case-insensitive), so declarations
    such as ``NUMBER(10,2)`` or ``TIMESTAMP(6) WITH TIME ZONE`` resolve to
    their family. The AWS type is reduced to its base name - the first word,
    with any ``(...)`` parameters removed - and compared exactly. An exact
    comparison is used on the AWS side because a prefix test lets ``int``
    match unrelated types such as ``interval day to second``.

    Args:
        pcds_type: PCDS (Oracle) data type string, e.g. 'VARCHAR2(50)'
        aws_type: AWS data type string, e.g. 'decimal(10,2)'

    Returns:
        tuple: (is_compatible, compatibility_status) where the status is
            'compatible', 'type_mismatch' (PCDS family is known but the AWS
            type is not accepted for it) or 'unknown_type' (PCDS family is
            not in the rule table, or either argument is not a string)
    """
    # Type mapping rules: PCDS type family -> accepted AWS base type names
    type_map = {
        'NUMBER': {
            'bigint', 'int', 'integer', 'smallint', 'tinyint',
            'double', 'float', 'real', 'decimal',
        },
        'VARCHAR2': {'string', 'varchar'},
        'DATE': {'timestamp', 'date'},
        'TIMESTAMP': {'timestamp'},
    }

    # Missing or non-string metadata cannot be classified; report it instead
    # of failing with AttributeError on .upper()/.lower()
    if not isinstance(pcds_type, str) or not isinstance(aws_type, str):
        return False, 'unknown_type'

    pcds_upper = pcds_type.strip().upper()

    # 'decimal(10,2)' -> 'decimal'; 'timestamp(3) with time zone' -> 'timestamp'
    aws_words = aws_type.strip().lower().split('(', 1)[0].split()
    aws_base = aws_words[0] if aws_words else ''

    for pcds_key, aws_bases in type_map.items():
        if pcds_upper.startswith(pcds_key):
            if aws_base in aws_bases:
                return True, 'compatible'
            return False, 'type_mismatch'

    return False, 'unknown_type'


# Confirmation message printed after the definitions in this cell have run
print('Column metadata query functions defined')

## 3.4 Build Column Mappings for Each Table Pair

In [None]:
# Build one mapping record per configured table pair.
#
# Columns are paired purely by case-insensitive name equality. The optional
# df_crosswalk loaded in Section 3.2 is not consulted by this cell.
column_mappings = []
total_pairs = len(config['table_pairs'])

for idx, table_pair in enumerate(config['table_pairs'], 1):
    print(f"\n[{idx}/{total_pairs}] Processing:")
    print(f"  PCDS: {table_pair['pcds_tbl']}")
    print(f"  AWS:  {table_pair['aws_tbl']}")

    # Get column metadata
    pcds_cols = get_pcds_columns(table_pair['pcds_tbl'])
    aws_cols = get_aws_columns(table_pair['aws_tbl'])

    print(f"  PCDS columns: {len(pcds_cols)}")
    print(f"  AWS columns: {len(aws_cols)}")

    # Build mapping
    pcds_to_aws = {}
    aws_to_pcds = {}
    comparable_cols = []
    pcds_only = []
    type_mismatches = []

    # Create lowercase mapping for AWS (case-insensitive matching).
    # If two AWS columns differ only by case, the later one wins here and the
    # other is reported as AWS-only.
    aws_cols_lower = {k.lower(): (k, v) for k, v in aws_cols.items()}

    # Map PCDS columns to AWS
    for pcds_col, pcds_meta in pcds_cols.items():
        # Try exact match (case-insensitive)
        aws_col_lower = pcds_col.lower()

        if aws_col_lower not in aws_cols_lower:
            pcds_only.append(pcds_col)
            continue

        aws_col, aws_meta = aws_cols_lower[aws_col_lower]

        # Check type compatibility
        is_compatible, compat_status = map_column_types(
            pcds_meta['data_type'],
            aws_meta['data_type']
        )

        # Name-matched columns are recorded in both directions even when
        # their types disagree
        pcds_to_aws[pcds_col] = aws_col
        aws_to_pcds[aws_col] = pcds_col

        if is_compatible:
            comparable_cols.append({
                'pcds_col': pcds_col,
                'aws_col': aws_col,
                'pcds_type': pcds_meta['data_type'],
                'aws_type': aws_meta['data_type'],
                'status': 'perfect_match'
            })
        else:
            type_mismatches.append({
                'pcds_col': pcds_col,
                'aws_col': aws_col,
                'pcds_type': pcds_meta['data_type'],
                'aws_type': aws_meta['data_type'],
                'status': 'type_mismatch',
                # Keep the detailed verdict from map_column_types so a real
                # mismatch can be told apart from an unrecognised type;
                # 'status' itself is left unchanged for existing consumers
                'reason': compat_status
            })

    # Find AWS-only columns (those no PCDS column was paired with)
    aws_only = [aws_col for aws_col in aws_cols if aws_col not in aws_to_pcds]

    # Store mapping result
    mapping_result = {
        'table_index': idx - 1,  # zero-based position in config['table_pairs']
        'pcds_tbl': table_pair['pcds_tbl'],
        'aws_tbl': table_pair['aws_tbl'],
        'pcds_to_aws': pcds_to_aws,
        'aws_to_pcds': aws_to_pcds,
        'comparable_columns': comparable_cols,
        'type_mismatches': type_mismatches,
        'pcds_only': pcds_only,
        'aws_only': aws_only,
        'stats': {
            'total_comparable': len(comparable_cols),
            'type_mismatches': len(type_mismatches),
            'pcds_only': len(pcds_only),
            'aws_only': len(aws_only)
        }
    }

    column_mappings.append(mapping_result)

    print(f"  ✓ Comparable: {len(comparable_cols)}")
    print(f"  ⚠ Type mismatches: {len(type_mismatches)}")
    print(f"  PCDS only: {len(pcds_only)}")
    print(f"  AWS only: {len(aws_only)}")

print(f"\n✓ Column mapping complete for {len(column_mappings)} tables")

## 3.5 Summary Report

In [None]:
print("Column Mapping Summary:")
print("=" * 50)

# Add up each per-table counter across every mapping record
stat_totals = {
    stat_key: sum(m['stats'][stat_key] for m in column_mappings)
    for stat_key in ('total_comparable', 'type_mismatches', 'pcds_only', 'aws_only')
}
total_comparable = stat_totals['total_comparable']
total_mismatches = stat_totals['type_mismatches']
total_pcds_only = stat_totals['pcds_only']
total_aws_only = stat_totals['aws_only']

print(f"Total comparable columns: {total_comparable}")
print(f"Total type mismatches: {total_mismatches}")
print(f"Total PCDS-only columns: {total_pcds_only}")
print(f"Total AWS-only columns: {total_aws_only}")

# List the affected tables and columns only when there is something to report
if total_mismatches > 0:
    print("\n⚠ Tables with type mismatches:")
    for m in column_mappings:
        mismatch_count = m['stats']['type_mismatches']
        if mismatch_count <= 0:
            continue
        print(f"  - {m['pcds_tbl']}: {mismatch_count} mismatches")
        for tm in m['type_mismatches']:
            print(f"    • {tm['pcds_col']} ({tm['pcds_type']}) -> {tm['aws_col']} ({tm['aws_type']})")

## 3.6 Update Configuration JSON

In [None]:
# Add column mappings to config and mark this step as done
config['column_mappings'] = column_mappings
config['status']['step3_completed'] = True
config['status']['step3_completed_at'] = datetime.now().isoformat()

# Serialise fully in memory before touching the file. Opening with 'w'
# truncates the file immediately, so an error raised while encoding would
# otherwise leave config.json empty or partial - and this is the same file the
# configuration was loaded from. default=str stringifies values the JSON
# encoder does not support natively.
config_text = json.dumps(config, indent=2, default=str)

# Save updated config (explicit encoding so the write is locale-independent)
with open(CONFIG_JSON, 'w', encoding='utf-8') as f:
    f.write(config_text)

print(f"\n✓ Configuration updated and saved to: {CONFIG_JSON}")
print("\n✓ Step 3 Complete - Ready for Step 4")