# Step 1: Load Configuration and Table List

This notebook loads:
- Compare table list from XLSX
- Configuration settings

**Output**: `data/config.json` containing the enabled table pairs and the run settings

In [None]:
import pandas as pd
import json
from pathlib import Path
from datetime import datetime

# Step banner: title line followed by a 50-character rule
print("Step 1: Load Configuration", "=" * 50, sep="\n")

## 1.1 Configuration

In [None]:
# Workbook holding the list of table pairs to compare (read by this step)
COMPARE_LIST_XLSX = "../files/inputs/compare_list.xlsx"

# Destination of the JSON configuration written by this step
OUTPUT_JSON = "data/config.json"

# Run metadata: the run name carries a local-time stamp (YYYYMMDD_HHMMSS)
# taken when this cell executes
RUN_NAME = f"validation_run_{datetime.now():%Y%m%d_%H%M%S}"
CATEGORY = "dpst"  # category label, e.g. "dpst" or "dpsm"

## 1.2 Load Compare Table List

In [None]:
# Load the compare list workbook into a DataFrame (pandas defaults)
df = pd.read_excel(COMPARE_LIST_XLSX)

# Report the row count and column names, then preview the first rows
print(
    f"Loaded {len(df)} table pairs from {COMPARE_LIST_XLSX}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst 3 rows:",
    sep="\n",
)
df.head(3)

## 1.3 Validate and Clean Data

In [None]:
# Validate the compare list and normalise its optional columns.
#
# Reads `df`, adds/fills the optional columns in place, and produces
# `df_enabled`, a copy of the rows whose 'enabled' value equals True.
# Raises ValueError when a required column is absent.

# Required columns
required_cols = ['pcds_tbl', 'aws_tbl']

# Check required columns exist
missing_cols = set(required_cols) - set(df.columns)
if missing_cols:
    # Sorted so the message is deterministic when several columns are missing
    raise ValueError(f"Missing required columns: {sorted(missing_cols)}")

# Add default values for optional columns. The default is applied both when
# the column is absent and when an individual cell is blank (NaN): otherwise a
# blank 'enabled' cell would silently drop the row in the filter below, and
# blank 'partition'/'group' cells would be written to the JSON config as NaN.
optional_defaults = {'enabled': True, 'partition': 'whole', 'group': 'default'}
for col, default in optional_defaults.items():
    if col in df.columns:
        df[col] = df[col].fillna(default)
    else:
        df[col] = default

# Filter enabled tables only. The explicit comparison is intentional: the
# column may hold non-boolean objects, so it is compared against True rather
# than used directly as a boolean mask.
df_enabled = df[df['enabled'] == True].copy()  # noqa: E712

print(f"Enabled tables: {len(df_enabled)} / {len(df)}")

## 1.4 Build Configuration JSON

In [None]:
# One plain dict per enabled row, keyed by column name
table_pairs = df_enabled.to_dict(orient='records')

# Assemble the run configuration. Only step 1 is flagged as completed here;
# the flags for steps 2-5 start out False.
config = dict(
    run_name=RUN_NAME,
    category=CATEGORY,
    created_at=datetime.now().isoformat(),
    table_pairs=table_pairs,
    metadata=dict(
        total_tables=len(table_pairs),
        source_file=COMPARE_LIST_XLSX,
    ),
    status={f"step{step}_completed": step == 1 for step in range(1, 6)},
)

print(f"Configuration built with {len(table_pairs)} table pairs")

## 1.5 Save to JSON

In [None]:
# Persist `config` to OUTPUT_JSON and print a short run summary.

# Ensure output directory exists
Path(OUTPUT_JSON).parent.mkdir(parents=True, exist_ok=True)

# Save JSON. The encoding is stated explicitly so the write does not depend on
# the platform's default text encoding. default=str stringifies any value the
# json module cannot encode natively.
with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2, default=str)

print(f"\nConfiguration saved to: {OUTPUT_JSON}")
print("\nSummary:")
print(f"  Run Name: {RUN_NAME}")
print(f"  Category: {CATEGORY}")
print(f"  Total Tables: {len(table_pairs)}")
# The check mark was previously stored as mojibake (UTF-8 bytes mis-decoded)
print("\n✓ Step 1 Complete - Ready for Step 2")