# Bronze Layer Creation

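This notebook creates the bronze layer: it ingests the raw gzipped CSV files under the configured S3 ingestion path into the DuckDB table `bronze.bond_data`, runs basic data quality checks on the result, and prints a summary.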

In [None]:
# Import required libraries
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# Add project root to path
# Path().absolute() resolves to the current working directory, so this assumes
# the kernel was started from the project root — TODO confirm for notebooks
# launched from a subdirectory.
project_root = Path().absolute()
# Inserted at position 0 so it takes precedence over other sys.path entries.
sys.path.insert(0, str(project_root))

# Project-local imports; they must come after the sys.path change above.
from utils.database import DuckDBManager
from config.settings import config

# Initialize components
# NOTE(review): setup_logging() and DuckDBManager() are defined outside this
# notebook; presumably they configure logging and open the DuckDB connection
# used by the cells below — verify against utils/database.py and config/settings.py.
config.setup_logging()
db_manager = DuckDBManager()

# Echo the configured ingestion path so the run output shows which source was read.
print("🏗️ Bronze Layer Creation Started")
print(f"🔗 S3 Ingestion Path: {config.INGESTION_PATH}")

## 1. Create Bronze Schema

In [None]:
# Create bronze schema and tables
print("📊 Creating bronze schema...")

# SQL template; the single `?` is replaced below by the quoted ingestion glob.
# - The date pattern uses [0-9] classes instead of \d: DuckDB does not process
#   backslash escapes in ordinary string literals, so backslashes doubled on
#   the Python side reach the regex engine as a literal backslash followed by
#   "d" and never match a date in the file name.
# - REGEXP_EXTRACT returns '' (not NULL) when nothing matches; NULLIF turns
#   that into NULL so the IS NULL / IS NOT NULL filters in the quality checks
#   can actually see files without a date.
# - FILENAME=true adds the `filename` column that source_file/data_date use.
bronze_schema_sql = """
-- Create bronze schema
CREATE SCHEMA IF NOT EXISTS bronze;

-- Create bronze layer table for bond data
CREATE OR REPLACE TABLE bronze.bond_data AS
SELECT 
    *,
    filename as source_file,
    NULLIF(REGEXP_EXTRACT(filename, '([0-9]{4}-[0-9]{2}-[0-9]{2})', 1), '') as data_date,
    current_timestamp as ingestion_timestamp
FROM read_csv(?, 
             AUTO_DETECT=true, 
             FILENAME=true,
             UNION_BY_NAME=true)
"""

# Gzipped CSVs one directory level below the ingestion path. Single quotes are
# doubled so the path cannot terminate the SQL string literal it is embedded in.
ingestion_glob = f"{config.INGESTION_PATH}/*/*.csv.gz".replace("'", "''")

try:
    # Create the bronze table (count=1: only the read_csv placeholder is replaced)
    result = db_manager.execute_sql(
        bronze_schema_sql.replace('?', f"'{ingestion_glob}'", 1)
    )

    print("✅ Bronze schema and tables created successfully")

    # Get row count
    count_result = db_manager.execute_query("SELECT COUNT(*) as row_count FROM bronze.bond_data")
    row_count = count_result['row_count'].iloc[0]
    print(f"📊 Bronze layer contains {row_count:,} rows")

except Exception as e:
    print(f"❌ Bronze layer creation failed: {e}")
    # Every later cell reads bronze.bond_data; re-raise so "Run All" stops here
    # instead of reporting on a missing or stale table.
    raise

## 2. Data Quality Checks

In [None]:
# Perform data quality checks on bronze layer
print("🔍 Performing data quality checks...")

# Label -> single-row SQL query run against bronze.bond_data.
# data_date is derived with REGEXP_EXTRACT, which yields '' rather than NULL
# when a file name carries no date, so the date checks treat '' and NULL alike
# via NULLIF(data_date, '').
quality_checks = {
    "Total Records": "SELECT COUNT(*) as count FROM bronze.bond_data",
    "Unique Files": "SELECT COUNT(DISTINCT source_file) as count FROM bronze.bond_data",
    "Date Range": "SELECT MIN(data_date) as min_date, MAX(data_date) as max_date FROM bronze.bond_data WHERE NULLIF(data_date, '') IS NOT NULL",
    "Null Values": "SELECT COUNT(*) as count FROM bronze.bond_data WHERE NULLIF(data_date, '') IS NULL",
}

try:
    print("\n📊 Bronze Layer Quality Report:")
    for check_name, query in quality_checks.items():
        result = db_manager.execute_query(query)
        # Each query returns exactly one row; show it as {column: value}.
        print(f"  {check_name}: {result.iloc[0].to_dict()}")

    # Sample data preview
    sample_data = db_manager.execute_query("SELECT * FROM bronze.bond_data LIMIT 5")
    print("\n🔍 Sample Data:")
    print(sample_data)

except Exception as e:
    print(f"❌ Quality checks failed: {e}")

## 3. Create Bronze Layer Summary

In [None]:
# Generate bronze layer summary
print("📋 Bronze Layer Summary")
print("=" * 30)

try:
    # Get table info (DESCRIBE yields one row per column of the table)
    table_info = db_manager.execute_query("DESCRIBE bronze.bond_data")
    print("✅ Table created: bronze.bond_data")
    print(f"📊 Columns: {len(table_info)}")
    print("🗂️  Schema: bronze")

    # Log completion; the `?` placeholder is replaced by the row count below.
    completion_log = """
    INSERT INTO audit.data_ingestion_log 
    (log_id, source_path, record_count, processing_status, error_message)
    VALUES (nextval('audit.log_seq'), 'bronze_layer', ?, 'completed', 'Bronze layer created successfully')
    """

    # Audit logging is best-effort: a failure is reported but does not fail the
    # notebook. Catch Exception (not a bare except) so KeyboardInterrupt and
    # SystemExit still propagate, and print the cause so a broken audit table
    # can be told apart from a missing one.
    try:
        count_result = db_manager.execute_query("SELECT COUNT(*) as count FROM bronze.bond_data")
        # int() guarantees a plain integer literal is substituted into the SQL.
        record_count = int(count_result['count'].iloc[0])
        db_manager.execute_sql(completion_log.replace('?', str(record_count), 1))
        print("✅ Process logged to audit table")
    except Exception as audit_error:
        print(f"⚠️  Audit logging skipped (table may not exist): {audit_error}")

    print("\n🎉 Bronze layer creation completed successfully!")
    print("📌 Next: Run 04_silver_layer.ipynb to clean and transform the data")

except Exception as e:
    print(f"❌ Summary generation failed: {e}")