# Bronze Layer Creation

This notebook creates the bronze layer by ingesting raw data from S3 into DuckDB tables.

In [20]:
# Import required libraries
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# Add project root to path
project_root = Path().absolute()
sys.path.insert(0, str(project_root))

from utils.database import DuckDBManager
from config.settings import config

# Initialize components
config.setup_logging()
db_manager = DuckDBManager()

print("🏗️ Bronze Layer Creation Started")
print(f"🔗 S3 Ingestion Path: {config.INGESTION_PATH}")

🏗️ Bronze Layer Creation Started
🔗 S3 Ingestion Path: s3://vendor-data-s3/LSEG/TRTH/LSE/ingestion


## 1. Create Bronze Schema

In [21]:
# Create bronze schema and tables
print("📊 Creating bronze schema...")

bronze_schema_sql = """
-- Create bronze schema
CREATE SCHEMA IF NOT EXISTS bronze;

-- Create bronze layer table for bond data
CREATE OR REPLACE TABLE bronze.bond_data AS
SELECT 
    *,
    filename as source_file,
    REGEXP_EXTRACT(filename, '(\\\\d{4}-\\\\d{2}-\\\\d{2})', 1) as data_date,
    current_timestamp as ingestion_timestamp
FROM read_csv(?, 
             AUTO_DETECT=true, 
             FILENAME=true,
             UNION_BY_NAME=true)
"""

try:
    # Create the bronze table
    result = db_manager.execute_sql(
        bronze_schema_sql.replace('?', f"'{config.INGESTION_PATH}/*/*.csv.gz'")
    )
    
    print("✅ Bronze schema and tables created successfully")
    
    # Get row count
    count_result = db_manager.execute_query("SELECT COUNT(*) as row_count FROM bronze.bond_data")
    row_count = count_result['row_count'].iloc[0]
    print(f"📊 Bronze layer contains {row_count:,} rows")
    
except Exception as e:
    print(f"❌ Bronze layer creation failed: {e}")

2025-06-18 17:25:48,716 - utils.database - INFO - S3 credentials configured
2025-06-18 17:25:48,717 - utils.database - INFO - DuckDB connection configured successfully


📊 Creating bronze schema...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2025-06-18 17:31:55,161 - utils.database - INFO - SQL statement executed successfully
2025-06-18 17:31:55,171 - utils.database - INFO - Query executed successfully. Returned 1 rows


✅ Bronze schema and tables created successfully
📊 Bronze layer contains 100,822,111 rows


## 2. Data Quality Checks

In [22]:
# Perform data quality checks on bronze layer
print("🔍 Performing data quality checks...")

quality_checks = {
    "Total Records": "SELECT COUNT(*) as count FROM bronze.bond_data",
    "Unique Files": "SELECT COUNT(DISTINCT source_file) as count FROM bronze.bond_data",
    "Date Range": "SELECT MIN(data_date) as min_date, MAX(data_date) as max_date FROM bronze.bond_data WHERE data_date IS NOT NULL",
    "Null Values": "SELECT COUNT(*) as count FROM bronze.bond_data WHERE data_date IS NULL"
}

try:
    print("\n📊 Bronze Layer Quality Report:")
    for check_name, query in quality_checks.items():
        result = db_manager.execute_query(query)
        print(f"  {check_name}: {result.iloc[0].to_dict()}")
    
    # Sample data preview
    sample_data = db_manager.execute_query("SELECT * FROM bronze.bond_data LIMIT 5")
    print("\n🔍 Sample Data:")
    print(sample_data)
    
except Exception as e:
    print(f"❌ Quality checks failed: {e}")

🔍 Performing data quality checks...

📊 Bronze Layer Quality Report:


2025-06-18 17:31:55,435 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:31:55,492 - utils.database - INFO - Query executed successfully. Returned 1 rows


  Total Records: {'count': 100822111}
  Unique Files: {'count': 1}


2025-06-18 17:31:55,935 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:31:55,940 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:31:55,976 - utils.database - INFO - Query executed successfully. Returned 5 rows


  Date Range: {'min_date': '', 'max_date': ''}
  Null Values: {'count': 0}

🔍 Sample Data:
        #RIC        Domain                        Date-Time GMT Offset   Type  \
0  .TRX50GBP  Market Price 2023-09-01 02:59:58.447680-04:00         +1  Trade   
1  .TRX50GBP  Market Price 2023-09-01 03:00:58.438312-04:00         +1  Trade   
2  .TRX50GBP  Market Price 2023-09-01 03:01:58.408014-04:00         +1  Trade   
3  .TRX50GBP  Market Price 2023-09-01 03:02:58.438451-04:00         +1  Trade   
4  .TRX50GBP  Market Price 2023-09-01 03:03:58.432486-04:00         +1  Trade   

  Ex/Cntrb.ID   LOC   Price  Volume  Market VWAP  ... Implied Yield  Delta  \
0        None  None  113.90    <NA>          NaN  ...          None   None   
1        None  None  114.23    <NA>          NaN  ...          None   None   
2        None  None  114.20    <NA>          NaN  ...          None   None   
3        None  None  114.17    <NA>          NaN  ...          None   None   
4        None  None  114.15    <

## 3. Create Bronze Layer Summary

In [23]:
# Generate bronze layer summary
print("📋 Bronze Layer Summary")
print("=" * 30)

try:
    # Get table info
    table_info = db_manager.execute_query("DESCRIBE bronze.bond_data")
    print(f"✅ Table created: bronze.bond_data")
    print(f"📊 Columns: {len(table_info)}")
    print(f"🗂️  Schema: bronze")
    
    # Log completion
    completion_log = """
    INSERT INTO audit.data_ingestion_log 
    (log_id, source_path, record_count, processing_status, error_message)
    VALUES (nextval('audit.log_seq'), 'bronze_layer', ?, 'completed', 'Bronze layer created successfully')
    """
    
    try:
        count_result = db_manager.execute_query("SELECT COUNT(*) as count FROM bronze.bond_data")
        record_count = count_result['count'].iloc[0]
        db_manager.execute_sql(completion_log.replace('?', str(record_count)))
        print("✅ Process logged to audit table")
    except:
        print("⚠️  Audit logging skipped (table may not exist)")
    
    print("\n🎉 Bronze layer creation completed successfully!")
    print("📌 Next: Run 04_silver_layer.ipynb to clean and transform the data")
    
except Exception as e:
    print(f"❌ Summary generation failed: {e}")

2025-06-18 17:31:56,018 - utils.database - INFO - Query executed successfully. Returned 117 rows
2025-06-18 17:31:56,036 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:31:56,042 - utils.database - INFO - SQL statement executed successfully


📋 Bronze Layer Summary
✅ Table created: bronze.bond_data
📊 Columns: 117
🗂️  Schema: bronze
✅ Process logged to audit table

🎉 Bronze layer creation completed successfully!
📌 Next: Run 04_silver_layer.ipynb to clean and transform the data
