# Silver Layer Creation

This notebook creates the silver layer by cleaning and transforming bronze layer data.

In [24]:
# Import required libraries
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# Add project root to path
project_root = Path().absolute()
sys.path.insert(0, str(project_root))

from utils.database import DuckDBManager
from config.settings import config

# Initialize components
config.setup_logging()
db_manager = DuckDBManager()

print("🥈 Silver Layer Creation Started")
print("📊 Cleaning and transforming bronze layer data...")

🥈 Silver Layer Creation Started
📊 Cleaning and transforming bronze layer data...


## 1. Create Silver Schema

In [25]:
# Create silver schema
print("🏗️ Creating silver schema...")

try:
    db_manager.execute_sql("CREATE SCHEMA IF NOT EXISTS silver;")
    print("✅ Silver schema created")
    
    # Check if bronze data exists
    bronze_check = db_manager.execute_query("SELECT COUNT(*) as count FROM bronze.bond_data")
    bronze_count = bronze_check['count'].iloc[0]
    print(f"📊 Bronze layer contains {bronze_count:,} rows to process")
    
except Exception as e:
    print(f"❌ Schema creation failed: {e}")

2025-06-18 17:33:46,458 - utils.database - INFO - S3 credentials configured
2025-06-18 17:33:46,458 - utils.database - INFO - DuckDB connection configured successfully
2025-06-18 17:33:46,459 - utils.database - INFO - SQL statement executed successfully
2025-06-18 17:33:46,484 - utils.database - INFO - Query executed successfully. Returned 1 rows


🏗️ Creating silver schema...
✅ Silver schema created
📊 Bronze layer contains 100,822,111 rows to process


## 2. Data Cleaning and Transformation

In [26]:
# Clean and transform data
print("🧹 Cleaning and transforming data...")

silver_transformation_sql = """
CREATE OR REPLACE TABLE silver.cleaned_bond_data AS
SELECT 
    -- Remove duplicates and clean data
    CAST(data_date AS DATE) as trade_date,
    source_file,
    
    -- Clean numeric columns (adjust column names based on your schema)
    CASE 
        WHEN TRY_CAST(\"Price\" AS DOUBLE) IS NOT NULL THEN CAST(\"Price\" AS DOUBLE)
        ELSE NULL 
    END as price,
    
    CASE 
        WHEN TRY_CAST(\"Yield\" AS DOUBLE) IS NOT NULL THEN CAST(\"Yield\" AS DOUBLE)
        ELSE NULL 
    END as yield_value,
    
    -- Add validation flags
    CASE 
        WHEN \"Price\" IS NOT NULL AND \"Yield\" IS NOT NULL THEN 'VALID'
        WHEN \"Price\" IS NULL OR \"Yield\" IS NULL THEN 'INCOMPLETE'
        ELSE 'INVALID'
    END as data_quality_flag,
    
    ingestion_timestamp,
    current_timestamp as silver_processed_timestamp
FROM bronze.bond_data
WHERE data_date IS NOT NULL
    AND data_date >= '2020-01-01'  -- Filter reasonable date range
"""

try:
    db_manager.execute_sql(silver_transformation_sql)
    
    # Get row count
    count_result = db_manager.execute_query("SELECT COUNT(*) as row_count FROM silver.cleaned_bond_data")
    row_count = count_result['row_count'].iloc[0]
    print(f"✅ Silver layer created with {row_count:,} cleaned rows")
    
except Exception as e:
    print(f"❌ Data transformation failed: {e}")

🧹 Cleaning and transforming data...


2025-06-18 17:33:46,503 - utils.database - INFO - SQL statement executed successfully
2025-06-18 17:33:46,507 - utils.database - INFO - Query executed successfully. Returned 1 rows


✅ Silver layer created with 0 cleaned rows


## 3. Data Quality Validation

In [27]:
# Validate cleaned data
print("✅ Validating cleaned data...")

validation_queries = {
    "Total Records": "SELECT COUNT(*) as count FROM silver.cleaned_bond_data",
    "Valid Records": "SELECT COUNT(*) as count FROM silver.cleaned_bond_data WHERE data_quality_flag = 'VALID'",
    "Incomplete Records": "SELECT COUNT(*) as count FROM silver.cleaned_bond_data WHERE data_quality_flag = 'INCOMPLETE'",
    "Invalid Records": "SELECT COUNT(*) as count FROM silver.cleaned_bond_data WHERE data_quality_flag = 'INVALID'",
    "Date Range": "SELECT MIN(trade_date) as min_date, MAX(trade_date) as max_date FROM silver.cleaned_bond_data"
}

try:
    print("\n📊 Silver Layer Validation Report:")
    for check_name, query in validation_queries.items():
        result = db_manager.execute_query(query)
        print(f"  {check_name}: {result.iloc[0].to_dict()}")
    
    # Sample cleaned data
    sample_data = db_manager.execute_query("SELECT * FROM silver.cleaned_bond_data LIMIT 5")
    print("\n🔍 Sample Cleaned Data:")
    print(sample_data)
    
except Exception as e:
    print(f"❌ Validation failed: {e}")

2025-06-18 17:33:46,525 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:33:46,527 - utils.database - INFO - Query executed successfully. Returned 1 rows


✅ Validating cleaned data...

📊 Silver Layer Validation Report:
  Total Records: {'count': 0}
  Valid Records: {'count': 0}


2025-06-18 17:33:46,532 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:33:46,534 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:33:46,539 - utils.database - INFO - Query executed successfully. Returned 1 rows
2025-06-18 17:33:46,544 - utils.database - INFO - Query executed successfully. Returned 0 rows


  Incomplete Records: {'count': 0}
  Invalid Records: {'count': 0}
  Date Range: {'min_date': NaT, 'max_date': NaT}

🔍 Sample Cleaned Data:
Empty DataFrame
Columns: [trade_date, source_file, price, yield_value, data_quality_flag, ingestion_timestamp, silver_processed_timestamp]
Index: []


## 4. Create Silver Layer Summary

In [28]:
# Generate silver layer summary
print("📋 Silver Layer Summary")
print("=" * 30)

try:
    # Get table info
    table_info = db_manager.execute_query("DESCRIBE silver.cleaned_bond_data")
    print(f"✅ Table created: silver.cleaned_bond_data")
    print(f"📊 Columns: {len(table_info)}")
    print(f"🗂️  Schema: silver")
    
    # Calculate data quality metrics
    quality_summary = db_manager.execute_query("""
        SELECT 
            data_quality_flag,
            COUNT(*) as record_count,
            ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
        FROM silver.cleaned_bond_data 
        GROUP BY data_quality_flag
    """)
    
    print("\n📊 Data Quality Summary:")
    print(quality_summary)
    
    print("\n🎉 Silver layer creation completed successfully!")
    print("📌 Next: Run 05_gold_layer.ipynb to create analytics-ready data")
    
except Exception as e:
    print(f"❌ Summary generation failed: {e}")

2025-06-18 17:33:46,566 - utils.database - INFO - Query executed successfully. Returned 7 rows
2025-06-18 17:33:46,579 - utils.database - INFO - Query executed successfully. Returned 0 rows


📋 Silver Layer Summary
✅ Table created: silver.cleaned_bond_data
📊 Columns: 7
🗂️  Schema: silver

📊 Data Quality Summary:
Empty DataFrame
Columns: [data_quality_flag, record_count, percentage]
Index: []

🎉 Silver layer creation completed successfully!
📌 Next: Run 05_gold_layer.ipynb to create analytics-ready data
