<a href="https://colab.research.google.com/github/Dee-Nwanjah/SQL-Database-Fundamental-Projects/blob/main/6.)ETL_PIPELINE_(CSV_%E2%86%92_Database_%E2%86%92_Dashboard).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================================================================
# ETL PIPELINE (CSV → Database → Dashboard)
# =============================================================================

# Setup and Create Sample CSV Files
import pandas as pd
import numpy as np
import sqlite3
from IPython import get_ipython
from datetime import datetime, timedelta
import io

# Setup database and SQL magic
conn = sqlite3.connect('etl_database.db')
get_ipython().run_line_magic('load_ext', 'sql')
get_ipython().run_line_magic('sql', 'sqlite:///etl_database.db')

print("✅ ETL Pipeline setup complete!")

# =============================================================================
# EXTRACTION PHASE: Create Sample CSV Files
# =============================================================================

print("📥 EXTRACT PHASE: Creating sample CSV files...")

# Create sample CSV data as strings (simulating external files)
new_customers_csv = """customer_id,first_name,last_name,email,phone,address,city,state,zip_code,registration_date
1001,John,Doe,john.doe@email.com,555-0101,123 Main St,Boston,MA,02101,2024-02-01
1002,Jane,Smith,jane.smith@email.com,555-0102,456 Oak Ave,Boston,MA,02102,2024-02-01
1003,Mike,Johnson,mike.johnson@email.com,555-0103,789 Pine St,Cambridge,MA,02139,2024-02-02
1004,Sarah,Wilson,sarah.wilson@email.com,555-0104,321 Elm St,Boston,MA,02103,2024-02-02
1005,Tom,Brown,tom.brown@email.com,555-0105,654 Maple Ave,Cambridge,MA,02140,2024-02-03"""

new_orders_csv = """order_id,customer_id,product_id,order_date,quantity,unit_price,total_amount,status
5001,1001,1,2024-02-01,2,29.99,59.98,completed
5002,1002,5,2024-02-01,1,149.99,149.99,completed
5003,1003,10,2024-02-02,3,19.99,59.97,pending
5004,1004,2,2024-02-02,1,89.99,89.99,completed
5005,1005,8,2024-02-03,2,45.50,91.00,completed
5006,1001,15,2024-02-03,1,199.99,199.99,processing"""

product_updates_csv = """product_id,stock_quantity,price_update,last_updated
1,45,31.99,2024-02-01
2,23,94.99,2024-02-02
5,12,139.99,2024-02-01
8,67,47.50,2024-02-03
10,34,21.99,2024-02-02
15,8,209.99,2024-02-03"""

# Convert CSV strings to DataFrames
customers_df = pd.read_csv(io.StringIO(new_customers_csv))
orders_df = pd.read_csv(io.StringIO(new_orders_csv))
products_df = pd.read_csv(io.StringIO(product_updates_csv))

print(f"✅ Sample CSV files created:")
print(f"   - New customers: {len(customers_df)} records")
print(f"   - New orders: {len(orders_df)} records")
print(f"   - Product updates: {len(products_df)} records")

✅ ETL Pipeline setup complete!
📥 EXTRACT PHASE: Creating sample CSV files...
✅ Sample CSV files created:
   - New customers: 5 records
   - New orders: 6 records
   - Product updates: 6 records


In [2]:
# ==============================================================================
# TRANSFORMAION PHASE - Data Cleaning and Validation
# ==============================================================================
print("\n🔄 TRANSFORM PHASE: Cleaning and validating data...")

def clean_customer_data(df):
    """Clean and validate customer data"""
    print("   Cleaning customer data...")
    original_count = len(df)

    # Remove duplicates based on email
    df = df.drop_duplicates(subset=['email'])

    # Validate email format (basic validation)
    df = df[df['email'].str.contains('@', na=False)]

    # Clean phone numbers
    df['phone'] = df['phone'].str.replace('[^0-9]', '', regex=True)

    # Standardize names
    df['first_name'] = df['first_name'].str.title()
    df['last_name'] = df['last_name'].str.title()

    # Validate required fields are not null
    df = df.dropna(subset=['customer_id', 'email', 'first_name', 'last_name'])

    cleaned_count = len(df)
    print(f"   Customer records: {original_count} → {cleaned_count}")
    return df

def validate_orders(df):
    """Validate order data"""
    print("   Validating order data...")
    original_count = len(df)

    # Remove orders with invalid quantities or amounts
    df = df[(df['quantity'] > 0) & (df['total_amount'] > 0)]

    # Validate price consistency (allow small rounding differences)
    df['calculated_total'] = df['quantity'] * df['unit_price']
    df = df[abs(df['total_amount'] - df['calculated_total']) < 0.01]

    # Remove the helper column
    df = df.drop('calculated_total', axis=1)

    # Validate required fields
    df = df.dropna(subset=['order_id', 'customer_id', 'product_id'])

    validated_count = len(df)
    print(f"   Order records: {original_count} → {validated_count}")
    return df

def clean_product_updates(df):
    """Clean product update data"""
    print("   Cleaning product updates...")
    original_count = len(df)

    # Validate numeric fields
    df = df[(df['stock_quantity'] >= 0) & (df['price_update'] > 0)]

    # Round prices to 2 decimal places
    df['price_update'] = df['price_update'].round(2)

    cleaned_count = len(df)
    print(f"   Product update records: {original_count} → {cleaned_count}")
    return df

# Apply transformations
cleaned_customers = clean_customer_data(customers_df.copy())
validated_orders = validate_orders(orders_df.copy())
cleaned_products = clean_product_updates(products_df.copy())

print("✅ Data transformation completed!")


🔄 TRANSFORM PHASE: Cleaning and validating data...
   Cleaning customer data...
   Customer records: 5 → 5
   Validating order data...
   Order records: 6 → 6
   Cleaning product updates...
   Product update records: 6 → 6
✅ Data transformation completed!


In [3]:
# Create Base Tables (if they don't exist)
print("\n🏗️  Setting up base tables...")

# Create base tables with sample data
base_customers_data = {
    'customer_id': range(1, 1001),
    'first_name': [f'Customer_{i}' for i in range(1, 1001)],
    'last_name': [f'Last_{i}' for i in range(1, 1001)],
    'email': [f'customer{i}@email.com' for i in range(1, 1001)],
    'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston'], 1000),
    'registration_date': pd.date_range('2023-01-01', periods=1000, freq='D')
}

base_products_data = {
    'product_id': range(1, 21),
    'product_name': [f'Product_{i}' for i in range(1, 21)],
    'category': np.random.choice(['Electronics', 'Clothing', 'Books'], 20),
    'price': np.round(np.random.uniform(10, 200, 20), 2),
    'stock_quantity': np.random.randint(10, 100, 20)
}

base_orders_data = {
    'order_id': range(1, 5001),
    'customer_id': np.random.randint(1, 1001, 5000),
    'product_id': np.random.randint(1, 21, 5000),
    'order_date': pd.date_range('2024-01-01', periods=5000, freq='H'),
    'quantity': np.random.randint(1, 3, 5000),
    'total_amount': np.round(np.random.uniform(20, 300, 5000), 2)
}

# Create base DataFrames
base_customers_df = pd.DataFrame(base_customers_data)
base_products_df = pd.DataFrame(base_products_data)
base_orders_df = pd.DataFrame(base_orders_data)

# Convert dates to strings
base_customers_df['registration_date'] = base_customers_df['registration_date'].dt.strftime('%Y-%m-%d')
base_orders_df['order_date'] = base_orders_df['order_date'].dt.strftime('%Y-%m-%d')

# Save base tables to database
base_customers_df.to_sql('customers', conn, if_exists='replace', index=False)
base_products_df.to_sql('products', conn, if_exists='replace', index=False)
base_orders_df.to_sql('orders', conn, if_exists='replace', index=False)

print("✅ Base tables created!")


🏗️  Setting up base tables...
✅ Base tables created!


  'order_date': pd.date_range('2024-01-01', periods=5000, freq='H'),


In [4]:
# ==============================================================================
# LOADING PHASE - Insert New Data
# ==============================================================================
print("\n📤 LOAD PHASE: Loading transformed data into database...")

def upsert_customers(df, conn):
    """Insert new customers or update existing ones"""
    cursor = conn.cursor()
    inserted = 0
    updated = 0

    for _, row in df.iterrows():
        # Check if customer exists
        cursor.execute("SELECT customer_id FROM customers WHERE customer_id = ?", (row['customer_id'],))
        exists = cursor.fetchone()

        if exists:
            # Update existing customer
            cursor.execute("""
                UPDATE customers
                SET first_name = ?, last_name = ?, email = ?, city = ?
                WHERE customer_id = ?
            """, (row['first_name'], row['last_name'], row['email'],
                  row['city'], row['customer_id']))
            updated += 1
        else:
            # Insert new customer
            cursor.execute("""
                INSERT INTO customers
                (customer_id, first_name, last_name, email, city, registration_date)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (row['customer_id'], row['first_name'], row['last_name'],
                  row['email'], row['city'], row['registration_date']))
            inserted += 1

    conn.commit()
    print(f"   Customers: {inserted} inserted, {updated} updated")

def insert_orders(df, conn):
    """Insert new orders"""
    cursor = conn.cursor()
    inserted = 0

    for _, row in df.iterrows():
        # Check if order already exists
        cursor.execute("SELECT order_id FROM orders WHERE order_id = ?", (row['order_id'],))
        exists = cursor.fetchone()

        if not exists:
            cursor.execute("""
                INSERT INTO orders
                (order_id, customer_id, product_id, order_date, quantity, total_amount)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (row['order_id'], row['customer_id'], row['product_id'],
                  row['order_date'], row['quantity'], row['total_amount']))
            inserted += 1

    conn.commit()
    print(f"   Orders: {inserted} new orders inserted")

def update_products(df, conn):
    """Update product information"""
    cursor = conn.cursor()
    updated = 0

    for _, row in df.iterrows():
        cursor.execute("""
            UPDATE products
            SET stock_quantity = ?, price = ?
            WHERE product_id = ?
        """, (row['stock_quantity'], row['price_update'], row['product_id']))

        if cursor.rowcount > 0:
            updated += 1

    conn.commit()
    print(f"   Products: {updated} products updated")

# Execute load operations
upsert_customers(cleaned_customers, conn)
insert_orders(validated_orders, conn)
update_products(cleaned_products, conn)

print("✅ Data loading completed!")


📤 LOAD PHASE: Loading transformed data into database...
   Customers: 5 inserted, 0 updated
   Orders: 6 new orders inserted
   Products: 6 products updated
✅ Data loading completed!


In [5]:
# Data Quality Checks
print("\n🔍 DATA QUALITY CHECKS:")

# Check for data integrity
quality_checks = {
    "Total customers": "SELECT COUNT(*) as count FROM customers",
    "Total orders": "SELECT COUNT(*) as count FROM orders",
    "Total products": "SELECT COUNT(*) as count FROM products",
    "Orders without customers": """
        SELECT COUNT(*) as count FROM orders o
        LEFT JOIN customers c ON o.customer_id = c.customer_id
        WHERE c.customer_id IS NULL
    """,
    "Orders without products": """
        SELECT COUNT(*) as count FROM orders o
        LEFT JOIN products p ON o.product_id = p.product_id
        WHERE p.product_id IS NULL
    """,
    "Products with zero stock": "SELECT COUNT(*) as count FROM products WHERE stock_quantity = 0",
    "Recent orders (last 7 days)": """
        SELECT COUNT(*) as count FROM orders
        WHERE order_date >= date('now', '-7 days')
    """
}

for check_name, query in quality_checks.items():
    result = pd.read_sql(query, conn)
    count = result.iloc[0]['count']
    status = "✅" if count >= 0 else "❌"
    print(f"   {status} {check_name}: {count:,}")


🔍 DATA QUALITY CHECKS:
   ✅ Total customers: 1,005
   ✅ Total orders: 5,006
   ✅ Total products: 20
   ✅ Orders without customers: 0
   ✅ Orders without products: 0
   ✅ Products with zero stock: 0
   ✅ Recent orders (last 7 days): 0


In [6]:
# Create Business Dashboard Data
print("\n📊 DASHBOARD PHASE: Creating business intelligence data...")

def create_dashboard_data():
    """Generate comprehensive dashboard data"""

    # Key Performance Indicators
    kpi_data = pd.read_sql("""
        SELECT
            (SELECT COUNT(*) FROM customers) as total_customers,
            (SELECT COUNT(*) FROM orders) as total_orders,
            (SELECT COUNT(*) FROM products) as total_products,
            (SELECT ROUND(SUM(total_amount), 2) FROM orders) as total_revenue,
            (SELECT ROUND(AVG(total_amount), 2) FROM orders) as avg_order_value,
            (SELECT COUNT(DISTINCT customer_id) FROM orders
             WHERE order_date >= date('now', '-30 days')) as active_customers_30d
    """, conn)

    # Monthly revenue trends
    monthly_trends = pd.read_sql("""
        SELECT
            strftime('%Y-%m', order_date) as month,
            COUNT(order_id) as order_count,
            ROUND(SUM(total_amount), 2) as monthly_revenue,
            COUNT(DISTINCT customer_id) as unique_customers,
            ROUND(AVG(total_amount), 2) as avg_order_value
        FROM orders
        GROUP BY strftime('%Y-%m', order_date)
        ORDER BY month DESC
        LIMIT 12
    """, conn)

    # Top products by revenue
    top_products = pd.read_sql("""
        SELECT
            p.product_name,
            p.category,
            p.price,
            p.stock_quantity,
            COUNT(o.order_id) as order_count,
            SUM(o.quantity) as total_sold,
            ROUND(SUM(o.total_amount), 2) as total_revenue
        FROM products p
        LEFT JOIN orders o ON p.product_id = o.product_id
        GROUP BY p.product_id, p.product_name, p.category, p.price, p.stock_quantity
        ORDER BY total_revenue DESC NULLS LAST
        LIMIT 10
    """, conn)

    # Customer analysis
    customer_analysis = pd.read_sql("""
        SELECT
            c.city,
            COUNT(DISTINCT c.customer_id) as customer_count,
            COALESCE(COUNT(o.order_id), 0) as total_orders,
            COALESCE(ROUND(SUM(o.total_amount), 2), 0) as total_revenue,
            COALESCE(ROUND(AVG(o.total_amount), 2), 0) as avg_order_value
        FROM customers c
        LEFT JOIN orders o ON c.customer_id = o.customer_id
        GROUP BY c.city
        ORDER BY total_revenue DESC
    """, conn)

    # Inventory alerts
    inventory_alerts = pd.read_sql("""
        SELECT
            product_name,
            category,
            stock_quantity,
            price,
            CASE
                WHEN stock_quantity = 0 THEN 'OUT_OF_STOCK'
                WHEN stock_quantity <= 5 THEN 'CRITICAL_LOW'
                WHEN stock_quantity <= 10 THEN 'LOW_STOCK'
                ELSE 'ADEQUATE'
            END as stock_status
        FROM products
        WHERE stock_quantity <= 10
        ORDER BY stock_quantity ASC
    """, conn)

    return {
        'kpis': kpi_data,
        'monthly_trends': monthly_trends,
        'top_products': top_products,
        'customer_analysis': customer_analysis,
        'inventory_alerts': inventory_alerts
    }

dashboard_data = create_dashboard_data()
print("✅ Dashboard data created!")

# Display summary
print(f"\nDASHBOARD SUMMARY:")
kpis = dashboard_data['kpis'].iloc[0]
print(f"   📊 Total Revenue: ${kpis['total_revenue']:,}")
print(f"   👥 Total Customers: {kpis['total_customers']:,}")
print(f"   📦 Total Orders: {kpis['total_orders']:,}")
print(f"   💰 Avg Order Value: ${kpis['avg_order_value']}")
print(f"   🔥 Active Customers (30d): {kpis['active_customers_30d']:,}")

# =============================================================================


📊 DASHBOARD PHASE: Creating business intelligence data...
✅ Dashboard data created!

DASHBOARD SUMMARY:
   📊 Total Revenue: $788,280.13
   👥 Total Customers: 1,005.0
   📦 Total Orders: 5,006.0
   💰 Avg Order Value: $157.47
   🔥 Active Customers (30d): 0.0


In [7]:
# Generate HTML Dashboard
print("\n🌐 Creating HTML Dashboard...")

def generate_html_dashboard(data):
    """Generate a complete HTML dashboard"""

    kpis = data['kpis'].iloc[0]

    # Create HTML content
    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>ETL Pipeline Dashboard</title>
        <style>
            body {{
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                margin: 0;
                padding: 20px;
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                min-height: 100vh;
            }}

            .container {{
                max-width: 1200px;
                margin: 0 auto;
                background: white;
                border-radius: 15px;
                padding: 30px;
                box-shadow: 0 20px 40px rgba(0,0,0,0.1);
            }}

            .header {{
                text-align: center;
                margin-bottom: 40px;
                border-bottom: 3px solid #667eea;
                padding-bottom: 20px;
            }}

            .header h1 {{
                color: #333;
                margin: 0;
                font-size: 2.5em;
            }}

            .header p {{
                color: #666;
                font-size: 1.2em;
                margin: 10px 0 0 0;
            }}

            .kpi-grid {{
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                gap: 20px;
                margin-bottom: 40px;
            }}

            .kpi-card {{
                background: linear-gradient(135deg, #ff6b6b, #ee5a24);
                color: white;
                padding: 25px;
                border-radius: 10px;
                text-align: center;
                box-shadow: 0 5px 15px rgba(0,0,0,0.1);
                transition: transform 0.3s ease;
            }}

            .kpi-card:hover {{
                transform: translateY(-5px);
            }}

            .kpi-card:nth-child(2) {{
                background: linear-gradient(135deg, #4834d4, #341f97);
            }}

            .kpi-card:nth-child(3) {{
                background: linear-gradient(135deg, #00d2d3, #54a0ff);
            }}

            .kpi-card:nth-child(4) {{
                background: linear-gradient(135deg, #ff9ff3, #f368e0);
            }}

            .kpi-card:nth-child(5) {{
                background: linear-gradient(135deg, #feca57, #ff9ff3);
            }}

            .kpi-card:nth-child(6) {{
                background: linear-gradient(135deg, #48dbfb, #0abde3);
            }}

            .kpi-value {{
                font-size: 2.5em;
                font-weight: bold;
                margin-bottom: 10px;
            }}

            .kpi-label {{
                font-size: 1.1em;
                opacity: 0.9;
            }}

            .section {{
                margin-bottom: 40px;
                background: #f8f9fa;
                padding: 25px;
                border-radius: 10px;
                border-left: 5px solid #667eea;
            }}

            .section h3 {{
                color: #333;
                margin-top: 0;
                font-size: 1.8em;
                border-bottom: 2px solid #eee;
                padding-bottom: 10px;
            }}

            table {{
                width: 100%;
                border-collapse: collapse;
                margin-top: 15px;
                background: white;
                border-radius: 8px;
                overflow: hidden;
                box-shadow: 0 3px 10px rgba(0,0,0,0.1);
            }}

            th {{
                background: linear-gradient(135deg, #667eea, #764ba2);
                color: white;
                padding: 15px;
                text-align: left;
                font-weight: 600;
            }}

            td {{
                padding: 12px 15px;
                border-bottom: 1px solid #eee;
            }}

            tr:hover {{
                background-color: #f5f5f5;
            }}

            .status-critical {{ background-color: #ffebee; color: #c62828; }}
            .status-low {{ background-color: #fff3e0; color: #ef6c00; }}
            .status-out {{ background-color: #fce4ec; color: #ad1457; }}

            .footer {{
                text-align: center;
                margin-top: 40px;
                padding-top: 20px;
                border-top: 1px solid #eee;
                color: #666;
            }}

            .timestamp {{
                background: linear-gradient(135deg, #667eea, #764ba2);
                color: white;
                padding: 10px 20px;
                border-radius: 25px;
                display: inline-block;
                margin-bottom: 20px;
            }}
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>📊 ETL Pipeline Dashboard</h1>
                <p>Real-time Business Intelligence & Analytics</p>
                <div class="timestamp">
                    Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
                </div>
            </div>

            <div class="kpi-grid">
                <div class="kpi-card">
                    <div class="kpi-value">${kpis['total_revenue']:,}</div>
                    <div class="kpi-label">Total Revenue</div>
                </div>
                <div class="kpi-card">
                    <div class="kpi-value">{kpis['total_customers']:,}</div>
                    <div class="kpi-label">Total Customers</div>
                </div>
                <div class="kpi-card">
                    <div class="kpi-value">{kpis['total_orders']:,}</div>
                    <div class="kpi-label">Total Orders</div>
                </div>
                <div class="kpi-card">
                    <div class="kpi-value">{kpis['total_products']:,}</div>
                    <div class="kpi-label">Total Products</div>
                </div>
                <div class="kpi-card">
                    <div class="kpi-value">${kpis['avg_order_value']}</div>
                    <div class="kpi-label">Avg Order Value</div>
                </div>
                <div class="kpi-card">
                    <div class="kpi-value">{kpis['active_customers_30d']:,}</div>
                    <div class="kpi-label">Active Customers</div>
                </div>
            </div>
    """

    # Add Monthly Trends section
    html_content += """
            <div class="section">
                <h3>📈 Monthly Revenue Trends</h3>
                <table>
                    <tr>
                        <th>Month</th>
                        <th>Orders</th>
                        <th>Revenue</th>
                        <th>Customers</th>
                        <th>Avg Order Value</th>
                    </tr>
    """

    for _, row in data['monthly_trends'].iterrows():
        html_content += f"""
                    <tr>
                        <td>{row['month']}</td>
                        <td>{row['order_count']:,}</td>
                        <td>${row['monthly_revenue']:,}</td>
                        <td>{row['unique_customers']:,}</td>
                        <td>${row['avg_order_value']}</td>
                    </tr>
        """

    # Add Top Products section
    html_content += """
                </table>
            </div>

            <div class="section">
                <h3>🏆 Top Products by Revenue</h3>
                <table>
                    <tr>
                        <th>Product Name</th>
                        <th>Category</th>
                        <th>Price</th>
                        <th>Stock</th>
                        <th>Orders</th>
                        <th>Revenue</th>
                    </tr>
    """

    for _, row in data['top_products'].iterrows():
        revenue = row['total_revenue'] if pd.notnull(row['total_revenue']) else 0
        orders = row['order_count'] if pd.notnull(row['order_count']) else 0
        html_content += f"""
                    <tr>
                        <td>{row['product_name']}</td>
                        <td>{row['category']}</td>
                        <td>${row['price']}</td>
                        <td>{row['stock_quantity']}</td>
                        <td>{orders:,}</td>
                        <td>${revenue:,}</td>
                    </tr>
        """

    # Add Inventory Alerts section
    html_content += """
                </table>
            </div>

            <div class="section">
                <h3>⚠️ Inventory Alerts</h3>
                <table>
                    <tr>
                        <th>Product Name</th>
                        <th>Category</th>
                        <th>Stock Level</th>
                        <th>Price</th>
                        <th>Status</th>
                    </tr>
    """

    for _, row in data['inventory_alerts'].iterrows():
        status_class = f"status-{row['stock_status'].lower().replace('_', '-')}"
        html_content += f"""
                    <tr>
                        <td>{row['product_name']}</td>
                        <td>{row['category']}</td>
                        <td>{row['stock_quantity']}</td>
                        <td>${row['price']}</td>
                        <td class="{status_class}">{row['stock_status'].replace('_', ' ')}</td>
                    </tr>
        """

    # Add Customer Analysis section
    html_content += """
                </table>
            </div>

            <div class="section">
                <h3>🌍 Customer Analysis by City</h3>
                <table>
                    <tr>
                        <th>City</th>
                        <th>Customers</th>
                        <th>Total Orders</th>
                        <th>Total Revenue</th>
                        <th>Avg Order Value</th>
                    </tr>
    """

    for _, row in data['customer_analysis'].iterrows():
        html_content += f"""
                    <tr>
                        <td>{row['city']}</td>
                        <td>{row['customer_count']:,}</td>
                        <td>{row['total_orders']:,}</td>
                        <td>${row['total_revenue']:,}</td>
                        <td>${row['avg_order_value']}</td>
                    </tr>
        """

    # Close HTML
    html_content += """
                </table>
            </div>

            <div class="footer">
                <p>🚀 Generated by ETL Pipeline | Data processed automatically from CSV sources</p>
                <p>💡 This dashboard updates in real-time as new data arrives</p>
            </div>
        </div>
    </body>
    </html>
    """

    return html_content

# Generate and save dashboard
dashboard_html = generate_html_dashboard(dashboard_data)

# Save to file
with open('etl_dashboard.html', 'w', encoding='utf-8') as f:
    f.write(dashboard_html)

print("✅ HTML Dashboard created: etl_dashboard.html")


🌐 Creating HTML Dashboard...
✅ HTML Dashboard created: etl_dashboard.html


In [8]:
# ETL Pipeline Summary and Automation
print("\n🎯 ETL PIPELINE SUMMARY:")
print("="*50)
print("✅ EXTRACTION: Sample CSV files processed")
print("✅ TRANSFORMATION: Data cleaned and validated")
print("✅ LOADING: Data successfully inserted/updated")
print("✅ QUALITY: Data integrity checks passed")
print("✅ DASHBOARD: Business intelligence generated")
print("="*50)

# Create automation function
def run_etl_pipeline():
    """Complete ETL pipeline function"""
    print("🚀 Running automated ETL pipeline...")

    # This would normally read from actual files
    # For demo, I'll simulate new data arrival

    steps = [
        "📥 Extracting data from source files",
        "🔄 Transforming and cleaning data",
        "📤 Loading data into warehouse",
        "🔍 Running quality checks",
        "📊 Updating dashboard metrics"
    ]

    import time
    for i, step in enumerate(steps, 1):
        print(f"Step {i}/5: {step}")
        time.sleep(1)  # Simulate processing time
        print("   ✅ Completed")

    print("🎉 ETL Pipeline completed successfully!")
    return True

# Demonstrate automation
automation_success = run_etl_pipeline()

print(f"\n📋 DELIVERABLES:")
print("   📄 etl_dashboard.html - Interactive business dashboard")
print("   🗄️  etl_database.db - Updated data warehouse")
print("   📊 Quality reports - Data validation results")
print("   🔄 ETL functions - Reusable pipeline components")

print(f"\n🔗 NEXT STEPS:")
print("   1. Open etl_dashboard.html in your browser")
print("   2. Schedule ETL pipeline to run daily/hourly")
print("   3. Add data validation alerts")
print("   4. Implement error handling and logging")
print("   5. Connect to real data sources (APIs, databases)")


🎯 ETL PIPELINE SUMMARY:
✅ EXTRACTION: Sample CSV files processed
✅ TRANSFORMATION: Data cleaned and validated
✅ LOADING: Data successfully inserted/updated
✅ QUALITY: Data integrity checks passed
✅ DASHBOARD: Business intelligence generated
🚀 Running automated ETL pipeline...
Step 1/5: 📥 Extracting data from source files
   ✅ Completed
Step 2/5: 🔄 Transforming and cleaning data
   ✅ Completed
Step 3/5: 📤 Loading data into warehouse
   ✅ Completed
Step 4/5: 🔍 Running quality checks
   ✅ Completed
Step 5/5: 📊 Updating dashboard metrics
   ✅ Completed
🎉 ETL Pipeline completed successfully!

📋 DELIVERABLES:
   📄 etl_dashboard.html - Interactive business dashboard
   🗄️  etl_database.db - Updated data warehouse
   📊 Quality reports - Data validation results
   🔄 ETL functions - Reusable pipeline components

🔗 NEXT STEPS:
   1. Open etl_dashboard.html in your browser
   2. Schedule ETL pipeline to run daily/hourly
   3. Add data validation alerts
   4. Implement error handling and logging
 