In [2]:
from datetime import datetime, timedelta

import graphviz
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from IPython.display import display, Markdown, HTML
from ipywidgets import interact, Dropdown
from psycopg2 import sql

# Plotting defaults for the notebook: ggplot theme, 12x6 figures, pastel palette
plt.style.use('ggplot')
plt.rcParams.update({"figure.figsize": (12, 6)})
sns.set_palette("pastel")

# Enhanced Postgres ETL with visualization
def hybrid_postgres_etl(**kwargs):
    """Run the customer-count ETL against Postgres with notebook-style output.

    Connects to Postgres, renders an interactive explorer over the tables of
    the ``public`` schema (row count plus a five-row sample per table), then
    counts the rows of ``dim_customer`` and plots that count next to a mock
    target of ``count * 1.1``.

    Args:
        **kwargs: Accepted so the caller can pass a task context; not read.

    Returns:
        dict: ``status`` (always ``"success"``), ``records_processed`` (row
        count of ``dim_customer``) and ``timestamp`` (``datetime.now()`` in
        ISO format, i.e. naive local time).

    Raises:
        Exception: Any connection, query or plotting error is reported via
            ``display`` and then re-raised unchanged.
    """
    display(Markdown("## 🐘 Postgres Data Pipeline"))

    # Bound before the try so `finally` can test it without probing locals().
    conn = None
    try:
        # Connection with visual feedback
        display(Markdown("### 🔌 Database Connection"))
        # NOTE(review): credentials are hard-coded here; consider sourcing
        # them from configuration instead of the source file.
        conn = psycopg2.connect(
            host="postgres",
            database="postgres",
            user="postgres",
            password="postgres",
            port="5432"
        )
        display(Markdown(f"✅ Connected to `{conn.info.dbname}` at {datetime.now().strftime('%H:%M:%S')}"))

        # Interactive table selector
        display(Markdown("### 🔍 Table Explorer"))
        with conn.cursor() as cur:
            cur.execute("""
                SELECT table_name 
                FROM information_schema.tables 
                WHERE table_schema = 'public'
            """)
            tables = [row[0] for row in cur.fetchall()]

        @interact(table=Dropdown(options=tables, description="Select table:"))
        def show_table_stats(table):
            # The widget outlives this function call: once `finally` below has
            # closed the connection, later dropdown changes cannot query.
            if conn.closed:
                display(Markdown("⚠️ Connection closed; re-run the pipeline to explore tables"))
                return

            try:
                with conn.cursor() as table_cur:
                    # Get row count (identifier is quoted via psycopg2.sql)
                    table_cur.execute(
                        sql.SQL("SELECT COUNT(*) FROM {}").format(sql.Identifier(table))
                    )
                    count = table_cur.fetchone()[0]

                    # Get sample data
                    table_cur.execute(
                        sql.SQL("SELECT * FROM {} LIMIT 5").format(sql.Identifier(table))
                    )
                    cols = [desc[0] for desc in table_cur.description]
                    data = table_cur.fetchall()

                # Display stats
                display(Markdown(f"**{table}** contains `{count:,}` records"))

                # Show sample data
                if data:
                    df = pd.DataFrame(data, columns=cols)
                    display(df.style
                          .set_caption(f"Sample from {table}")
                          .set_table_styles([{
                              'selector': 'caption',
                              'props': [('font-size', '16px'),
                                       ('font-weight', 'bold')]
                          }]))
            except Exception as e:
                # A failed statement leaves the transaction aborted; roll back
                # so the ETL query below can still run on this connection.
                if not conn.closed:
                    conn.rollback()
                display(Markdown(f"❌ Error reading {table}: `{str(e)}`"))

        # Main ETL operation
        display(Markdown("### ⚙️ ETL Operation"))
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM dim_customer")
            count = cur.fetchone()[0]

        # Create summary visualization. A two-entry Series yields one bar per
        # label, so the tick labels come from the index instead of being
        # forced onto a single tick (which raises on a label-count mismatch).
        fig, ax = plt.subplots()
        pd.Series({
            "Current": count,
            "Target": count * 1.1,  # Mock target
        }).plot(kind='bar', ax=ax, width=0.3, rot=0)
        ax.set_title("Customer Data Volume")
        ax.set_ylabel("Record Count")
        plt.show()
        # Release the figure explicitly; non-interactive backends keep it
        # alive after show().
        plt.close(fig)

        return {
            "status": "success",
            "records_processed": count,
            "timestamp": datetime.now().isoformat()
        }

    except Exception as e:
        display(Markdown(f"## ❌ Pipeline Failed\n`{str(e)}`"))
        raise
    finally:
        if conn is not None:
            conn.close()
            display(Markdown("🛑 Connection closed"))

# Daily DAG wrapping the Postgres ETL callable in a single PythonOperator task
with DAG(
    dag_id="production_postgres_etl",
    schedule="@daily",
    start_date=datetime(2024, 1, 1),
    catchup=False,
    default_args={
        "retries": 3,
        # Requires `timedelta` to be imported from `datetime` at file top.
        "retry_delay": timedelta(minutes=5),
        "owner": "data_engineering",
    },
    tags=["postgres", "etl"],
    # Assembled from flush-left lines: in a triple-quoted literal the
    # continuation lines carry a 4-space indent, which Markdown treats as
    # indented code blocks rather than headings/lists.
    doc_md=(
        "### Production Postgres ETL\n"
        "**Purpose**: Daily customer data processing\n"
        "\n"
        "**Tables**:\n"
        "- `dim_customer` - Master customer records\n"
        "- `fact_transactions` - Customer activity\n"
        "\n"
        "**Dependencies**:\n"
        "- Postgres 15+\n"
        "- Airflow 2.9+\n"
    ),
) as dag:

    # Single task: runs hybrid_postgres_etl and returns its summary dict
    etl_task = PythonOperator(
        task_id="customer_data_processing",
        python_callable=hybrid_postgres_etl,
        doc="""Extracts and transforms customer data including:
        - Record counts
        - Data quality checks
        - Summary statistics"""
    )

# Draw the pipeline as a top-to-bottom Graphviz diagram with one styled node
display(Markdown("## 🔗 Pipeline Architecture"))
dot = graphviz.Digraph(graph_attr=dict(rankdir='TB'))
dot.node(
    'customer_data_processing',
    **{
        'shape': 'cylinder',
        'style': 'filled',
        'fillcolor': '#ADD8E6',
        'fontname': 'Helvetica',
        'tooltip': 'Postgres ETL',
    },
)
display(dot)

# Run the task once in-process and tabulate the dict it returns
display(Markdown("## 🧪 Test Execution"))
test_result = etl_task.execute(context=dict(execution_date=datetime.now()))
display(Markdown("### 📊 Execution Results"))
display(
    pd.DataFrame([test_result])
    .transpose()
    .rename(columns={0: "Value"})
)

# Summarise the DAG's key settings as a Property/Value table
display(Markdown("## 📝 Technical Metadata"))
metadata = dict([
    ("DAG ID", dag.dag_id),
    ("Schedule", dag.schedule_interval),
    ("Owner", dag.default_args["owner"]),
    ("Retries", dag.default_args["retries"]),
    ("Tags", ", ".join(dag.tags)),
])
display(
    pd.DataFrame(list(metadata.items()), columns=["Property", "Value"])
    .set_index("Property")
)

<DAG: hybrid_postgres_dag>