# In [3]:
from datetime import datetime, timedelta

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from IPython.display import display, Markdown, HTML
from ipywidgets import interact, FloatSlider

# Configure visualization settings
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
# later dropped the old name; use whichever one this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams["figure.figsize"] = (12, 5)
sns.set_palette("husl")

# Enhanced mock function with interactive elements
def mock_spark_minio_etl(**kwargs):
    """Simulate the Spark/MinIO ETL step and render notebook feedback.

    Plots a histogram of randomly generated meter values next to a
    cumulative hourly throughput line, then shows two sliders that echo
    the selected simulation parameters. No storage is read or written;
    all figures are synthetic.

    Args:
        **kwargs: Task context passed by the caller. If it contains a
            ``params`` mapping, its ``processing_time`` and
            ``error_rate`` entries seed the sliders' starting values;
            otherwise 1.0 and 0.05 are used.

    Returns:
        dict: ``status``, ``records_processed`` and ``output_path``
        describing the simulated run.
    """
    record_count = 10000
    params = kwargs.get("params") or {}

    display(Markdown("## 🚀 Spark/MinIO ETL Simulation"))

    # Simulate data processing
    with plt.ioff():
        fig, (ax1, ax2) = plt.subplots(1, 2)

        # Mock data distribution
        data = pd.DataFrame({
            'value': np.random.normal(100, 15, record_count),
            # 'min' is the minute alias; the older 'T' spelling is
            # deprecated in recent pandas releases.
            'timestamp': pd.date_range(end=datetime.now(), periods=record_count, freq='min')
        })
        sns.histplot(data['value'], bins=30, ax=ax1, kde=True)
        ax1.set_title("📊 Smart Meter Value Distribution")

        # Mock throughput timeline
        throughput = np.cumsum(np.random.poisson(500, 24))
        sns.lineplot(x=range(24), y=throughput, ax=ax2, marker='o')
        ax2.set_title("⏱ Records Processed by Hour")
        ax2.set_xlabel("Hour of Day")
        ax2.set_ylabel("Total Records")

        plt.tight_layout()
        plt.show()

    # Interactive parameter control
    display(Markdown("### 🎚 Processing Parameters"))

    def show_effect(processing_time=2.0, error_rate=0.05):
        # The text is built without leading indentation: Markdown treats
        # lines indented by four or more spaces as a code block, which
        # would show the raw "- **...**" markup instead of a bullet list.
        display(Markdown(
            f"- **Simulated Processing Time**: {processing_time:.1f}s per 1000 records\n"
            f"- **Simulated Error Rate**: {error_rate:.1%}"
        ))

    interact(
        show_effect,
        processing_time=FloatSlider(
            value=float(params.get("processing_time", 1.0)),
            min=0.5, max=5.0, step=0.5,
        ),
        error_rate=FloatSlider(
            value=float(params.get("error_rate", 0.05)),
            min=0.0, max=0.2, step=0.01,
        ),
    )

    return {
        "status": "success",
        "records_processed": record_count,
        "output_path": "s3a://default/output/processed_data.parquet"
    }

# Create DAG with enhanced metadata
# Hourly DAG without catch-up runs, holding the single mock ETL task.
# Failed tasks are retried twice, three minutes apart (timedelta comes from
# the datetime module).
with DAG(
    dag_id="spark_minio_etl",
    schedule="@hourly",
    start_date=datetime(2024, 1, 1),
    catchup=False,
    default_args={
        "retries": 2,
        "retry_delay": timedelta(minutes=3),
    },
    # Kept flush-left: a body line indented four spaces after the heading
    # would be rendered as a Markdown code block instead of formatted text.
    doc_md=(
        "### Spark/MinIO Data Pipeline\n"
        "\n"
        "**Purpose**: Process smart meter data from MinIO using Spark\n"
    ),
) as dag:

    processing_task = PythonOperator(
        task_id="data_processing",
        python_callable=mock_spark_minio_etl,
        doc="Transform raw JSON data to processed Parquet"
    )

# Pipeline diagram: one gold cylinder node for the task, laid out left-to-right
display(Markdown("## 🔍 Pipeline Visualization"))

dot = graphviz.Digraph(graph_attr=dict(rankdir='LR'))
_task_node_attrs = dict(
    shape='cylinder',
    style='filled',
    fillcolor='#FFD700',  # gold marks Spark tasks
    fontname='Helvetica',
    tooltip='Spark/MinIO Processing',
)
dot.node('data_processing', **_task_node_attrs)

display(dot)

# Run the task once, in-process, with a hand-built context and tabulate
# whatever it returns.
for _heading in (
    "## 🧪 Interactive Testing",
    "Execute the Spark task with different parameters:",
):
    display(Markdown(_heading))

_test_context = {
    "execution_date": datetime.now(),
    "params": {"processing_time": 1.5, "error_rate": 0.03},
}
test_result = processing_task.execute(context=_test_context)

display(Markdown("### 📊 Execution Results"))
# One row per returned key, with a single "Value" column.
_result_frame = pd.DataFrame([test_result]).T
_result_frame = _result_frame.rename(columns={0: "Value"})
display(_result_frame)

# Summarize the DAG's configuration as a Property/Value table
display(Markdown("## 📝 Pipeline Metadata"))

_retry_cfg = dag.default_args
_task_ids = [task.task_id for task in dag.tasks]

metadata = {
    "DAG ID": dag.dag_id,
    "Schedule": dag.schedule_interval,
    "Start Date": dag.start_date.strftime("%Y-%m-%d"),
    "Tasks": _task_ids,
    "Retry Policy": f"{_retry_cfg['retries']} retries, {_retry_cfg['retry_delay']} delay",
}

_metadata_frame = pd.DataFrame(metadata.items(), columns=["Property", "Value"])
display(_metadata_frame.set_index("Property"))

# Out: <DAG: mock_spark_minio_etl>