In [None]:
import json
from datetime import datetime

In [None]:
# Load configuration
def load_config(config_path="/dbfs/FileStore/configs/config.json"):
    """Load the pipeline configuration from a JSON file.

    Args:
        config_path: Path of the JSON configuration file. Defaults to the
            location this notebook has always read from, so existing
            zero-argument calls behave as before.

    Returns:
        The parsed JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # interpreter's locale-specific default encoding.
    with open(config_path, "r", encoding="utf-8") as config_file:
        return json.load(config_file)

# Parsed once when this cell runs; run_pipeline() reads the Delta table
# paths from config["data_storage"].
config = load_config()

In [None]:

def run_pipeline():
    """Run the bronze, silver and gold notebooks in order and summarize the run.

    Relies on the notebook-level globals ``dbutils``, ``spark`` and ``config``.
    Each stage is started with ``dbutils.notebook.run``; stages run strictly
    in sequence, so an exception in one stage prevents the later ones from
    running. After the last stage, the Delta tables at the paths found under
    ``config["data_storage"]`` are loaded and each is counted exactly once;
    the same counts are printed and returned.

    Returns:
        dict: ``status`` (``"success"``), ``duration_minutes`` (time spent
        running the three notebooks, measured before the final counts), and
        ``bronze_count``, ``silver_count``, ``gold_daily_count``,
        ``gold_monthly_count``.

    Raises:
        Exception: Any error raised by a stage or by the final table reads is
            printed and then re-raised unchanged.
    """
    # (banner to print, notebook path, timeout in seconds), in execution order.
    stages = (
        ("\n== STEP 1: Bronze Layer - Data Ingestion ==", "01_bronze_layer", 1800),
        ("\n== STEP 2: Silver Layer - Data Transformation ==", "02_silver_layer", 1200),
        ("\n== STEP 3: Gold Layer - Analytics Views ==", "03_gold_layer", 1200),
    )
    # (printed label, key in the returned dict, key under config["data_storage"]).
    layers = (
        ("Bronze layer", "bronze_count", "bronze_path"),
        ("Silver layer", "silver_count", "silver_path"),
        ("Gold daily metrics", "gold_daily_count", "gold_daily_path"),
        ("Gold monthly metrics", "gold_monthly_count", "gold_monthly_path"),
    )

    start_time = datetime.now()
    print(f"Pipeline started at: {start_time}")

    try:
        for banner, notebook_path, timeout_seconds in stages:
            print(banner)
            dbutils.notebook.run(notebook_path, timeout_seconds=timeout_seconds)

        # Duration covers the notebook stages only, not the summary counts below.
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds() / 60

        print("\n== PIPELINE EXECUTION SUMMARY ==")
        print("Pipeline completed successfully!")
        print(f"Start time: {start_time}")
        print(f"End time: {end_time}")
        print(f"Duration: {duration:.2f} minutes")

        print("\n== DATA SUMMARY ==")
        storage_paths = config["data_storage"]
        # Load every layer before counting so a missing path key fails
        # before any count is printed.
        layer_frames = [
            (label, result_key, spark.read.format("delta").load(storage_paths[path_key]))
            for label, result_key, path_key in layers
        ]

        summary = {"status": "success", "duration_minutes": duration}
        for label, result_key, layer_df in layer_frames:
            # count() launches a Spark job; run it once and reuse the value
            # so the printed and returned numbers cannot diverge.
            record_count = layer_df.count()
            print(f"{label}: {record_count} records")
            summary[result_key] = record_count

        return summary

    except Exception as e:
        print(f"Pipeline failed with error: {str(e)}")
        # Bare raise re-raises the active exception as-is.
        raise


In [None]:
# Run the pipeline. The returned summary dict (status, duration and per-layer
# record counts) is kept for the summary table built in the next cell; any
# failure is re-raised by run_pipeline() and stops the notebook here.
pipeline_results = run_pipeline()

In [None]:
# Show the per-layer record counts as a table, but only for a successful run
if pipeline_results.get("status") == "success":
    # Pair each display label with the result key holding that layer's count
    layer_data = [
        (layer_label, pipeline_results[count_key])
        for layer_label, count_key in (
            ("Bronze Layer", "bronze_count"),
            ("Silver Layer", "silver_count"),
            ("Gold Daily", "gold_daily_count"),
            ("Gold Monthly", "gold_monthly_count"),
        )
    ]

    summary_df = spark.createDataFrame(layer_data, ["Layer", "Record Count"])
    display(summary_df)