In [1]:
"""
This notebook implements an end-to-end ETL pipeline that:
1. Reads streaming data from Kafka
2. Transforms the data with Spark
3. Writes results to PostgreSQL (for analytics) and MinIO (for archival)
"""

'\nThis notebook implements an end-to-end ETL pipeline that:\n1. Reads streaming data from Kafka\n2. Transforms the data with Spark\n3. Writes results to PostgreSQL (for analytics) and MinIO (for archival)\n'

In [1]:
# Import libraries
import psycopg2
import sys
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    col, from_json, current_timestamp, to_date, hour, dayofweek, when, lit, udf
)
from pyspark.sql.types import (
    StructType, StructField, StringType, TimestampType, 
    DoubleType, IntegerType, BooleanType
)
from typing import Any
from time import sleep 
from psycopg2 import sql

In [9]:
# Build the Spark session used by every later cell (Kafka source, S3A/MinIO sink).

# Connector jars handed to Spark via `spark.jars` as one comma-separated string.
spark_jars = [
    "/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.5.0.jar",
    "/opt/spark/jars/kafka-clients-3.5.0.jar",
    "/opt/spark/jars/kafka_2.12-3.5.0.jar",
    "/opt/spark/jars/commons-pool2-2.11.1.jar",
    "/opt/spark/jars/lz4-java-1.8.0.jar",
    "/opt/spark/jars/snappy-java-1.1.10.1.jar",
    "/opt/spark/jars/hadoop-aws-3.3.4.jar",
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar",
]

# JVM flags for the driver, joined with single spaces into one option string.
driver_java_options = " ".join([
    "-Dio.netty.tryReflectionSetAccessible=true",
    "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
    "--add-opens=java.base/java.lang=ALL-UNNAMED",
    "--add-opens=java.base/java.util=ALL-UNNAMED",
])

# Every setting is applied to the builder in the order listed here.
spark_settings = {
    # JAR configuration
    "spark.jars": ",".join(spark_jars),

    # Classpath configuration
    "spark.driver.extraClassPath": "/opt/spark/jars/*",
    "spark.executor.extraClassPath": "/opt/spark/jars/*",
    "spark.executor.userClassPathFirst": "true",

    # MinIO/S3 configuration
    # NOTE(review): the access/secret key values are placeholders committed in
    # the notebook; real credentials should not be stored here.
    "spark.hadoop.fs.s3a.access.key": "your-minio-access-key",
    "spark.hadoop.fs.s3a.secret.key": "your-minio-secret-key",
    "spark.hadoop.fs.s3a.endpoint": "http://minio:9000",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",

    # Kafka-specific settings
    "spark.sql.streaming.kafka.useDeprecatedOffsetFetching": "false",
    "spark.kafka.consumer.cache.enabled": "false",
    "spark.streaming.kafka.maxRatePerPartition": "1000",

    # JVM options
    "spark.driver.extraJavaOptions": driver_java_options,

    # Performance tuning
    "spark.sql.shuffle.partitions": "4",
    "spark.default.parallelism": "4",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

session_builder = SparkSession.builder.appName("SmartMeterETL")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [10]:
# Smoke-test the Kafka source before building the streaming pipeline.
from pyspark.sql.utils import StreamingQueryException

# NOTE(review): this only builds a batch reader for topic "dummy" and calls
# load(); no action is run on the result. load() may be lazy, in which case the
# success message does not prove a broker was reached - confirm.
try:
    test_df = (
        spark.read
        .format("kafka")
        .option("kafka.bootstrap.servers", "kafka-1:9092")
        .option("subscribe", "dummy")
        .option("startingOffsets", "earliest")
        .load()
    )
    print("✅ Kafka test connection successful")
except Exception as exc:
    # Report the failure, then let it propagate so the cell visibly fails.
    print(f"❌ Kafka connection failed: {exc}")
    raise

✅ Kafka test connection successful


In [13]:
## 1. Extract: Read Streaming Data from Kafka

# Schema of the JSON document expected in each Kafka record value.
meter_schema = (
    StructType()
    .add("meter_id", StringType())
    .add("timestamp", TimestampType())
    .add("kwh_usage", DoubleType())
    .add("voltage", IntegerType())
    .add("customer_id", StringType())
    .add("region", StringType())
)

# Options for the streaming Kafka source; only records arriving after the
# query starts are read ("latest").
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka-1:9092,kafka-2:9095",
    "subscribe": "smart_meter_data",
    "startingOffsets": "latest",
    "kafka.security.protocol": "PLAINTEXT",
    "failOnDataLoss": "false",
    "minPartitions": "1",
}

kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

# Decode the record value as a string, parse it against the schema, and
# flatten the resulting struct into top-level columns.
meter_payload = from_json(col("value").cast("string"), meter_schema).alias("data")
parsed_df = kafka_df.select(meter_payload).select("data.*")

In [16]:
## 2. Transform: Clean and Enrich Data

# Define validation UDFs
@udf(returnType=BooleanType())
def is_valid_voltage(voltage: int) -> bool:
    """Return True only when the reading equals an accepted voltage (230 or 240)."""
    accepted_voltages = (230, 240)
    return voltage in accepted_voltages

@udf(returnType=BooleanType())
def is_valid_kwh(kwh: float | int | None) -> bool:
    """Check if kWh usage is within reasonable bounds (0 to 20, inclusive).

    Args:
        kwh: The usage reading, or None when the field was missing/unparseable.

    Returns:
        True when ``0 <= kwh <= 20``; False otherwise, including for None.
    """
    # A null column value arrives here as None; comparing None with an int
    # raises TypeError and would fail the whole micro-batch, so treat a
    # missing reading as invalid instead.
    if kwh is None:
        return False
    return 0 <= kwh <= 20

# Transformation pipeline: derive time and cost features, flag data quality,
# drop rows missing key fields, and tag the record source.

# Per-region tariff applied to the usage; regions other than Auckland and
# Wellington fall through to the 0.20 rate.
regional_cost = (
    when(col("region") == "Auckland", col("kwh_usage") * 0.25)
    .when(col("region") == "Wellington", col("kwh_usage") * 0.23)
    .otherwise(col("kwh_usage") * 0.20)
)

# Peak window is hours 17 through 21, both ends inclusive.
in_peak_window = col("hour_of_day").between(17, 21)

# A row is VALID only when both the voltage and kWh checks pass.
quality_label = (
    when(col("is_valid_voltage") & col("is_valid_kwh"), "VALID")
    .otherwise("INVALID")
)

# Rows must carry a meter id, a customer id and a timestamp to be kept.
required_fields_present = (
    col("meter_id").isNotNull()
    & col("customer_id").isNotNull()
    & col("timestamp").isNotNull()
)

enhanced_df = (
    parsed_df
    # Time-derived and cost features
    .withColumn("processing_time", current_timestamp())
    .withColumn("date", to_date(col("timestamp")))
    .withColumn("hour_of_day", hour(col("timestamp")))
    .withColumn("day_of_week", dayofweek(col("timestamp")))
    .withColumn("cost", regional_cost)
    .withColumn("is_peak", in_peak_window)
    # Data quality flags
    .withColumn("is_weekend", col("day_of_week").isin(1, 7))
    .withColumn("is_valid_voltage", is_valid_voltage(col("voltage")))
    .withColumn("is_valid_kwh", is_valid_kwh(col("kwh_usage")))
    .withColumn("data_quality_flag", quality_label)
    # Null handling
    .where(required_fields_present)
    # Record source
    .withColumn("source_system", lit("kafka_stream"))
)

In [17]:
## 3. Load: Write to Postgres

# Function to write batch to PostgreSQL
def write_to_postgres(batch_df: DataFrame, batch_id: Any) -> None:
    """Insert one micro-batch into the ``fact_smart_meter_readings`` table.

    The selected columns are collected to the driver as a pandas DataFrame and
    inserted with ``executemany`` in chunks of 1000 rows. All chunks of a batch
    are committed together, so a failure part-way through leaves no rows from
    this batch in the table.

    Args:
        batch_df: The micro-batch DataFrame passed in by ``foreachBatch``.
        batch_id: The micro-batch identifier, used only in log messages.

    Raises:
        Exception: Any error from collecting the batch or from the database is
            printed, the open transaction is rolled back, and the error is
            re-raised so the streaming batch fails.
    """
    conn = None

    try:
        # Single Spark action: collecting first and checking emptiness on the
        # result avoids computing the batch twice (count + toPandas).
        pdf = batch_df.select(
            "meter_id", "timestamp", "kwh_usage", "voltage",
            "customer_id", "region", "hour_of_day", "cost",
            "is_peak", "is_weekend", "processing_time",
            "date", "data_quality_flag", "source_system"
        ).toPandas()

        if pdf.empty:
            print(f"Skipping empty batch {batch_id} for PostgreSQL")
            return

        pdf['timestamp'] = pdf['timestamp'].apply(lambda x: x.to_pydatetime())
        pdf['processing_time'] = pdf['processing_time'].apply(lambda x: x.to_pydatetime())

        # NOTE(review): connection parameters, including the password, are
        # hard-coded here.
        conn = psycopg2.connect(
            host="postgres",
            database="postgres",
            user="postgres",
            password="postgres",
            port="5432"
        )

        with conn.cursor() as cur:
            # Column names are composed as SQL identifiers and values are bound
            # through placeholders rather than string formatting.
            columns = pdf.columns.tolist()
            insert_stmt = sql.SQL("""
                INSERT INTO fact_smart_meter_readings ({})
                VALUES ({})
            """).format(
                sql.SQL(', ').join(map(sql.Identifier, columns)),
                sql.SQL(', ').join([sql.Placeholder()] * len(columns)))

            # Batch insert in chunks
            batch_size = 1000
            for i in range(0, len(pdf), batch_size):
                chunk = pdf.iloc[i:i+batch_size]
                args = [tuple(x) for x in chunk.to_numpy()]
                cur.executemany(insert_stmt, args)

        # One commit per micro-batch: either every chunk lands or none does.
        conn.commit()
        print(f"Inserted {len(pdf)} records to PostgreSQL (Batch {batch_id})")

    except Exception as e:
        print(f"Error writing to PostgreSQL (Batch {batch_id}): {str(e)}")
        if conn:
            conn.rollback()
        raise  # Re-raise to fail the batch

    finally:
        if conn:
            conn.close()

In [18]:
## 4. Load: Write to Minio/S3

# Function to write batch to MinIO
def write_to_minio(batch_df: DataFrame, batch_id: Any) -> None:
    """Write one micro-batch as Parquet under ``s3a://default/smart_meter/raw/<batch_id>/``.

    Empty batches are skipped. The batch is persisted for the duration of the
    call because it is evaluated twice (the emptiness count and the write).

    Args:
        batch_df: The micro-batch DataFrame passed in by ``foreachBatch``.
        batch_id: The micro-batch identifier; used in the output path and logs.

    Raises:
        Exception: Any error from counting or writing is printed and then
            re-raised so the streaming batch fails instead of being recorded
            as successfully archived.
    """
    batch_df.persist()
    try:
        if batch_df.count() == 0:
            print(f"Skipping empty batch {batch_id} for MinIO")
            return

        output_path = f"s3a://default/smart_meter/raw/{batch_id}/"

        (batch_df.write
         .mode("append")
         .option("pathStyleAccess", "true")
         .option("fs.s3a.multipart.size", "104857600")  # 100MB part size
         .option("fs.s3a.fast.upload", "true")
         .option("fs.s3a.connection.ssl.enabled", "false")
         .parquet(output_path))

        print(f"Successfully wrote batch {batch_id} to MinIO")

    except Exception as e:
        print(f"Error writing batch {batch_id} to MinIO: {str(e)}")
        raise  # Re-raise to fail the batch, matching write_to_postgres

    finally:
        batch_df.unpersist()

In [20]:
## 5. Execute the Streaming Pipeline

def run_streaming():
    """Start the PostgreSQL and MinIO streaming writers and monitor them.

    Both queries read from ``enhanced_df`` and use their own checkpoint
    directory under ``/tmp/checkpoints``. Every 5 seconds the status message of
    each query is printed, along with any exception a query has recorded. The
    loop ends when neither query is active any more, on KeyboardInterrupt, or
    on an unexpected error; in every case any still-active query is stopped.
    """
    # Bound up front so the cleanup in `finally` works even if start() raises
    # before either query has been created.
    pg_query = None
    minio_query = None

    try:
        print("Starting streaming queries...")

        # Start PostgreSQL writer
        pg_query = (enhanced_df.writeStream
            .foreachBatch(write_to_postgres)
            .option("checkpointLocation", "/tmp/checkpoints/postgres")
            .start())

        # Start MinIO writer
        minio_query = (enhanced_df.writeStream
            .foreachBatch(write_to_minio)
            .option("checkpointLocation", "/tmp/checkpoints/minio")
            .start())

        # Handle each query separately
        while True:
            pg_status = pg_query.status
            minio_status = minio_query.status

            print(f"\nPostgreSQL Status: {pg_status['message']}")
            print(f"MinIO Status: {minio_status['message']}")

            # Check for errors without immediately failing
            if pg_ex := pg_query.exception():
                print(f"PostgreSQL query error: {str(pg_ex)}")
            if minio_ex := minio_query.exception():
                print(f"MinIO query error: {str(minio_ex)}")

            # Nothing left to monitor once both queries have terminated;
            # without this the loop would print the same status forever.
            if not (pg_query.isActive or minio_query.isActive):
                print("\nAll streaming queries have terminated")
                break

            sleep(5)

    except KeyboardInterrupt:
        print("\nUser requested shutdown...")
    except Exception as e:
        print(f"\nCRITICAL ERROR: {str(e)}", file=sys.stderr)
    finally:
        print("\nShutting down streams...")
        for name, q in [("PostgreSQL", pg_query), ("MinIO", minio_query)]:
            # `q` is None when the query was never started.
            if q and q.isActive:
                print(f"Stopping {name} query...")
                try:
                    q.stop()
                except Exception as e:
                    print(f"Error stopping {name} query: {str(e)}")
        print("All streams stopped")

In [21]:
## 6. Run ETL
# Starts both streaming writers and blocks while monitoring them; interrupt
# the kernel to shut the streams down.
run_streaming()

Starting streaming queries...

PostgreSQL Status: Initializing sources
MinIO Status: Initializing sources
Skipping empty batch 0 for MinIO

PostgreSQL Status: Processing new data
MinIO Status: Processing new data
Error writing to PostgreSQL (Batch 1): insert or update on table "fact_smart_meter_readings" violates foreign key constraint "fk_customer"
DETAIL:  Key (customer_id)=(CUST_8357) is not present in table "dim_customer".


PostgreSQL Status: Terminated with exception: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 117, in call
    self.func(DataFrame(jdf, wrappe