In [9]:
import os
from pyspark.sql import SparkSession
from dotenv import load_dotenv

# Load environment variables (MinIO endpoint and credentials) from the .env file
load_dotenv()

# Fail fast when a required setting is absent. os.getenv() returns None for a
# missing variable, and that None would otherwise be passed straight into the
# session config, so the mistake would only show up later on S3A access.
required_env_vars = ("MINIO_ENDPOINT", "MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in the environment or in the .env file."
    )

# Extra JVM packages (S3A connector + Delta Lake). This has to be in place
# before the first SparkSession of the kernel is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-spark_2.12:3.3.0 pyspark-shell"

# Initialize Spark session with Delta Lake and MinIO support
spark = (
    SparkSession.builder
    .appName("DeltaLakeWithMinIO")
    # Delta Lake SQL extension and catalog
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Session-level switches for optimized writes / auto compaction.
    # The `delta.autoOptimize.*` names are table properties, not session configs.
    .config("spark.databricks.delta.optimizeWrite.enabled", "true")
    .config("spark.databricks.delta.autoCompact.enabled", "true")
    # NOTE(review): single-driver log store; presumably not safe for concurrent
    # writers from several drivers - confirm before adding more writers.
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    # MinIO connection details (validated above)
    .config("spark.hadoop.fs.s3a.endpoint", os.getenv("MINIO_ENDPOINT"))
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("MINIO_ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("MINIO_SECRET_KEY"))
    # Retry and timeout limits for S3A connections
    .config('spark.hadoop.fs.s3a.attempts.maximum', "3")
    .config('spark.hadoop.fs.s3a.connection.timeout', "10000")
    .config('spark.hadoop.fs.s3a.connection.establish.timeout', "5000")
    # Path-style bucket addressing
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Route the s3a://, s3:// and s3n:// schemes through the S3A filesystem
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3n.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

print("Spark session configured with Delta Lake and MinIO!")

Spark session configured with Delta Lake and MinIO!


In [None]:
# Location of the raw IoT Delta table
raw_delta_path = "s3a://lakehouse/delta/raw_iot_data"

# Build a streaming reader over the table's Change Data Feed (CDF),
# starting from table version 0, then open the stream on the raw table.
cdf_reader = spark.readStream.format("delta")
cdf_reader = cdf_reader.option("readChangeFeed", "true")
cdf_reader = cdf_reader.option("startingVersion", 0)
cdf_stream = cdf_reader.load(raw_delta_path)


In [None]:
import time

# Path for the processed Delta table and for the stream's checkpoint
processed_delta_path = "s3a://lakehouse/delta/processed_iot_data"
processed_checkpoint_path = "s3a://lakehouse/delta/checkpoints/processed_iot_data"

# Write changes to the processed Delta table.
# NOTE(review): rows from the change feed are appended as-is. If the feed
# carries update/delete records (and Delta's change metadata columns), they
# land in the target as extra rows rather than being applied - confirm this is
# intended, otherwise switch to foreachBatch + MERGE.
processed_stream_query = (
    cdf_stream.writeStream
    .format("delta")
    .option("path", processed_delta_path)
    .option("checkpointLocation", processed_checkpoint_path)
    .outputMode("append")
    .option("mergeSchema", "true")
    .start()
)

print("Propagating changes from raw_iot_data to processed_iot_data...")

# Monitor the query progress. Interrupting the kernel ends the loop, and the
# `finally` clause makes sure the query does not keep running in the
# background (which would block a re-run against the same checkpoint).
try:
    while processed_stream_query.isActive:
        print(processed_stream_query.lastProgress)  # Shows the latest progress info (None until the first update)
        time.sleep(5)  # Updates every 5 seconds
except KeyboardInterrupt:
    print("Interrupted - stopping the streaming query...")
finally:
    processed_stream_query.stop()

# If the query ended because of an error, surface it instead of finishing silently.
stream_failure = processed_stream_query.exception()
if stream_failure is not None:
    raise stream_failure

In [1]:
# Preview the processed Delta table by path.
# Requires the `spark` session from the setup cell to exist in this kernel.
processed_table_query = "SELECT * FROM delta.`s3a://lakehouse/delta/processed_iot_data`;"
spark.sql(processed_table_query).show()

NameError: name 'spark' is not defined