In [None]:
import os
from pyspark.sql import SparkSession

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-spark_2.12:3.3.0 pyspark-shell"

# Initialize Spark session with Delta Lake and MinIO support
spark = SparkSession.builder \
    .appName("DeltaLakeWithMinIO") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("delta.autoOptimize.optimizeWrite", "true") \
    .config("delta.autoOptimize.autoCompact", "true") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("spark.hadoop.fs.s3a.endpoint", os.getenv("MINIO_ENDPOINT")) \
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("MINIO_ACCESS_KEY") \
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("MINIO_SECRET_KEY") \
    .config('spark.hadoop.fs.s3a.attempts.maximum', "3") \
    .config('spark.hadoop.fs.s3a.connection.timeout', "10000") \
    .config('spark.hadoop.fs.s3a.connection.establish.timeout', "5000") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3n.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

print("Spark session configured with Delta Lake and MinIO!")

your 131072x1 screen size is bogus. expect trouble
25/01/24 15:53:44 WARN Utils: Your hostname, CPC-12806 resolves to a loopback address: 127.0.1.1; using 172.26.242.248 instead (on interface eth0)
25/01/24 15:53:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/arthur/streaming-pipeline/.venv/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/arthur/.ivy2/cache
The jars for the packages stored in: /home/arthur/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-172de65f-9bcd-4158-957b-c2c9ef76d088;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 315ms :: artifacts dl 16ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	io.delta#delta-spark_2.12;3.3.0 from central in [default]
	io.delta#delta-storage;3.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	org.apache.hado

Spark session configured with Delta Lake and MinIO!


In [23]:
spark.sql("SELECT COUNT(*) FROM delta.`s3a://lakehouse/delta/processed_iot_data`;").show(5)

+--------+
|count(1)|
+--------+
|    5616|
+--------+



In [24]:
spark.sql("SELECT * FROM delta.`s3a://lakehouse/delta/processed_iot_data`;").show(5)

+--------------------+-----------+------------+---------------+--------------------+
|           timestamp|temperature|_change_type|_commit_version|   _commit_timestamp|
+--------------------+-----------+------------+---------------+--------------------+
|2025-01-16T17:00:...|       23.8|      insert|            133|2025-01-16 14:00:...|
|2025-01-16T16:43:...|      25.29|      insert|             59|2025-01-16 13:43:...|
|2025-01-16T16:49:...|      22.61|      insert|            110|2025-01-16 13:49:...|
|2025-01-16T16:40:...|      21.82|      insert|             33|2025-01-16 13:40:...|
|2025-01-16T16:44:...|      29.38|      insert|             68|2025-01-16 13:43:...|
+--------------------+-----------+------------+---------------+--------------------+
only showing top 5 rows



In [15]:
spark.sql("DESCRIBE delta.`s3a://lakehouse/delta/processed_iot_data`;").show()

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|        timestamp|   string|   NULL|
|      temperature|   double|   NULL|
|     _change_type|   string|   NULL|
|  _commit_version|   bigint|   NULL|
|_commit_timestamp|timestamp|   NULL|
+-----------------+---------+-------+



In [22]:
spark.sql("DESCRIBE DETAIL delta.`s3a://lakehouse/delta/processed_iot_data`;").show(truncate=False)



+------+------------------------------------+----+-----------+----------------------------------------+-----------------------+-------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+------------------------+
|format|id                                  |name|description|location                                |createdAt              |lastModified       |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|tableFeatures           |
+------+------------------------------------+----+-----------+----------------------------------------+-----------------------+-------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+------------------------+
|delta |ad3a479d-ade0-4e6b-94bb-05816c71c18c|NULL|NULL       |s3a://lakehouse/delta/processed_iot_data|2025-01-16 13:37:09.164|2025-01-16 14:00:21|[]              |[]            

                                                                                

In [5]:
df = spark.read.format("delta").load("s3a://lakehouse/delta/processed_iot_data")
df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- _change_type: string (nullable = true)
 |-- _commit_version: long (nullable = true)
 |-- _commit_timestamp: timestamp (nullable = true)



In [21]:
# Stream the changes using CDF
streaming_df = spark.readStream \
    .format("delta") \
    .option("readChangeFeed", "true") \
    .option("startingVersion", "latest") \
    .load("s3a://lakehouse/delta/processed_iot_data")

# Filter to keep only 'timestamp' and 'temperature' columns
filtered_streaming_df = streaming_df.select("timestamp", "temperature") \
    .withColumnRenamed("timestamp", "event_time") \
    .withColumnRenamed("temperature", "sensor_temp")

# Process and display the streaming changes
query = filtered_streaming_df.writeStream \
    .format("console") \
    .option("checkpointLocation", "s3a://lakehouse/delta/checkpoints/test") \
    .start()

# Monitor the query progress
import time

while query.isActive:
    print(query.lastProgress)  # Shows the latest progress info
    time.sleep(5)  # Updates every 5 seconds

25/01/16 14:01:32 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


None
{'id': 'bac56a91-3658-421d-8974-17c72e58191b', 'runId': 'e7a365e3-2592-4fee-a7a6-807934ad2aa2', 'name': None, 'timestamp': '2025-01-16T17:01:32.991Z', 'batchId': 0, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0, 'durationMs': {'latestOffset': 352, 'triggerExecution': 590}, 'stateOperators': [], 'sources': [{'description': 'DeltaSource[s3a://lakehouse/delta/processed_iot_data]', 'startOffset': None, 'endOffset': None, 'latestOffset': None, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0}], 'sink': {'description': 'org.apache.spark.sql.execution.streaming.ConsoleTable$@44ebbfa0', 'numOutputRows': -1}}
{'id': 'bac56a91-3658-421d-8974-17c72e58191b', 'runId': 'e7a365e3-2592-4fee-a7a6-807934ad2aa2', 'name': None, 'timestamp': '2025-01-16T17:01:32.991Z', 'batchId': 0, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0, 'durationMs': {'latestOffset': 352, 'triggerExecution': 590}, 'stateOperators': [], 'sou

KeyboardInterrupt: 