In [1]:
from pyspark.sql import SparkSession
import os

from dotenv import load_dotenv

# Deployment environment; selects which dotenv file is loaded.
ENVIRONMENT = "edge"

# Load ./<ENVIRONMENT>.env into os.environ (the MinIO settings are read from
# there with os.getenv when the Spark session is configured).
load_dotenv(f"./{ENVIRONMENT}.env")

# Spark's own connector artifacts should all target the same Spark release.
# The Kafka connector was already pinned to 3.5.4, so spark-avro (previously
# 3.5.1) is aligned with it through one shared constant.
# NOTE(review): assumes the installed pyspark is 3.5.4 -- confirm with
# pyspark.__version__ and adjust SPARK_VERSION if it differs.
SPARK_VERSION = "3.5.4"
SCALA_VERSION = "2.12"

# Maven coordinates resolved by Ivy when the PySpark JVM is launched.
SPARK_PACKAGES = [
    "org.apache.hadoop:hadoop-aws:3.3.4",
    f"io.delta:delta-spark_{SCALA_VERSION}:3.3.0",
    f"org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION}",
    "org.apache.kafka:kafka-clients:3.9.0",
    f"org.apache.spark:spark-avro_{SCALA_VERSION}:{SPARK_VERSION}",
]

# Set before the session is built so the packages are on the classpath.
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {','.join(SPARK_PACKAGES)} pyspark-shell"

# Fail fast when the MinIO settings are absent (e.g. the dotenv file was not
# found), instead of handing empty values to the S3A connector.
_REQUIRED_ENV_VARS = ("MINIO_ENDPOINT", "MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        f"Missing environment variables {_missing_env_vars}; "
        f"check ./{ENVIRONMENT}.env"
    )

# Initialize Spark session with Delta Lake and MinIO support
spark = (
    SparkSession.builder
    .appName("DeltaLakeWithMinIO")
    ## Delta SQL extensions
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    ## Delta catalog (replaces the default session catalog)
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    ## Optimize Delta
    # Session-level switches. The "delta.autoOptimize.*" names are table
    # properties and are not picked up when passed as session configs.
    .config("spark.databricks.delta.optimizeWrite.enabled", "true")
    .config("spark.databricks.delta.autoCompact.enabled", "true")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    ## MinIO (S3-compatible endpoint and credentials come from the dotenv file)
    .config("spark.hadoop.fs.s3a.endpoint", os.getenv("MINIO_ENDPOINT"))
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("MINIO_ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("MINIO_SECRET_KEY"))
    .config("spark.hadoop.fs.s3a.attempts.maximum", "3")
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Route the s3a://, s3:// and s3n:// schemes through the S3A filesystem.
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3n.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
25/04/27 16:20:13 WARN Utils: Your hostname, CPC-12806 resolves to a loopback address: 127.0.1.1; using 172.26.242.248 instead (on interface eth0)
25/04/27 16:20:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/arthur/streaming-pipeline/src/.venv/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/arthur/.ivy2/cache
The jars for the packages stored in: /home/arthur/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
io.delta#delta-spark_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bd24bee8-ddb0-4873-9b63-175a38109588;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.4 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.4 in central
	found org.apache.ha

In [2]:
# Show the column names and types of the raw control-power Delta table.
table_description = spark.sql("DESCRIBE delta.`s3a://lakehouse/delta/raw_control_power-avro/`")
table_description.show()

25/04/27 16:20:20 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+-----------------+--------------------+-------+
|         col_name|           data_type|comment|
+-----------------+--------------------+-------+
|            topic|              string|   NULL|
|        timestamp|           timestamp|   NULL|
|landing_timestamp|           timestamp|   NULL|
|     parsed_value|struct<robot_acti...|   NULL|
+-----------------+--------------------+-------+



In [3]:
# Preview five rows, with the fields of the parsed_value struct expanded into
# top-level columns next to the two timestamps.
preview_query = "SELECT timestamp, landing_timestamp, parsed_value.* FROM delta.`s3a://lakehouse/delta/raw_control_power-avro`;"
spark.sql(preview_query).show(5)

25/04/27 16:20:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------------+--------------------+---------------+--------------+--------+---------+-----------+---------+------------+--------------+---------+--------------------+
|           timestamp|   landing_timestamp|robot_action_id|apparent_power| current|frequency|phase_angle|    power|power_factor|reactive_power|  voltage|    source_timestamp|
+--------------------+--------------------+---------------+--------------+--------+---------+-----------+---------+------------+--------------+---------+--------------------+
|2025-04-27 15:44:...|2025-04-27 15:45:...|           15.0|     257.88306|1.271197|49.951218|  335.52814| 234.7194|    0.910178|   -106.810036|224.69905|2025-04-27T18:44:...|
|2025-04-27 15:44:...|2025-04-27 15:45:...|           15.0|     257.88306|1.271197|49.951218|  335.52814| 234.7194|    0.910178|   -106.810036|224.69905|2025-04-27T18:44:...|
|2025-04-27 15:44:...|2025-04-27 15:45:...|           15.0|     201.71207|1.156928|49.951218|  335.52814|173.89613|    0.8621

In [4]:
# Collect the full query result into a pandas DataFrame for the export step.
export_query = (
    "SELECT timestamp, landing_timestamp, parsed_value.* "
    "FROM delta.`s3a://lakehouse/delta/raw_control_power-avro`;"
)
df_datalake = spark.sql(export_query).toPandas()

                                                                                

In [5]:
from datetime import datetime

# Name the export after the current local time and the active environment,
# e.g. "2025-04-27-16_22_05-avro-edge.parquet".
export_stamp = datetime.now().strftime("%Y-%m-%d-%H_%M_%S")
file_name = export_stamp + f"-avro-{ENVIRONMENT}.parquet"
print(file_name)

# Write the pandas frame to the working directory without the index column.
df_datalake.to_parquet(file_name, index=False)

2025-04-27-16_22_05-avro-edge.parquet
