In [None]:
import os
# Exported into the process environment ahead of the PySpark imports below.
# NOTE(review): presumably a Spark 2.x-era switch that permits an
# unauthenticated Py4J gateway -- confirm it is still needed (or honoured)
# by the Spark version in use.
os.environ.update({"PYSPARK_ALLOW_INSECURE_GATEWAY": "1"})


from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType


# --- Filesystem locations ---------------------------------------------------
# Directory of the Delta table that the streaming query writes to.
SINK_PATH = "/home/jovyan/sink_06/delta"
# Checkpoint directory handed to the streaming query.
CHECKPOINT_PATH = "/tmp/checkpoints/kafka-to-pgsql_06"
# Presumably the directory meant to collect rejected input rows.
BAD_ROWS_PATH = "/tmp/bad_rows_06"

# --- PostgreSQL (JDBC) settings ---------------------------------------------
# NOTE(review): none of the DB_* values are referenced in the visible part of
# this script, and the credentials are hard-coded in source -- consider
# loading them from the environment if they are still needed.
DB_TABLE = "kafka_data_06"
DB_URL = "jdbc:postgresql://postgres:5432/postgres"
DB_USER = "myuser"
DB_PASS = "myuserpass"


# Local Delta Lake jar files handed to Spark through `spark.jars`
# (comma-separated, as that setting expects).
_delta_jars = ",".join(
    [
        "/usr/local/spark/jars/delta-core_2.12-2.4.0.jar",
        "/usr/local/spark/jars/delta-storage-2.4.0.jar",
    ]
)

# Session settings, applied to the builder in this order: the jars above plus
# the Delta SQL extension and the Delta-aware session catalog.
_session_conf = {
    "spark.jars": _delta_jars,
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Local-mode session builder using every available core.
builder = SparkSession.builder.appName("DeltaSession").master("local[*]")
for _conf_key, _conf_value in _session_conf.items():
    builder = builder.config(_conf_key, _conf_value)

# Let the delta helper finish configuring the builder, then create (or reuse)
# the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Schema of the JSON payload: (field name, Spark type) pairs in column order.
_json_fields = (
    ("id", IntegerType()),
    ("name", StringType()),
    ("message", StringType()),
)

schema = StructType()
for _field_name, _field_type in _json_fields:
    # StructType.add appends the field to `schema` in place.
    schema.add(_field_name, _field_type)

# Read the Kafka topic as a streaming DataFrame.
#
# The `badRecordsPath` option that used to be set here has been removed: it is
# a Databricks feature for file-based sources and is not an option of the
# Kafka source, so it was silently ignored and only suggested that malformed
# messages were being diverted to BAD_ROWS_PATH. Nothing in this reader
# rejects or redirects records; any bad-row handling has to happen after the
# payload is parsed.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "spark-lab6-topic")
    # Only consulted when the query starts without an existing checkpoint;
    # a resumed query continues from its checkpointed offsets.
    .option("startingOffsets", "latest")
    .load()
)

# Parse the JSON payload.
# Step 1: keep only the Kafka `value` column, cast to a string.
_payload_text = df.selectExpr("CAST(value AS STRING)")

# Step 2: parse that string against `schema` into a single struct column.
_payload_struct = _payload_text.select(
    from_json(col("value"), schema).alias("data")
)

# Step 3: flatten the struct so each schema field becomes a top-level column.
parsed = _payload_struct.select("data.*")

# Append the parsed rows to the Delta table at SINK_PATH (the sink format is
# Delta, not plain Parquet), tracking progress in CHECKPOINT_PATH.
query = (
    parsed.writeStream
    .format("delta")
    .outputMode("append")
    .option("path", SINK_PATH)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .start()
)

# Block until the stream terminates. If the wait is interrupted (for example
# by a kernel interrupt) or the query fails, stop the query explicitly so it
# does not keep running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()
