In [None]:
import os
os.environ["PYSPARK_ALLOW_INSECURE_GATEWAY"] = "1"


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, IntegerType, StringType


# Connection and path settings. Each value can be overridden with an
# environment variable of the same name; the literals are the defaults, so
# behaviour is unchanged when nothing is set. Overriding DB_USER / DB_PASS
# this way keeps real credentials out of the source.
DB_URL = os.environ.get("DB_URL", "jdbc:postgresql://postgres:5432/postgres")
DB_USER = os.environ.get("DB_USER", "myuser")
DB_PASS = os.environ.get("DB_PASS", "myuserpass")
DB_TABLE = os.environ.get("DB_TABLE", "kafka_data_01")
BAD_ROWS_PATH = os.environ.get("BAD_ROWS_PATH", "/tmp/bad_rows_01")
CHECKPOINT_PATH = os.environ.get(
    "CHECKPOINT_PATH", "/tmp/checkpoints/kafka-to-pgsql"
)


# Spark session for this job, run with the local[*] master.
session_builder = SparkSession.builder.appName("StructuredStreamingKafka")
session_builder = session_builder.master("local[*]")
spark = session_builder.getOrCreate()

# Schema of the JSON payload: an integer "id" and a string "value".
# StructType.add appends the field in place, so the calls can be made as
# separate statements.
schema = StructType()
schema.add("id", IntegerType())
schema.add("value", StringType())

# Streaming source: subscribe to the Kafka topic, starting from the latest
# offsets.
# NOTE(review): "badRecordsPath" appears to be documented for file-based
# sources only; the Kafka source presumably ignores it, which would leave
# BAD_ROWS_PATH without effect -- confirm before relying on it.
kafka_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "spark-lab1-topic",
    "startingOffsets": "latest",
    "badRecordsPath": BAD_ROWS_PATH,
}
df = spark.readStream.format("kafka").options(**kafka_options).load()

# Parse the payload: cast the Kafka "value" column to a string, decode it as
# JSON using `schema`, then flatten the resulting struct into top-level
# columns.
# NOTE(review): messages that are not valid JSON presumably come through as
# rows of nulls rather than being rejected -- confirm this is acceptable.
raw_json_df = df.selectExpr("CAST(value AS STRING)")
decoded_df = raw_json_df.select(from_json(col("value"), schema).alias("data"))
parsed_df = decoded_df.select("data.*")

# Writing to PostgreSQL
def write_to_postgres(batch_df, batch_id):
    """Append one micro-batch to the PostgreSQL table over JDBC.

    Used as the ``foreachBatch`` callback of the streaming query.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; part of the callback
            signature but not used here.
    """
    jdbc_options = {
        "url": DB_URL,
        "dbtable": DB_TABLE,
        "user": DB_USER,
        "password": DB_PASS,
        "driver": "org.postgresql.Driver",
    }

    writer = batch_df.write.format("jdbc")
    for option_name, option_value in jdbc_options.items():
        writer = writer.option(option_name, option_value)

    # "append" adds rows to the existing table instead of replacing it.
    writer.mode("append").save()

# Start the streaming query: every micro-batch of parsed rows is handed to
# write_to_postgres, and progress is tracked under CHECKPOINT_PATH.
query = (
    parsed_df.writeStream
    .foreachBatch(write_to_postgres)
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_PATH)
    .start()
)

# Block until the query terminates. If the wait is cut short (for example by
# a KeyboardInterrupt when the notebook kernel is interrupted) the query would
# otherwise keep running in the background and hold on to the checkpoint, so
# it is always stopped explicitly on the way out.
try:
    query.awaitTermination()
finally:
    query.stop()
