In [None]:
import os
# Set before the pyspark imports below so the flag is already in the process
# environment when PySpark starts.
# NOTE(review): presumably this permits an unauthenticated Py4J gateway on
# older Spark releases — confirm it is still required for the Spark version in use.
os.environ["PYSPARK_ALLOW_INSECURE_GATEWAY"] = "1"


from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType


# --- PostgreSQL connection settings -----------------------------------------
# Not referenced by the visible part of this script (the sink below is Parquet).
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment if this ever runs outside a lab setup.
DB_URL: str = "jdbc:postgresql://postgres:5432/postgres"
DB_USER: str = "myuser"
DB_PASS: str = "myuserpass"
DB_TABLE: str = "kafka_data_03"

# --- Filesystem locations ---------------------------------------------------
# Directory intended for rejected rows.
BAD_ROWS_PATH: str = "/tmp/bad_rows_03"
# Streaming checkpoint directory (offsets and sink commit state).
CHECKPOINT_PATH: str = "/tmp/checkpoints/kafka-to-pgsql_03"
# Output directory of the Parquet sink.
SINK_PATH: str = "/home/jovyan/sink_03/parquet"


# SparkSession used for the Kafka stream: runs in local mode on all available
# cores under the application name "StreamingToParquet".
# NOTE(review): no Kafka connector package is configured here; presumably it
# is supplied by the environment (spark-defaults / --packages) — confirm.
_session_builder = SparkSession.builder.master("local[*]").appName("StreamingToParquet")
spark = _session_builder.getOrCreate()

# Schema of the JSON payload: integer `id`, string `name`, string `message`.
# StructType.add() appends the field in place, so the fields are declared
# one statement at a time.
schema = StructType()
schema.add("id", IntegerType())
schema.add("name", StringType())
schema.add("message", StringType())

# Read the Kafka topic "spark-lab3-topic" from the broker at kafka:9092 as a
# streaming DataFrame, starting from the latest offsets.
#
# The Kafka source only hands over the raw record columns; it performs no
# payload validation and does not recognize file-source style options such as
# "badRecordsPath" (unknown options are silently ignored). Malformed JSON
# therefore has to be detected after parsing, not configured here.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "spark-lab3-topic")
    .option("startingOffsets", "latest")
    .load()
)

# Parse the JSON payload.
# The Kafka `value` column is cast to a string, decoded against `schema` into
# a struct named `data`, and the struct's fields are then promoted to
# top-level columns.
# NOTE(review): from_json presumably yields nulls (rather than failing) for
# payloads that do not match the schema — confirm how such rows should be handled.
_json_payload = from_json(col("value"), schema).alias("data")
parsed = df.selectExpr("CAST(value AS STRING)").select(_json_payload).select("data.*")

# Write the parsed rows to Parquet files under SINK_PATH in append mode,
# tracking progress in CHECKPOINT_PATH. Sink format, output mode and options
# are passed straight to start() instead of being chained on the writer.
query = parsed.writeStream.start(
    path=SINK_PATH,
    format="parquet",
    outputMode="append",
    checkpointLocation=CHECKPOINT_PATH,
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()
