# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, length, upper
from pyspark.sql.types import StructType, StringType, IntegerType

# SparkSession with Kafka + JSON support.
# The Kafka source connector is requested through spark.jars.packages;
# NOTE(review): the artifact's Scala (2.12) / Spark (3.5.0) versions presumably
# have to match the installed PySpark - confirm.
spark = (
    SparkSession.builder
    .appName("StreamingJSONTransform")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        # Comma-joined so further Maven coordinates can be appended to the list.
        ",".join(["org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0"]),
    )
    .getOrCreate()
)

# Schema of the JSON payload: an integer "id" plus two string fields,
# "name" and "message".
json_schema = (
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
    .add("message", StringType())
)

# Streaming source: subscribe to a single Kafka topic, starting from the
# latest offsets.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka_streaming_lab:9092")
    .option("subscribe", "spark-lab2-topic")
    .option("startingOffsets", "latest")
    .load()
)

# Parse the JSON carried in the Kafka "value" column:
#   1. cast the value to a string column named json_str,
#   2. decode it with json_schema into a struct column named "data",
#   3. expand the struct so its fields become top-level columns.
parsed = (
    kafka_stream
    .selectExpr("CAST(value AS STRING) as json_str")
    .select(from_json(col("json_str"), json_schema).alias("data"))
    .select("data.*")
)

# Transform the data: add an upper-cased copy of "name" and the length of
# "message", then keep only rows whose message length is greater than 5.
transformed = (
    parsed
    .withColumn("name_upper", upper(col("name")))
    .withColumn("msg_length", length(col("message")))
    .filter(col("msg_length") > 5)
)

# Print the result to the console: append output mode, with column
# truncation disabled.
query = (
    transformed.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Block until the query terminates. The finally clause stops the query if the
# wait is interrupted (e.g. KeyboardInterrupt from Ctrl+C or a notebook
# "interrupt kernel"), so it does not keep running in the background; the
# original exception still propagates. The SparkSession is left running so it
# can be reused.
try:
    query.awaitTermination()
finally:
    query.stop()
