In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType, TimestampType


In [2]:
# Pass the Kafka connector jars to the PySpark launcher. The value is read
# from the environment when the JVM is started, so this cell has to run
# before the SparkSession is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars '
    '/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,'
    '/opt/spark/jars/kafka-clients-3.2.1.jar '
    'pyspark-shell'
)


In [3]:
# Obtain the SparkSession for this notebook (reuses an already-running
# session if one exists, otherwise starts a new one).
spark = (
    SparkSession.builder
    .appName("KafkaSparkStreaming")
    .getOrCreate()
)


In [4]:
# Schema applied to the JSON payload of each Kafka message:
# three string fields plus one timestamp field.
schema = (
    StructType()
    .add("client_host", StringType())
    .add("http_method", StringType())
    .add("url", StringType())
    .add("event_time", TimestampType())
)


In [5]:
# Open a streaming read on the "ui-event-log" topic of the broker at
# kafka:9092, starting from the earliest available offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "ui-event-log")
    .option("startingOffsets", "earliest")
    .load()
)


In [6]:
# Decode the Kafka `value` column as a string, parse it as JSON against
# `schema`, and flatten the resulting struct into top-level columns.
# Per the from_json documentation, unparseable input yields null.
parsed_df = (
    df
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
)

# Example transformation: add a `processed_time` column.
# NOTE(review): this is a plain copy of `event_time`, not the wall-clock
# time at which the row was processed -- confirm that this is intended.
transformed_df = parsed_df.withColumn("processed_time", col("event_time"))


In [7]:
# Stream the transformed rows to the console sink in append mode.
query = (
    transformed_df.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# Block this cell while the stream is running. If the wait ends for any
# reason (kernel interrupt, streaming failure), stop the query explicitly;
# otherwise it keeps running in the background and re-running this cell
# would start a second, duplicate query.
try:
    query.awaitTermination()
finally:
    query.stop()
