In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col, from_json
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
import sys

# Build (or reuse) the Spark session for this notebook.
# `spark.jars.packages` asks Spark to fetch the Kafka source connector at
# start-up; the coordinate below targets Scala 2.12 / Spark 3.4.1.
# NOTE(review): presumably this must match the installed Spark build — confirm.
spark = (
    SparkSession.builder
    .appName("KafkaStreamExample")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1",
    )
    .getOrCreate()
)

In [None]:
# Schema of the JSON payload carried in each Kafka record: a single nullable
# string field named `message_content` that holds the sentence to be counted.
json_schema = StructType(
    fields=[
        StructField("message_content", StringType(), nullable=True),
    ]
)

In [None]:
# Open a streaming source on the Kafka topic `test-topic`, reached through the
# broker at kafka:9093. Nothing is read until a streaming query is started.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("subscribe", "test-topic")
    .load()
)

In [None]:
# Kafka hands over the record payload as raw bytes in the `value` column.
# Keep only that column, cast to a string, so it can be parsed as JSON text.
kafka_stream_df = df.selectExpr(
    "CAST(value AS STRING)",
)

In [None]:
# Decode the JSON text in `value` according to `json_schema`; the result is a
# single struct column named `data`.
parsed_payload = from_json(col("value"), json_schema)
parsed_df = kafka_stream_df.select(parsed_payload.alias("data"))

In [None]:
# Pull the `message_content` field out of the parsed struct and expose it as a
# flat column called `message`.
message_column = col("data.message_content").alias("message")
message_df = parsed_df.select(message_column)

In [None]:
# Split each message on runs of whitespace and emit one row per token.
# The pattern is a raw string so that `\s` reaches Spark's regex engine intact
# instead of being parsed as an (invalid) Python escape sequence, which raises
# a DeprecationWarning / SyntaxWarning on recent Python versions.
# Messages with leading or trailing whitespace make split() produce empty
# tokens; drop them so that "" is not counted as a word downstream.
words_df = (
    message_df
    .select(explode(split(col("message"), r"\s+")).alias("word"))
    .where(col("word") != "")
)

In [None]:
# Running tally: one row per distinct word together with the number of times
# it has been seen (the aggregate column is named `count`).
word_counts_df = (
    words_df
    .groupBy("word")
    .count()
)

In [None]:
# Start the streaming query. "complete" output mode re-emits the whole
# word-count table on every trigger, and the console sink prints it with
# column truncation turned off. `query` is the handle to the running stream.
query = (
    word_counts_df.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

In [None]:
# Shut everything down. Running this cell ends the streaming word count, so
# execute it only once you have finished watching the console output.
# Any streaming queries that are still active are stopped explicitly first,
# so they are shut down deliberately before the session they run on goes away.
for active_query in spark.streams.active:
    active_query.stop()
spark.stop()