In [1]:
# Create the Spark Session
import logging

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, regexp_replace, expr
from pyspark.sql.types import StringType, StructField, StructType
# Build (or reuse) the local SparkSession shared by every cell below.
# The Kafka source is not bundled with Spark: it is fetched through
# spark.jars.packages, and the connector release has to match the Spark
# runtime. Deriving the version from the installed PySpark keeps the two in
# lock-step instead of pinning 3.3.0 by hand.
# NOTE(review): the _2.12 Scala suffix is still fixed — confirm it matches the
# Spark build in use.
KAFKA_CONNECTOR_PACKAGE = (
    f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"
)

spark = (
    SparkSession
    .builder
    .appName("Streaming from Kafka into S3 bucket")
    .config("spark.jars.packages", KAFKA_CONNECTOR_PACKAGE)
    .master("local[*]")  # run locally, using every available core
    .getOrCreate()
)

# Last expression of the cell: render the session summary.
spark

In [2]:
# Streaming source: subscribe to the "shopify.items" topic on the
# kafka3:29094 broker, starting from the earliest offset.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": "kafka3:29094",
            "subscribe": "shopify.items",
            "startingOffsets": "earliest",
        }
    )
    .load()
)

In [3]:
# Print the columns exposed by the raw Kafka source (key and value are binary
# at this point, alongside topic/partition/offset/timestamp metadata).
kafka_df.printSchema()
# kafka_df.show() stays disabled: kafka_df is built with readStream, and a
# streaming DataFrame cannot be displayed with .show() — rows have to be
# inspected through a writeStream sink instead.

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [4]:
# Decode the binary key/value columns to strings, then delete every backslash
# from both. The Python literal "\\\\" is the two-character regex \\, which
# matches one literal backslash.
# NOTE(review): this removes *all* backslashes, including ones that are
# legitimate JSON escapes inside field values (e.g. an escaped quote in a
# description) — confirm that cannot occur in this topic.
kafka_df = (
    kafka_df
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .withColumn("key", regexp_replace(col("key"), "\\\\", ""))
    .withColumn("value", regexp_replace(col("value"), "\\\\", ""))
)

In [5]:
# Confirm that only key and value remain and that both are now strings.
kafka_df.printSchema()
# kafka_df.show(truncate=False) stays disabled: kafka_df is a streaming
# DataFrame (built with readStream), which cannot be displayed with .show().

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [6]:
# Carve the payload text out of the envelope held in `value` and store it in a
# new "payload" column.
# substring() starts 10 characters past the start of the first 'payload' match,
# and its length is chosen so that the last two characters of `value` are dropped.
# NOTE(review): presumably the envelope is {"schema":...,"payload":"{...}"}, so
# the slice skips `payload":"` and the trailing `"}` — confirm against a real
# message. The offsets break if 'payload' also occurs earlier in `value`.
value_payload_df = kafka_df.withColumn(
    "payload",
    expr("substring(value, instr(value, 'payload') + 10, length(value) - instr(value, 'payload') - 11)"),
)

In [7]:
# Schema used to parse the payload JSON: a nested `_id` struct holding the
# `$oid` string, followed by flat, nullable string fields.
payload_schema = StructType(
    [StructField("_id", StructType([StructField("$oid", StringType(), True)]), True)]
    + [
        StructField(field_name, StringType(), True)
        for field_name in (
            "name",
            "price",
            "category",
            "instock",
            "tags",
            "description",
            "filename",
        )
    ]
)

In [8]:
# Decode the "payload" text (referenced by column name) with payload_schema and
# keep the result as a struct column named "data".
parsed_df = value_payload_df.withColumn(
    "data",
    from_json("payload", payload_schema),
)

In [9]:
# Check that the new "data" struct carries the fields declared in payload_schema.
parsed_df.printSchema()
# parsed_df.show(truncate=False) stays disabled: parsed_df derives from a
# readStream source, and a streaming DataFrame cannot be displayed with .show().

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)
 |-- payload: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- _id: struct (nullable = true)
 |    |    |-- $oid: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- price: string (nullable = true)
 |    |-- category: string (nullable = true)
 |    |-- instock: string (nullable = true)
 |    |-- tags: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- filename: string (nullable = true)



In [10]:
# Flatten the parsed struct: expose the nested ObjectId string as `_id` and
# lift each remaining field of `data` to a top-level column of the same name.
final_df = parsed_df.select(
    col("data._id.$oid").alias("_id"),
    *[
        col(f"data.{field_name}").alias(field_name)
        for field_name in (
            "name",
            "price",
            "category",
            "instock",
            "tags",
            "description",
            "filename",
        )
    ],
)

In [11]:
# Verify the flattened layout: eight top-level string columns.
final_df.printSchema()
# final_df.show(truncate=False) stays disabled: final_df derives from a
# readStream source, and a streaming DataFrame cannot be displayed with .show().

root
 |-- _id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- price: string (nullable = true)
 |-- category: string (nullable = true)
 |-- instock: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- description: string (nullable = true)
 |-- filename: string (nullable = true)



In [12]:
# Start the streaming query: rows of final_df are written to the console sink
# in append mode. `query` is the handle used to wait on the stream.
query = (
    final_df.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

In [13]:
# Block this cell until the streaming query terminates; a failure inside the
# query is re-raised here as StreamingQueryException.
query.awaitTermination()

StreamingQueryException: Query [id = f65298c6-d89d-458c-bf30-fce2dee155f6, runId = 1c7b89b4-5bb8-4719-a970-682702c7298d] terminated with exception: Failed to construct kafka consumer