In [1]:
from pyspark.sql import SparkSession

# Maven coordinates of the Kafka source/sink connector for Structured Streaming.
# NOTE(review): built for Scala 2.11 / Spark 2.4.5 — confirm it matches the installed Spark.
KAFKA_CONNECTOR = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5"

# Create (or reuse) a local-mode session named 'kafka-streaming' with the
# connector on its classpath, then expose the underlying SparkContext.
spark = SparkSession.builder \
    .master('local') \
    .appName('kafka-streaming') \
    .config("spark.jars.packages", KAFKA_CONNECTOR) \
    .getOrCreate()
sc = spark.sparkContext

In [2]:
# Subscribe to the 'ingest-product' Kafka topic as a streaming DataFrame.
# failOnDataLoss is disabled so the query is not failed when data may have
# been lost (for example, offsets that are no longer available).
#
# Kafka delivers key and value as binary. The cast to STRING is chained onto
# the reader so that df_product itself carries the readable columns; the cast
# was previously evaluated on its own line and its result discarded, which
# left every later cell working with the raw binary key/value.
df_product = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "ingest-product")
    .option("failOnDataLoss", "false")
    .load()
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
)
# Display the resulting schema as the cell output.
df_product

DataFrame[key: string, value: string]

In [3]:
# Subscribe to the 'ingest-category' Kafka topic as a streaming DataFrame.
# failOnDataLoss is disabled so the query is not failed when data may have
# been lost (for example, offsets that are no longer available).
#
# Kafka delivers key and value as binary. The cast to STRING is chained onto
# the reader so that df_category itself carries the readable columns; the cast
# was previously evaluated on its own line and its result discarded, which
# left every later cell working with the raw binary key/value.
df_category = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "ingest-category")
    .option("failOnDataLoss", "false")
    .load()
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
)
# Display the resulting schema as the cell output.
df_category

DataFrame[key: string, value: string]

In [4]:
# Register both streaming DataFrames as temporary views so they can be
# queried by name through spark.sql (replacing any view of the same name).
df_category.createOrReplaceTempView("message_category")
df_product.createOrReplaceTempView("message_product")

In [5]:
# Select everything from the message_product view and start a streaming
# query that prints newly arrived rows to the console sink (append mode).
# The started StreamingQuery is the cell's displayed result.
res_product = spark.sql("SELECT * from message_product")
(
    res_product.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f5584cce9d0>

In [6]:
# Select everything from the message_category view and start a streaming
# query that prints newly arrived rows to the console sink (append mode).
# The started StreamingQuery is the cell's displayed result.
res_category = spark.sql("SELECT * from message_category")
(
    res_category.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f5584ce67d0>

In [7]:
# Forward the product stream to the 'spark-output-product' Kafka topic, where
# a local consumer can listen for it. The checkpoint directory stores the
# query's progress.
ds_product = (
    df_product.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "spark-output-product")
    .option("checkpointLocation", "/tmp/product")
    .start()
)

In [8]:
# Forward the category stream to the 'spark-output-category' Kafka topic, where
# a local consumer can listen for it. The checkpoint directory stores the
# query's progress.
ds_category = (
    df_category.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "spark-output-category")
    .option("checkpointLocation", "/tmp/category")
    .start()
)

In [None]:
# Block this cell until any one of the session's active streaming queries
# terminates, so the queries started above keep running in the meantime.
spark.streams.awaitAnyTermination()

In [None]:
###Done