In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, create_map
from pyspark.sql.types import MapType, StringType, StructType, StructField


# Spark session & context
spark = (SparkSession
         .builder
         .master('local')
         .appName('kafka-mongo-streaming')     
         # Add kafka package and mongodb package. Make sure to to this as one string!
         # Versions need to match the Spark version (trial & error)
         .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.mongodb.spark:mongo-spark-connector_2.12:2.4.0")
         # Mongo config including the username and password from compose file
         .config("spark.mongodb.input.uri","mongodb://root:example@mongo:27017/docstreaming.yelp?authSource=admin")
         .config("spark.mongodb.output.uri","mongodb://root:example@mongo:27017/docstreaming.yelp?authSource=admin")
         .getOrCreate())
sc = spark.sparkContext


In [2]:
# Read the message from the kafka stream
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:9092") \
  .option("subscribe", "Yelp-topic") \
  .load()

# convert the binary values to string
df1 = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [3]:
#Create a temporary view for SparkSQL
df1.createOrReplaceTempView("message")

In [4]:
# Write out the message to the console of the environment
res = spark.sql("SELECT * from message")
res.writeStream.format("console") \
            .outputMode("append") \
            .start()

<pyspark.sql.streaming.StreamingQuery at 0x7fbde81e6aa0>

In [5]:
# Write the unvonverted dataframe (no strings)
# message back into Kafka in another topic#
# listen to it with a local consumer
ds = df \
  .writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:9092") \
  .option("topic", "spark-output") \
  .option("checkpointLocation", "/tmp") \
  .start() 

StreamingQueryException: Query [id = 2b4a5333-190c-4fe4-9333-887f53c01abf, runId = f2dc32cd-859e-4464-8d3f-e983d6c792ea] terminated with exception: Writing job aborted

In [6]:
schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("business_id", StringType(), True),
    StructField("text", StringType(), True),
    StructField("date", StringType(), True),
    StructField("compliment_count", StringType(), True)
])

In [7]:
 

# Write the message into MongoDB
def foreach_batch_function(df, epoch_id):
    if df.rdd.isEmpty():
        print("Dataframe is empty")
        return True
    
    ds.show()
    # Parse the value column as a JSON string with the given schema
    df2 = ds.withColumn("value", from_json(col("value").cast("string"), schema))

    # Create a map of the individual fields dynamically based on the schema
    result_map_expr = {}
    for field in schema.fields:
        result_map_expr[field.name] = col("value").getItem(field.name).alias(field.name)

    result_map = create_map(result_map_expr)

    # Create a new dataframe with the map as a single row
    result_df = df2.withColumn("value", result_map).limit(1)

    # Extract the map as a dictionary
    result_dict = result_df.collect()[0].value.asDict()

    # Create a new dataframe from the dictionary
    new_df = spark.createDataFrame([result_dict])
    
    # Send the dataframe into MongoDB which will create a BSON document out of it
    new_df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()
    
    return True

In [8]:
# Start the MongoDB stream and wait for termination
df1.writeStream.foreachBatch(foreach_batch_function).start().awaitTermination()