In [1]:
from pyspark.sql import SparkSession

# you need these two to transform the json strings to dataframes
from pyspark.sql.types import MapType, StringType
from pyspark.sql.functions import from_json, col, explode, to_timestamp, from_utc_timestamp

# Spark session & context
spark = (SparkSession
         .builder
         .master('local')
         .appName('kafka-mongo-streaming')     
         # Add kafka package and mongodb package. Make sure to to this as one string!
         # Versions need to match the Spark version (trial & error)
         .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.mongodb.spark:mongo-spark-connector_2.12:2.4.0")
         # Mongo config including the username and password from compose file
         .config("spark.mongodb.input.uri","mongodb://root:example@mongo:27017/docstreaming.yelp?authSource=admin")
         .config("spark.mongodb.output.uri","mongodb://root:example@mongo:27017/docstreaming.yelp?authSource=admin")
         .getOrCreate())
sc = spark.sparkContext


In [2]:
# Read the message from the kafka stream
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:9092") \
  .option("subscribe", "Yelp-topic") \
  .load()

# convert the binary values to string
df1 = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [3]:
#Create a temporary view for SparkSQL
df1.createOrReplaceTempView("message")

In [4]:
# Write out the message to the console of the environment
res = spark.sql("SELECT * from message")
res.writeStream.format("console") \
            .outputMode("append") \
            .start()

<pyspark.sql.streaming.StreamingQuery at 0x7f0b62eadf30>

In [5]:
# Write the unvonverted dataframe (no strings)
# message back into Kafka in another topic#
# listen to it with a local consumer
ds = df \
  .writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:9092") \
  .option("topic", "spark-output") \
  .option("checkpointLocation", "/tmp") \
  .start() 

In [6]:
# Write the message into MongoDB
def foreach_batch_function(df, epoch_id):
    if df.rdd.isEmpty():
        print("Dataframe is empty")
        return True
    
    #Transform the values of all rows in column value and create a dataframe out of it (will also only have one row)
    df2=df.withColumn("value",from_json(df.value,MapType(StringType(),StringType()))) 
    
    #Explode the value column to create a key-value pair for each row
    df2 = df2.select(explode("value").alias("key", "value"))
    
    #Pivot the key-value pairs such that each key becomes a separate column, The "first" function returns the first non-null value in the group of values for each key.
    df2 = df2.groupBy().pivot("key").agg({"value": "first"})
    column_names = df2.columns
    # print(column_names)
    
    # df2.show()
    # df2.printSchema()
   
    # Transform the dataframe
    user_columns = ["user_id", "name", "review_count", "yelping_since"]
    tip_columns = ["user_id", "business_id", "text", "date"]
    # Check if data is coming from User 
    if all(col in column_names for col in user_columns):
        # Rename Columns
        df3 = df2.withColumnRenamed("name", "user_name") \
                 .withColumnRenamed("friends", "Numbers_of_friends")
        
        # Convert timestamp to UTC
        df3 = df3.withColumn("yelping_since_utc", to_timestamp("yelping_since", "yyyy-MM-dd HH:mm:ss"))
        # Convert timestamp to GMT+1
        df3 = df3.withColumn("yelping_since", from_utc_timestamp("yelping_since_utc", "+01:00"))
        # Drop the intermediate UTC column
        df3 = df3.drop("yelping_since_utc")
        df3.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()
            
    # Check if data is coming from Tip
    elif all(col in column_names for col in tip_columns):
        # Rename Columns
        df3 = df2.withColumnRenamed("text", "tip_text") \
                 .withColumnRenamed("date", "tip_date")
        
        # Convert timestamp to UTC
        df3 = df3.withColumn("tip_date_utc", to_timestamp("tip_date", "yyyy-MM-dd HH:mm:ss"))
        # Convert timestamp to GMT+1
        df3 = df3.withColumn("tip_date", from_utc_timestamp("tip_date_utc", "+01:00"))
        # Drop the intermediate UTC column
        df3 = df3.drop("tip_date_utc")
        
        df3.show()
        df3.printSchema()
        df3.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()
        
#     # Check if data is coming from Review
#     elif df2.filter(col("review_id").isNull() | col("text").isNull() | col("date").isNull()).count() == 0:
#         df3= df2.select(column_review)
#         df3.show()
#         df3.printSchema()
        
#     # Check if data is coming from Checkin
#     elif df2.filter(col("business_id").isNull() | col("date").isNull()).count() == 0:
#         df3= df2.select(column_checkin)
#         df3.show()
#         df3.printSchema()
        
#     # Check if data is coming from Business
#     elif df2.filter(col("name").isNull() | col("address").isNull() | col("city").isNull()).count() == 0:
#         col_business = ["value.business_id","value.name","value.address","value.city","value.postal_code","value.latitude","value.longitude","value.stars","value.review_count","value.is_open", "value.attributes_NoiseLevel"]
#         df3 = df2.select(col_business)
#         df3.show()
#         df3.printSchema()
    
#     #df3.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()  
# #     existing_df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
# #     existing_df.select("user_id").show()
# #     df3.select("user_id").show()
    
# #     if df3.select("user_id").count() != 0: 
        
# #         # Check if "user_id" is already on mongodb
# #         joined_df = df3.join(existing_df, "user_id", 'inner')

# #         # check if any rows were returned by the join operation
# #         if joined_df.select("user_id").count() > 0:
# #             joined_df.write.format("com.mongodb.spark.sql.DefaultSource").option("replaceDocument", "true").mode("append").save()
# #         else:
# #             df3.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()
# #     else:
# #         return False
     
    return True

SyntaxError: invalid syntax (539903584.py, line 39)

In [None]:
# Start the MongoDB stream and wait for termination
df1.writeStream.foreachBatch(foreach_batch_function).start().awaitTermination()