In [31]:
# Build (or reuse) the SparkSession used by every cell below.
from pyspark.sql import SparkSession

# Session-level settings, applied to the builder one by one.
# NOTE(review): the Kafka connector coordinates (Scala 2.12 / 3.3.0) presumably
# have to match the installed Spark build -- confirm before upgrading Spark.
session_conf = {
    "spark.streaming.stopGracefullyOnShutdown": True,
    'spark.jars.packages': 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0',
    "spark.sql.shuffle.partitions": 4,
}

session_builder = SparkSession.builder.appName("Streaming from Kafka")
for conf_key, conf_value in session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# local[*] runs Spark in-process using all available cores.
spark = session_builder.master("local[*]").getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

In [32]:
# Streaming source: subscribe to the Kafka topic "people".

# "startingOffsets" only matters the first time a query starts; per the Kafka
# integration guide, a query resumed from a checkpoint continues where it left off.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "people",  # "people" is the topic name
    "startingOffsets": "earliest",
}

kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

In [33]:
# Show the columns exposed by the Kafka source (key/value arrive as binary).
kafka_df.printSchema()
# Note: kafka_df.show() is left disabled -- show() is not supported on a
# streaming DataFrame; use a console sink via writeStream to inspect rows.


root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [34]:
from pyspark.sql.functions import expr

# The Kafka source delivers `value` as binary; re-type it as a string in place
# so it can be parsed as JSON later. All other Kafka columns are kept as-is.
value_as_text = expr("cast(value as string)")
kafka_json_df = kafka_df.withColumn("value", value_as_text)
# To inspect rows, write the stream to a console sink (show() does not work
# on a streaming DataFrame).

In [35]:
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType


In [36]:
# df=spark.read.json("data/input/people1.json")

In [37]:
# Explicit schema for the JSON payload carried in the Kafka `value` column.
# (Alternative kept from the original notes: infer it from a sample file,
#  e.g. spark.read.json("data/input/people1.json").schema.)
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType

# contacts: one email plus a list of phone numbers.
contacts_struct = StructType([
    StructField("email", StringType(), True),
    StructField("phones", ArrayType(StringType()), True),
])

# A single entry of the `projects` array.
project_struct = StructType([
    StructField("name", StringType(), True),
    StructField("duration", StringType(), True),
])

# Top-level record; every field is nullable.
myschema = (
    StructType()
    .add("id", LongType(), True)
    .add("name", StringType(), True)
    .add("age", LongType(), True)
    .add("contacts", contacts_struct, True)
    .add("skills", ArrayType(StringType()), True)
    .add("projects", ArrayType(project_struct), True)
)

In [38]:
from pyspark.sql.functions import from_json, col

# Parse the JSON text in `value` against `myschema`, then promote the parsed
# struct's fields to top-level columns. The Kafka metadata columns are dropped
# because only `values_json.*` is selected.
parsed_value = from_json(col("value"), myschema).alias("values_json")
streaming_df = (
    kafka_json_df
    .select(parsed_value)
    .select("values_json.*")
)
# To inspect rows, write the stream to a console sink (show() does not work
# on a streaming DataFrame).

In [39]:
# Confirm the parsed payload exposes the fields declared in `myschema`.
streaming_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- contacts: struct (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- phones: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- projects: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- duration: string (nullable = true)



In [40]:
from pyspark.sql.functions import explode

# Flatten the nested record into one row per (phone, skill, project) combination.
# NOTE(review): explode() emits no row when the array is null or empty, so a
# person lacking phones, skills or projects disappears from the output, and the
# three explodes multiply (n_phones * n_skills * n_projects rows per person).
# Confirm both effects are intended.

# 1) Lift the fields of the `contacts` struct to the top level.
contacts_col = col("contacts")
contacts_lifted = (
    streaming_df
    .withColumn("email", contacts_col.getField("email"))
    .withColumn("phones", contacts_col.getField("phones"))
)

# 2) Explode each array column into a singular column, in the same order as before.
exploded = contacts_lifted
for item_col, array_col in (("phone", "phones"), ("skill", "skills"), ("project", "projects")):
    exploded = exploded.withColumn(item_col, explode(array_col))

# 3) Unpack the project struct and drop the now-redundant nested columns.
df_flat = (
    exploded
    .withColumn("project_name", col("project.name"))
    .withColumn("project_duration", col("project.duration"))
    .drop("contacts", "skills", "projects", "phones", "project")
)

df_flat.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- skill: string (nullable = true)
 |-- project_name: string (nullable = true)
 |-- project_duration: string (nullable = true)



In [None]:
# Write the flattened stream to the console sink for inspection.
# - outputMode("append"): only newly arrived rows are printed.
# - trigger(once=True): one-time trigger; the query stops by itself after
#   processing the data available at start.
# - checkpointLocation: progress is persisted in "checkpoint_dir_kafka".
console_query = (
    df_flat
    .writeStream
    .format("console")
    .outputMode("append")
    .trigger(once=True)
    .option("checkpointLocation", "checkpoint_dir_kafka")
    .start()
)

try:
    # Blocks until the query finishes (or fails, in which case this raises).
    console_query.awaitTermination()
finally:
    # Keep a handle and always stop the query: if the cell is interrupted while
    # waiting, the query would otherwise keep running in the background with no
    # reference left to stop it. Calling stop() on a finished query is harmless.
    console_query.stop()

25/04/19 06:56:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---+----+---+---------------+------------+-----+------------+----------------+
| id|name|age|          email|       phone|skill|project_name|project_duration|
+---+----+---+---------------+------------+-----+------------+----------------+
|  2| Bob| 25|bob@example.com|555-111-2222| Java|       Gamma|        4 months|
|  2| Bob| 25|bob@example.com|555-111-2222|Kafka|       Gamma|        4 months|
+---+----+---+---------------+------------+-----+------------+----------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---+-------+---+-------------------+------------+-------+------------+----------------+
| id|   name|age|              email|       phone|  skill|project_name|project_duration|
+---+-------+---+-------------------+------------+-------+------------+----------------+
|  3|Charlie| 35|charlie@example.com|444-333-2222|  Scala|       Delta|        2 months|
|  3|Charlie| 35|charlie@example.com|444-333-2222|  Scala|     Epsilon|        5 months|
|  3|Charlie| 35|charlie@example.com|444-333-2222|Airflow|       Delta|        2 months|
|  3|Charlie| 35|charlie@example.com|444-333-2222|Airflow|     Epsilon|        5 months|
|  3|Charlie| 35|charlie@example.com|222-333-4444|  Scala|       Delta|        2 months|
|  3|Charlie| 35|charlie@example.com|222-333-4444|  Scala|     Epsilon|        5 months|
|  3|Charlie| 35|charlie@example.com|222-333-4444|Airflow|       Delta|        2 months|
|  3|Charlie|