In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, ArrayType
from pyspark.sql.functions import explode

In [2]:
spark = (SparkSession
         .builder
         .config("spark.streaming.stopGracefullyOnShutdown", True)
         .master("local[*]")
         .appName("Reading from file").getOrCreate())

In [3]:
spark.conf.set("spark.sql.streaming.schemaInterface", True)
spark

In [4]:
devices_element_schema = StructType([
    StructField('deviceId', StringType(), True),
    StructField('measure', StringType(), True),
    StructField('status', StringType(), True),
    StructField('temperature', LongType(), True)
])

devices_schema = ArrayType(devices_element_schema, True)

data_schema = StructType([
    StructField('devices', devices_schema, True)
])

df_schema = StructType([
    StructField('customerId', StringType(), True),
    StructField('data', data_schema, True),
    StructField('eventId', StringType(), True),
    StructField('eventOffset', LongType(), True),
    StructField('eventPublisher', StringType(), True),
    StructField('eventTime', StringType(), True),
])


streaming_df = (
    spark.readStream
    .option("multiline","true")
    .option("cleanSource", "archive")
    .option("sourceArchiveDir", "archive_dir")
    .option("maxFilesPerTrigger", 1)
    .format("json")
    .schema(df_schema)
    .load("../data/input/events/"))

In [5]:
streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [6]:
# streaming_df.show(truncate=False)

In [7]:
exploded_df = streaming_df.withColumn('devices', explode(streaming_df.data.devices)).drop('data')

In [8]:
exploded_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- devices: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)



In [9]:
# exploded_df.show()

In [10]:
flattened_df = exploded_df.withColumns({
    'device_id': exploded_df.devices.deviceId,
    'measure' : exploded_df.devices.measure,
    'status': exploded_df.devices.status,
    'temperature' : exploded_df.devices.temperature
}).drop('devices')

In [11]:
flattened_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [13]:
(flattened_df
 .writeStream
 .format("console")
 .outputMode("append")
  .option("checkpointLocation", "checkpoint_dir")
 .start()
 .awaitTermination())

StreamingQueryException: Query [id = 719b1bb7-6455-45cd-b215-a478ea4ed219, runId = 971957a7-6924-49f5-8952-60ee864fb69c] terminated with exception: Multiple streaming queries are concurrently using file:/home/jovyan/work/scripts/checkpoint_dir/commits

In [None]:
# (flattened_df
#  .writeStream
#  .format("csv")
#  .outputMode("append")
#  .option("path", "../data/output/events/")        
#  .option("checkpointLocation", "checkpoint_dir")
#  .start()
#  .awaitTermination())