In [0]:
# Lab 00a: Before we begin, confirm all files are loaded
uploaded_files = dbutils.fs.ls("dbfs:/FileStore/tables/")
display(uploaded_files)

# Mod 06a: Streaming

In [0]:
# https://hackersandslackers.com/structured-streaming-in-pyspark/

# https://medium.com/expedia-group-tech/apache-spark-structured-streaming-checkpoints-and-triggers-4-of-6-b6f15d5cfd8d

#dbutils.fs.rm("dbfs:/FileStore/tables/sfpd/", True)

### Lab 01: Aggregate using BATCH

In [0]:
# These are the files that will be streamed in during Lab 02
stream_source_dir = "dbfs:/FileStore/tables/stream1"
dbutils.fs.ls(stream_source_dir)

In [0]:
# Peek at the contents of one of the 20 JSON source files
sample_json_df = spark.read.json("dbfs:/FileStore/tables/stream1/1.json")
display(sample_json_df)

In [0]:
# Declare the schema up front (this is mandatory for Streaming)
from pyspark.sql.types import TimestampType, StringType, StructType, StructField

# Every column, including "time", is read as a nullable string
DDL_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("time", "customer", "action", "device")
])

In [0]:
# Batch read: load every JSON file in the folder using the explicit schema,
# then discard any row that contains a null in any column.
# (No "header" option here: it is a CSV-reader setting and has no effect on JSON.)
staticDF = (spark.read
                 .schema(DDL_schema)
                 .json("dbfs:/FileStore/tables/stream1/")
                 .na.drop())

display(staticDF)

In [0]:
# Count how many rows there are for each action
actionsDF = staticDF.groupBy("action").count()

# Expose the result to SQL as the temp view 'iot_action_counts'
actionsDF.createOrReplaceTempView("iot_action_counts")

In [0]:
%sql
-- Total count per action from the batch aggregate
SELECT
  action,
  SUM(count) AS total_count
FROM iot_action_counts
GROUP BY action

### Lab 02: Aggregate using STREAMING

In [0]:
# Our data isn't being created in real time, so we'll use a trick to emulate streaming conditions.
# Instead of streaming data as it arrives, we load our JSON files one at a time.
# That's what option() is doing: we set the maxFilesPerTrigger option to 1,
# which means only a single JSON file is read per trigger. This lets us watch the data as it streams in!

In [0]:
# Create the streaming equivalent of `staticDF` using .readStream.
# maxFilesPerTrigger = 1 limits each micro-batch to a single file.
# Rows containing nulls are dropped, exactly as in the batch read, so the
# streaming totals can be compared with the Lab 01 totals.
streamReadDF = (spark.readStream
    .schema(DDL_schema)
    .option("maxFilesPerTrigger", 1)
    .json("dbfs:/FileStore/tables/stream1/")
    .na.drop())

# Running count of rows per 'action' in the streaming DataFrame
streamingActionCountsDF = (streamReadDF.groupBy(streamReadDF.action).count())

In [0]:
# Sanity check: evaluates to True when `streamingActionCountsDF` is a streaming DataFrame
streamingActionCountsDF.isStreaming

In [0]:
# A stream is normally started with a 'writeStream' followed by 'start()'.
# Here we instead call 'display' on the streaming DataFrame to view its output.
# Use it to watch the aggregate update as each of the 20 files is processed.
# When the counts match the batch totals from Lab 01, click the 'Cancel' link to stop the stream.
display(streamingActionCountsDF)

In [0]:
# Start the streaming aggregation with an explicit 'writeStream':
#   - source:      the streaming aggregate DataFrame 'streamingActionCountsDF'
#   - sink:        an in-memory table named 'counts' (queryable from SQL)
#   - outputMode:  'complete', so the full set of counts is rewritten on every trigger
#   - trigger:     one micro-batch every 5 seconds
#   - checkpoint:  progress is recorded under dbfs:/tmp/checkpoint/ for fault tolerance
#   - start():     launches the query
# NOTE: if a checkpoint from an earlier run is still present, the query resumes from it
# rather than replaying every file; run the clean-up cell first for a full replay.

# Use only 2 shuffle partitions for the aggregation
spark.conf.set("spark.sql.shuffle.partitions", "2")

# Make this cell re-runnable: a query name must be unique among active queries,
# so stop any 'counts' query left running by a previous execution.
for active_query in spark.streams.active:
    if active_query.name == "counts":
        active_query.stop()

# View stream in 5-second micro-batches
query = (streamingActionCountsDF
    .writeStream
    .format("memory")
    .queryName("counts")
    .outputMode("complete")
    .trigger(processingTime = "5 seconds")
    .option("checkpointLocation", "dbfs:/tmp/checkpoint/")
    .start())

In [0]:
# While the streaming query above is running, inspect its current status
query.status

In [0]:
%sql
-- Re-run this cell every 5 seconds to view the updated counts
SELECT * FROM counts

In [0]:
# Stop the streaming query
query.stop()

In [0]:
# Confirm the checkpoint directory is populated.
# Uses the same fully qualified dbfs:/ path as the other checkpoint cells.
dbutils.fs.ls("dbfs:/tmp/checkpoint/")

In [0]:
# Confirm there is an offset entry for every micro-batch that was processed
dbutils.fs.ls("dbfs:/tmp/checkpoint/offsets/")

In [0]:
# Clean up: recursively delete the checkpoint directory
dbutils.fs.rm("dbfs:/tmp/checkpoint/", True)

# End of Module 06a-Streaming