# Summary

This notebook is a simple example of Spark Structured Streaming: it reads new files as they are created in a source directory and prints their data to the console.

# Imports

In [None]:
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Start Spark App

In [None]:
# Extra session settings, applied to the builder in the order listed.
spark_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.warehouse.dir": "data/",
}

# Assemble the builder step by step; getOrCreate() hands back an
# already-running session if one exists, otherwise it starts a new one.
builder = SparkSession.builder.appName("spark_structured_streaming_directory")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

# Setup

## Define and create directories

In [None]:
# Locations used by the notebook, keyed by role:
#   source            - directory the readStream watches for new CSV files
#   bronze            - not referenced by the cells in this notebook
#   new_records_ckpt  - checkpoint directory for new_records_query
#   stats_ckpt        - checkpoint directory for stats_query
paths = {
    "source": "data/source",
    "bronze": "data/delta/bronze",
    "new_records_ckpt": "data/_checkpoints/new_records",
    "stats_ckpt": "data/_checkpoints/stats",
}

# Ensure the source path exists so it
# doesn't fail when creating the readStream.
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(paths["source"], exist_ok=True)

# Start ReadStream

In [None]:
# Explicit schema for the incoming CSV files: two nullable columns.
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
    ]
)

# Streaming DataFrame over the source directory; the first line of
# each CSV file is treated as a header row.
streaming_df = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load(paths["source"])
)

# Write Streams

## new_records_query

In [None]:
# Print every newly captured record to the console sink.
# New records show up whenever a new file is created in the
# source directory; "append" mode emits only those new rows.
new_records_query = (
    streaming_df.writeStream
    .outputMode("append")
    .format("console")
    .option("checkpointLocation", paths["new_records_ckpt"])
    .start()
)

## stats_query

In [None]:
# Running aggregates over the whole stream: row count and mean age.
stats_df = streaming_df.agg(
    F.count("*").alias("count"),
    F.avg("age").alias("avg_age"),
)

# "complete" mode writes the full aggregate result to the console
# each time it is updated.
stats_query = (
    stats_df.writeStream
    .outputMode("complete")
    .format("console")
    .option("checkpointLocation", paths["stats_ckpt"])
    .start()
)

## await termination

In [None]:
# This cell will execute indefinitely and the outputs of
# the write streams will be printed here.
# It blocks until any query terminates (or fails), or until the
# cell is interrupted.
try:
    spark.streams.awaitAnyTermination()
finally:
    # Stop whatever is still running so no query lingers in the kernel
    # holding its checkpoint location; without this, re-running the
    # writeStream cells after an interrupt needs a kernel restart.
    for active_query in spark.streams.active:
        active_query.stop()