## How streaming works

![streaming works](../../images/streamingworks1.png)

![streaming works](../../images/streamingworks2.png)

In [None]:
# Streaming pipeline built only with the PySpark DataFrame and Structured Streaming
# APIs, i.e. without the dp (or dlt) module. This approach is suitable for prototyping,
# or whenever the managed features of Lakeflow Declarative Pipelines are not needed.

# Select the working catalog and schema for this session; the tables written further
# down are referenced by unqualified name.
for _use_statement in ("USE CATALOG catalog", "USE schema"):
    spark.sql(_use_statement)


from pyspark.sql.functions import col, sum as _sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the incoming JSON sales records, declared as (column, type)
# pairs. Every column is nullable.
_sales_columns = [
    ("Id", IntegerType()),
    ("name", StringType()),
    ("age", IntegerType()),
    ("money", IntegerType()),
    ("sales", IntegerType()),
    ("units", IntegerType()),
]
schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _sales_columns]
)

# Bronze layer: incrementally ingest JSON files with Auto Loader ("cloudFiles").
# The result is a *streaming* DataFrame; nothing is read until a query is started on it.
_sales_json_reader = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("multiLine", True)  # each record may span several lines in the file
    .schema(schema)
)
# To cap how many files each micro-batch picks up, add an Auto Loader rate limit to
# the reader above, e.g. .option("cloudFiles.maxFilesPerTrigger", 5).

_raw_sales_stream = _sales_json_reader.load(
    "/Volumes/dataops_dev/schema_test/volume_test/delta_tables/json/"
)

# Discard every row that has a null in any column.
bronze_sales_spark = _raw_sales_stream.na.drop()

# Silver layer: quality check plus projection.
# Keep only rows with a strictly positive "money" value ...
_positive_money_sales = bronze_sales_spark.where(col("money") > 0)

# ... and retain just the key and the amount, with the amount cast to double.
silver_sales_spark = _positive_money_sales.select(
    col("Id"),
    col("money").cast("double"),
)

# Gold layer: streaming aggregation (not a batch one) — the total of "money" per Id,
# kept up to date as new silver rows arrive.
_sales_grouped_by_id = silver_sales_spark.groupBy("Id")
gold_sales_summary_spark = _sales_grouped_by_id.agg(
    _sum("money").alias("total_money")
)

# Root folder (Unity Catalog volume) under which each query keeps its checkpoint.
DELTA_TABLES_ROOT = "/Volumes/dataops_dev/schema_test/volume_test/delta_tables"


def start_delta_stream(stream_df, table_name, checkpoint_name,
                       output_mode="append", query_name=None, processing_time=None):
    """Start a streaming write of ``stream_df`` into a Delta table.

    Args:
        stream_df: Streaming DataFrame to write.
        table_name: Target table passed to ``toTable``.
        checkpoint_name: Folder name under ``DELTA_TABLES_ROOT`` used as the
            query's ``checkpointLocation``. Each query needs its own folder.
        output_mode: Structured Streaming output mode ("append" or "complete").
        query_name: Optional name for the query; left unset when ``None``.
        processing_time: Optional micro-batch interval such as "10 seconds";
            when ``None`` the default trigger is used.

    Returns:
        The running ``StreamingQuery`` handle returned by ``toTable``.
    """
    writer = (
        stream_df.writeStream
        .format("delta")
        .outputMode(output_mode)
        .option("checkpointLocation", f"{DELTA_TABLES_ROOT}/{checkpoint_name}")
    )
    if query_name is not None:
        writer = writer.queryName(query_name)
    if processing_time is not None:
        # The keyword is camelCase: trigger(processingTime=...), not processingtime.
        writer = writer.trigger(processingTime=processing_time)
    return writer.toTable(table_name)


# Bronze and silver only ever add new rows, so they are written in append mode.
bronze_query = start_delta_stream(
    bronze_sales_spark, "bronze_sales_spark2", "checkpoints_bronze_spark"
)
silver_query = start_delta_stream(
    silver_sales_spark, "silver_sales_spark2", "checkpoints_silver_spark"
)

# Gold is an aggregation: complete mode rewrites the full result table on every
# trigger so that the per-Id totals stay current.
gold_query = start_delta_stream(
    gold_sales_summary_spark,
    "gold_sales_summary_spark2",
    "checkpoints_golden_spark",
    output_mode="complete",
)



In [None]:
# While the queries are running they can be inspected through attributes such as
# gold_query.name, gold_query.id, gold_query.runId, gold_query.status and
# gold_query.lastProgress.

import time

# Give the streaming queries some time to pick up and process the input files
# before they are stopped in the next cell.
STREAM_WARM_UP_SECONDS = 10
time.sleep(STREAM_WARM_UP_SECONDS)

In [0]:
_running_queries = (bronze_query, silver_query, gold_query)

# First ask every query to stop ...
# (original note: or stop_streaming_query("query name") — that helper is not defined
# in this part of the notebook)
for _query in _running_queries:
    _query.stop()

# ... then wait for each of them to terminate.
# awaitTermination() is a blocking call that waits for the streaming query to finish.
for _query in _running_queries:
    _query.awaitTermination()

![streaming query](../../images/streaming_query.png)

In [0]:
%sql
-- Inspect the bronze Delta table that bronze_query writes to.
SELECT * FROM bronze_sales_spark2;


Id,name,age,money,sales,units
10,Julia,26,1300,2800,28
11,Kevin,39,1750,5500,55
12,Laura,31,1450,3800,38
7,George,30,1100,2200,22
8,Hannah,28,1600,4000,40
9,Ian,36,2000,6500,65
1,Alice,25,1200,4500,45
2,Bob,32,850,3000,30
3,Charlie,29,1500,6000,60
4,Diana,41,2100,7500,75


In [0]:
%sql
-- Inspect the silver Delta table that silver_query writes to (Id and money only).
SELECT * FROM silver_sales_spark2;


Id,money
10,1300.0
11,1750.0
12,1450.0
7,1100.0
8,1600.0
9,2000.0
1,1200.0
2,850.0
3,1500.0
4,2100.0


In [0]:
%sql
-- Inspect the gold Delta table that gold_query writes to (total_money per Id).
SELECT * FROM gold_sales_summary_spark2;



Id,total_money
12,1450.0
1,1200.0
3,1500.0
5,950.0
9,2000.0
4,2100.0
8,1600.0
7,1100.0
10,1300.0
11,1750.0
