In [0]:
# This cell defines a streaming pipeline using only the PySpark DataFrame and
# Structured Streaming APIs, without the dp (or dlt) module. The approach is
# suitable for prototyping, or for cases where the managed features of
# Lakeflow Declarative Pipelines are not needed.


# Point the session at the catalog and schema used by this notebook so the
# unqualified table names further down resolve there.
for use_statement in ("USE CATALOG dataops_dev", "USE schema_test"):
    spark.sql(use_statement)


from pyspark.sql.functions import col, sum as _sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema applied to the incoming JSON records; every field is declared
# nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("money", IntegerType()),
        ("sales", IntegerType()),
        ("units", IntegerType()),
    )
])

# Streaming source: multi-line JSON files in the volume directory, parsed with
# the explicit schema. Rows that contain a null in any column are discarded.
bronze_sales_stream = (
    spark.readStream
    .schema(schema)
    .option("multiLine", True)
    .format("json")
    .load("/Volumes/dataops_dev/schema_test/volume_test/delta_tables/json/")
    .dropna()
)

# Write the streaming data to a Delta table.
# The availableNow trigger makes the query ingest every file currently present
# in the source directory and then terminate on its own, so the batch steps
# that follow never read a partially loaded bronze table.
bronze_query = (
    bronze_sales_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/Volumes/dataops_dev/schema_test/volume_test/delta_tables/checkpoints_bronze_spark2")
    .trigger(availableNow=True)
    .toTable("bronze_sales_spark3")
)

# Block until the backlog has been fully processed. Waiting without a fixed
# timeout removes the guesswork of "long enough", and any failure inside the
# streaming query is raised here.
bronze_query.awaitTermination()

# Batch-read the current contents of the bronze Delta table
bronze_sales_table = spark.read.table("bronze_sales_spark3")

# Silver layer: keep only rows with a positive money value and project Id plus
# money cast to double.
silver_sales = bronze_sales_table.filter("money > 0").select(
    "Id",
    col("money").cast("double"),
)

# Write the transformed data to a Delta table.
# silver_sales is rebuilt from the entire bronze table on every run, so the
# table is replaced rather than appended to: appending would store every
# bronze row again on each re-run and inflate the totals aggregated from it.
(
    silver_sales.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver_sales_spark3")
)

# Batch-read the silver Delta table
silver_sales_table = spark.read.table("silver_sales_spark3")

# Gold layer: one row per Id carrying the sum of its money values
gold_sales = silver_sales_table.groupBy(col("Id")).agg(
    _sum(col("money")).alias("total_money")
)

# Persist the aggregate as a Delta table, replacing whatever a previous run
# left behind.
gold_sales.write.saveAsTable(
    "gold_sales_summary_spark3",
    format="delta",
    mode="overwrite",
)

In [0]:
# Make sure the bronze streaming query started by the pipeline cell is stopped.
bronze_query.stop()

In [0]:
%sql
-- Inspect the rows written to the bronze table by the streaming query.
SELECT * FROM bronze_sales_spark3;


Id,name,age,money,sales,units
10,Julia,26,1300,2800,28
11,Kevin,39,1750,5500,55
12,Laura,31,1450,3800,38
7,George,30,1100,2200,22
8,Hannah,28,1600,4000,40
9,Ian,36,2000,6500,65
1,Alice,25,1200,4500,45
2,Bob,32,850,3000,30
3,Charlie,29,1500,6000,60
4,Diana,41,2100,7500,75


In [0]:
%sql
-- Inspect the silver table: Id and money (as double) for rows with money > 0.
SELECT * FROM silver_sales_spark3;


Id,money
10,1300.0
11,1750.0
12,1450.0
7,1100.0
8,1600.0
9,2000.0
1,1200.0
2,850.0
3,1500.0
4,2100.0


In [0]:
%sql
-- Inspect the gold table: total money per Id.
SELECT * FROM gold_sales_summary_spark3;


Id,total_money
12,1450.0
1,1200.0
6,1800.0
3,1500.0
5,950.0
9,2000.0
4,2100.0
8,1600.0
7,1100.0
10,1300.0
