# In [0]:
from pyspark import pipelines as dp
from pyspark.sql.functions import col, sum as _sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the raw sales JSON records; every column is nullable.
schema = (
    StructType()
    .add("Id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("money", IntegerType(), True)
    .add("sales", IntegerType(), True)
    .add("units", IntegerType(), True)
)

# --------------------------
# Bronze: Streaming ingestion
# --------------------------
@dp.table
def bronze_sales_spark4():
    """
    Stream raw sales records from JSON files into the bronze table.

    Files under the source volume directory are read as a stream using
    the module-level ``schema`` with the ``multiLine`` JSON option
    enabled.

    Returns:
        A streaming DataFrame with the columns declared in ``schema``.
    """
    source_dir = "/Volumes/dataops_dev/schema_test/volume_test/delta_tables/json/"

    json_reader = spark.readStream.format("json")
    json_reader = json_reader.option("multiLine", True)
    json_reader = json_reader.schema(schema)

    return json_reader.load(source_dir)

# --------------------------
# Silver: Streaming clean & quality
# --------------------------
@dp.table
@dp.expect("valid_amount", "money > 0")  # data quality check
def silver_sales_spark4():
    """
    Clean the bronze sales stream.

    Reads the ``bronze_sales_spark4`` dataset defined in this pipeline
    as a stream and casts ``money`` to ``double``. The ``valid_amount``
    expectation (``money > 0``) is attached by the decorator above.

    Returns:
        A streaming DataFrame with the bronze columns, ``money`` as double.
    """
    return (
        # The name must match the bronze function name exactly, otherwise
        # the pipeline cannot resolve the upstream dependency.
        spark.readStream.table("bronze_sales_spark4")
        .withColumn("money", col("money").cast("double"))
    )

# --------------------------
# Gold: Batch aggregation
# --------------------------
@dp.materialized_view
def gold_sales_summary_spark4():
    """
    Aggregate the silver sales data in batch mode.

    Performs a batch read of the ``silver_sales_spark4`` dataset defined
    in this pipeline and sums ``money`` per ``Id``.

    Returns:
        A batch DataFrame with columns ``Id`` and ``total_money``.
    """
    return (
        # Batch (non-streaming) read of the silver dataset; the name must
        # match the silver function name exactly.
        spark.read.table("silver_sales_spark4")
        .groupBy("Id")
        .agg(
            _sum("money").alias("total_money")
        )
    )