In [0]:
import os
from datetime import datetime

from pyspark.sql import SparkSession, functions as F


In [0]:
# JDBC connection settings for the detections sink.
# The URL and credentials are read from the environment so that secrets do not
# have to be typed into the notebook. Each value falls back to "" (the previous
# hard-coded value) when its variable is not set.
jdbc_url = os.environ.get("JDBC_URL", "")
jdbc_props = {
    "user": os.environ.get("JDBC_USER", ""),
    "password": os.environ.get("JDBC_PASSWORD", ""),
    "driver": "org.postgresql.Driver"
}

In [0]:
# Driver-side state shared across micro-batches:
#   customer_merchant_amount_df - running (customer, merchant) totals; None until
#                                 the first batch has been folded in.
#   already_detected_pat2       - "customer|merchant" keys that were already reported.
customer_merchant_amount_df, already_detected_pat2 = None, set()


In [0]:
# Storage locations: where the schema files for this stream are kept, and the
# folder that is watched for incoming transaction files.
schema_location = "abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net/schema/ChunksSchemaPat2/"
input_dir = "abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net/transactions/"

# Streaming reader over input_dir using the "cloudFiles" source, configured for
# CSV files that carry a header row.
streaming_df = spark.readStream.format("cloudFiles").options(
    **{
        "cloudFiles.format": "csv",
        "header": "true",
        "cloudFiles.schemaLocation": schema_location,
    }
).load(input_dir)


In [0]:
def foreach_batch_function(batch_df, batch_id):
    """Fold one micro-batch into the rolling totals and report new Pattern 2 hits.

    For every (customer, merchant) pair the function keeps a running transaction
    count and amount total across batches. A pair is a Pattern 2 detection once
    its cumulative count is at least 80 and its average amount is below 23.
    Each pair is reported at most once: new detections are appended to the
    ``already_detected_pat2`` table over JDBC and their keys are remembered in
    the ``already_detected_pat2`` set.

    Args:
        batch_df: DataFrame for the current micro-batch; must provide the
            ``customer``, ``merchant`` and ``amount`` columns.
        batch_id: Identifier of the micro-batch (used for logging only).

    Side effects:
        Rebinds the global ``customer_merchant_amount_df``, mutates the global
        ``already_detected_pat2`` set, prints progress, and writes to JDBC.

    NOTE(review): both pieces of state live only in notebook globals, so they
    presumably start empty again whenever the notebook/kernel is restarted,
    while the stream itself resumes from its checkpoint - confirm this is
    acceptable.
    """
    global customer_merchant_amount_df, already_detected_pat2

    print(f"\n Processing batch {batch_id} rows: {batch_df.count()}")

    batch_df = batch_df.withColumn("amount", F.col("amount").cast("double"))

    # Step 1: Aggregate stats from this chunk
    stats = (
        batch_df.groupBy("customer", "merchant")
        .agg(
            F.count("*").alias("txn_count"),
            F.sum("amount").alias("total_amount")
        )
    )

    # Step 2: Update rolling state
    if customer_merchant_amount_df is None:
        updated_state = stats
    else:
        updated_state = (
            customer_merchant_amount_df.union(stats)
            .groupBy("customer", "merchant")
            .agg(
                F.sum("txn_count").alias("txn_count"),
                F.sum("total_amount").alias("total_amount")
            )
        )

    # Materialize the state now and cut its lineage. Without this the state is a
    # lazy plan that unions in every earlier micro-batch, so the plan grows with
    # each batch and every action below would re-evaluate all historical input.
    # Trade-off: locally checkpointed data is held on the executors rather than
    # in reliable storage.
    customer_merchant_amount_df = updated_state.localCheckpoint(eager=True)

    print(f" Updated state rows: {customer_merchant_amount_df.count()}")

    # Step 3: Apply Pattern 2 rules (>= 80 transactions, average amount < 23)
    result_df = (
        customer_merchant_amount_df
        .withColumn("avg_amount", F.col("total_amount") / F.col("txn_count"))
        .filter((F.col("txn_count") >= 80) & (F.col("avg_amount") < 23))
    )

    # Remove already detected
    pair_key = F.concat_ws("|", "customer", "merchant")
    new_result_df = result_df.filter(~pair_key.isin(already_detected_pat2))

    # Collect the new keys once; the list doubles as the emptiness check and as
    # the update for the detected-key set, instead of running a separate count().
    new_keys = [row[0] for row in new_result_df.select(pair_key).distinct().collect()]

    if not new_keys:
        print("No new detections this batch.")
        return

    detections_df = (
        new_result_df
        .withColumn("YStartTime", F.current_timestamp())
        .withColumn("detectionTime", F.current_timestamp())
        .withColumn("patternId", F.lit("PatId2"))
        .withColumn("ActionType", F.lit("CHILD"))
        .withColumnRenamed("customer", "customerName")
        .withColumnRenamed("merchant", "MerchantId")
        .select("YStartTime", "detectionTime", "patternId", "ActionType", "customerName", "MerchantId", "txn_count")
    )

    detections_df.show(truncate=False)

    detections_df.write.jdbc(jdbc_url, "already_detected_pat2", mode="append", properties=jdbc_props)

    # Mark these as detected only after the write succeeded, so a failed write
    # does not suppress the detection on a later batch.
    already_detected_pat2.update(new_keys)

    print(f" New detections saved: {len(new_keys)}")




In [0]:

# Start the streaming query: each micro-batch of streaming_df is handed to
# foreach_batch_function, and the query's progress is recorded under the
# checkpoint location below.
query = (
    streaming_df.writeStream
    .outputMode("append")
    .option("checkpointLocation", "abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net/schema/checkpoint/")
    .foreachBatch(foreach_batch_function)
    .start()
)

# Keep this cell running until the query stops or fails.
query.awaitTermination()