# In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, lit, expr, when, coalesce
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, TimestampType
import pyspark.sql.functions as F


# Static reference data, read once as a batch DataFrame. The first CSV line
# is used as the header; no schema is supplied or inferred here.
customer_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net/reference_data/customer data.csv")
)


# Running per-merchant transaction totals; starts empty and is replaced by
# foreach_batch_function on every micro-batch that has matches.
merchant_transaction_count_df = spark.createDataFrame(
    [],
    StructType([
        StructField("merchant", StringType(), True),
        StructField("total_txn", LongType(), True),
    ]),
)

# Running per-(customer, merchant) transaction count and average weight;
# starts empty and is replaced by foreach_batch_function in the same way.
customer_merchant_stats_df = spark.createDataFrame(
    [],
    StructType([
        StructField("customer", StringType(), True),
        StructField("merchant", StringType(), True),
        StructField("txn_count", LongType(), True),
        StructField("avg_weight", DoubleType(), True),
    ]),
)

# Explicit schema for the streamed transaction CSV files (a streaming read
# needs the schema up front). Every column is nullable.
transactions_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("step", LongType()),
        ("customer", StringType()),
        ("age", StringType()),
        ("gender", StringType()),
        ("zipcodeOri", StringType()),
        ("merchant", StringType()),
        ("zipMerchant", StringType()),
        ("category", StringType()),
        ("amount", DoubleType()),
        ("fraud", LongType()),
    )
])

# Streaming source: CSV files (with a header line) that appear under the
# transactions/ directory, parsed with the explicit transactions_schema.
transactions_df = (
    spark.readStream
    .format("csv")
    .schema(transactions_schema)
    .option("header", True)
    .load("abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net/transactions/")
)


def _merge_merchant_counts(current_df, merged_df):
    """Return per-merchant running totals after adding this batch's matches.

    Args:
        current_df: Totals so far, columns ``merchant`` and ``total_txn``.
        merged_df: This batch's transactions that matched the reference data.

    Returns:
        A DataFrame with columns ``merchant`` and ``total_txn``.
    """
    batch_counts = merged_df.groupBy("merchant").count()

    if current_df.isEmpty():
        return batch_counts.withColumnRenamed("count", "total_txn")

    previous = current_df.withColumnRenamed("total_txn", "left_total_txn")
    incoming = batch_counts.withColumnRenamed("count", "right_new_txn_count")

    # Outer join keeps merchants seen on only one side; fillna(0) turns the
    # missing side's count into 0 so the addition below never yields null.
    return (
        previous.join(incoming, on="merchant", how="outer")
        .fillna(0)
        .withColumn("total_txn", col("left_total_txn") + col("right_new_txn_count"))
        .select("merchant", "total_txn")
    )


def _merge_customer_stats(current_df, merged_df):
    """Return per-(customer, merchant) stats after adding this batch's matches.

    The running average is combined as a count-weighted mean of the previous
    average and this batch's average.

    Args:
        current_df: Stats so far, columns ``customer``, ``merchant``,
            ``txn_count`` and ``avg_weight``.
        merged_df: This batch's transactions that matched the reference data.

    Returns:
        A DataFrame with columns ``customer``, ``merchant``, ``txn_count``
        and ``avg_weight``.
    """
    batch_stats = merged_df.groupBy("customer", "merchant").agg(
        count("step").alias("new_txn_count"),
        avg("Weight").alias("new_avg_weight"),
    )

    if current_df.isEmpty():
        return (
            batch_stats
            .withColumnRenamed("new_txn_count", "txn_count")
            .withColumnRenamed("new_avg_weight", "avg_weight")
        )

    previous = (
        current_df
        .withColumnRenamed("txn_count", "left_txn_count")
        .withColumnRenamed("avg_weight", "left_avg_weight")
    )
    incoming = (
        batch_stats
        .withColumnRenamed("new_txn_count", "right_txn_count")
        .withColumnRenamed("new_avg_weight", "right_avg_weight")
    )

    combined_count = col("left_txn_count") + col("right_txn_count")
    weighted_sum = (
        (col("left_avg_weight") * col("left_txn_count"))
        + (col("right_avg_weight") * col("right_txn_count"))
    )
    # Guard the divisor so a pair with zero transactions on both sides
    # divides by 1 instead of 0.
    safe_divisor = when(combined_count == 0, 1).otherwise(combined_count)

    return (
        previous.join(incoming, on=["customer", "merchant"], how="outer")
        .fillna(0)
        .withColumn("txn_count", combined_count)
        .withColumn("avg_weight", weighted_sum / safe_divisor)
        .select("customer", "merchant", "txn_count", "avg_weight")
    )


def _show_upgrade_candidates(merchant_counts_df, customer_stats_df, min_total_txn=50000):
    """Show "PatId1"/"UPGRADE" detections for every sufficiently busy merchant.

    For each merchant with at least ``min_total_txn`` transactions, shows the
    customers whose ``txn_count`` is at or above the merchant's approximate
    90th percentile and whose ``avg_weight`` is at or below its approximate
    10th percentile.

    Args:
        merchant_counts_df: Columns ``merchant`` and ``total_txn``.
        customer_stats_df: Columns ``customer``, ``merchant``, ``txn_count``
            and ``avg_weight``.
        min_total_txn: Minimum ``total_txn`` for a merchant to be evaluated.
    """
    # Filter in Spark so only qualifying merchant ids reach the driver.
    busy_merchants = (
        merchant_counts_df
        .filter(col("total_txn") >= min_total_txn)
        .select("merchant")
        .collect()
    )

    for row in busy_merchants:
        subset = customer_stats_df.filter(col("merchant") == row["merchant"])

        if subset.isEmpty():
            continue

        txn_quantiles = subset.approxQuantile("txn_count", [0.9], 0.01)
        weight_quantiles = subset.approxQuantile("avg_weight", [0.1], 0.01)
        # approxQuantile returns an empty list when the column holds only
        # nulls; there is no threshold to compare against in that case.
        if not txn_quantiles or not weight_quantiles:
            continue

        txn_thresh = txn_quantiles[0]
        weight_thresh = weight_quantiles[0]

        eligible = (
            subset
            .filter((col("txn_count") >= txn_thresh) & (col("avg_weight") <= weight_thresh))
            .withColumn("YStartTime", F.current_timestamp())
            .withColumn("detectionTime", F.current_timestamp())
            .withColumn("patternId", lit("PatId1"))
            .withColumn("ActionType", lit("UPGRADE"))
            .withColumnRenamed("customer", "customerName")
            .withColumnRenamed("merchant", "MerchantId")
            .select("YStartTime", "detectionTime", "patternId", "ActionType", "customerName", "MerchantId")
        )
        eligible.show(truncate=False)


def foreach_batch_function(batch_df, batch_id):
    """Process one micro-batch: update running stats and show detections.

    Joins the batch with the static ``customer_df`` reference data, folds the
    matching rows into the module-level ``merchant_transaction_count_df`` and
    ``customer_merchant_stats_df``, then shows the pattern-1 detections.
    Returns early, leaving the running state untouched, when no row of the
    batch matches the reference data.

    Args:
        batch_df: The micro-batch DataFrame handed over by ``foreachBatch``.
        batch_id: The micro-batch identifier handed over by ``foreachBatch``.
    """
    global merchant_transaction_count_df
    global customer_merchant_stats_df

    print(f"⚡ Processing batch {batch_id} rows: {batch_df.count()}")

    # Keep only transactions that match a reference row on all four fields.
    join_condition = (
        (batch_df["customer"] == customer_df["Source"])
        & (batch_df["merchant"] == customer_df["Target"])
        & (batch_df["category"] == customer_df["typeTrans"])
        & (batch_df["amount"] == customer_df["Weight"])
    )
    # merged_df feeds several actions (isEmpty plus two aggregations);
    # persist it so the join is not recomputed for each of them.
    merged_df = batch_df.join(customer_df, join_condition, "inner").persist()

    try:
        if merged_df.isEmpty():
            print("🚫 No matching transactions for this batch.")
            return

        # localCheckpoint() materializes the new state and truncates its
        # lineage. Without it the state's plan would reference every earlier
        # batch and each action would re-execute all of them.
        # NOTE: local checkpoints live on the executors and are not
        # fault tolerant.
        merchant_transaction_count_df = _merge_merchant_counts(
            merchant_transaction_count_df, merged_df
        ).localCheckpoint()
        print(f" Updated merchant_transaction_count_df count: {merchant_transaction_count_df.count()}")

        customer_merchant_stats_df = _merge_customer_stats(
            customer_merchant_stats_df, merged_df
        ).localCheckpoint()
        print(f" Updated customer_merchant_stats_df count: {customer_merchant_stats_df.count()}")
    finally:
        # Both state frames are materialized by now (or the batch is being
        # abandoned), so the cached join result is no longer needed.
        merged_df.unpersist()

    _show_upgrade_candidates(merchant_transaction_count_df, customer_merchant_stats_df)

    print(" Pattern 1 batch done!")

# -----------------------------------
# Start Streaming Query
# -----------------------------------
# Hand every micro-batch of the transaction stream to foreach_batch_function.
query = transactions_df.writeStream.foreachBatch(foreach_batch_function).start()

# Block the caller until the streaming query stops or fails.
query.awaitTermination()
