In [0]:
from pyspark.sql import SparkSession, functions as F
from datetime import datetime



# Merchant values that have already produced a detection, so that later
# micro-batches do not report the same merchant again.
already_detected_pat3 = set()

# Running Spark DataFrame of gender rows accumulated across micro-batches;
# stays None until the first batch has been processed.
gender_summary_df = None


In [0]:
# Auto Loader ("cloudFiles") streaming read of CSV files from the transactions
# folder; the schema-tracking location is passed alongside the CSV options.
streaming_df = spark.readStream.format("cloudFiles").options(
    **{
        "cloudFiles.format": "csv",
        "header": "true",
        "cloudFiles.schemaLocation": "abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net/schema/ChunksSchema/",
    }
).load("abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net/transactions/")

In [0]:
def foreach_batch_function(batch_df, batch_id):
    """Fold one micro-batch into the running gender summary and report new detections.

    Steps performed for every micro-batch:

    1. Normalise ``gender`` (strip single quotes, trim, upper-case), keep the
       ``merchant`` / ``customer`` / ``gender`` columns, drop rows containing
       nulls and drop duplicate triples.
    2. Merge the result into the module-level ``gender_summary_df``.
    3. Count distinct customers per merchant and gender, pivoted into
       ``female_count`` / ``male_count`` (only the values "F" and "M" are pivoted).
    4. Select merchants with more than 100 female customers and fewer female
       than male customers that are not yet in ``already_detected_pat3``.
    5. Append one row per newly detected merchant to the JDBC table
       ``already_detected_pat3`` and remember the merchant in the in-memory set.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; only used for logging here.

    Notes:
        * ``jdbc_url`` and ``jdbc_properties`` are read from the notebook's
          global scope and must be defined before the first detection is
          written.
        * Both state objects live in driver memory only; after a kernel or
          cluster restart they start empty, so merchants reported earlier may
          be reported again.
    """
    global gender_summary_df, already_detected_pat3

    print(f"\n⚡ Processing batch {batch_id} rows: {batch_df.count()}")

    # Clean gender column
    cleaned = (
        batch_df.withColumn("gender", F.upper(F.trim(F.regexp_replace("gender", "'", ""))))
        .select("merchant", "customer", "gender")
        .dropna()
        .dropDuplicates(["merchant", "customer", "gender"])
    )

    # Merge the batch into the running state (first batch simply becomes the state).
    previous_summary = gender_summary_df
    if previous_summary is None:
        updated_summary = cleaned
    else:
        updated_summary = previous_summary.union(cleaned).dropDuplicates(["merchant", "customer", "gender"])

    # Cache the merged state: it is reused by several actions below and by every
    # later batch. Without caching, each action would re-execute the union of
    # all earlier batches from their sources.
    gender_summary_df = updated_summary.persist()

    # count() is an action over every partition, so it also materialises the cache.
    print(f"✅ Updated gender_summary_df size: {gender_summary_df.count()}")

    # The new snapshot is materialised, so the previous one can be released.
    if previous_summary is not None:
        previous_summary.unpersist()

    # Group & Pivot
    gender_counts = (
        gender_summary_df
        .groupBy("merchant", "gender")
        .agg(F.countDistinct("customer").alias("customer_count"))
    )

    pivot = (
        gender_counts
        .groupBy("merchant")
        .pivot("gender", ["F", "M"])
        .sum("customer_count")
        .fillna(0)  # merchants with no customers of one gender get 0 instead of null
        .withColumnRenamed("F", "female_count")
        .withColumnRenamed("M", "male_count")
    )

    pivot.show(5, truncate=False)

    # Filter: more than 100 female customers, yet fewer female than male customers
    eligible = pivot.filter(
        (F.col("female_count") > 100) & (F.col("female_count") < F.col("male_count"))
    )

    # New merchants only
    new_detections = eligible.filter(
        ~F.col("merchant").isin(already_detected_pat3)
    )

    merchants_detected = [row["merchant"] for row in new_detections.collect()]

    if merchants_detected:
        print(f"🌟 New detections found: {merchants_detected}")

        # YStartTime and detectionTime are both set to the current timestamp;
        # MerchantId is the merchant value with single quotes removed.
        detections_df = (
            new_detections
            .withColumn("YStartTime", F.current_timestamp())
            .withColumn("detectionTime", F.current_timestamp())
            .withColumn("patternId", F.lit("PatId3"))
            .withColumn("ActionType", F.lit("DEI-NEEDED"))
            .withColumn("customerName", F.lit(""))
            .withColumn("MerchantId", F.regexp_replace("merchant", "'", ""))
            .select("YStartTime", "detectionTime", "patternId", "ActionType", "customerName", "MerchantId")
        )

        detections_df.show(truncate=False)

        detections_df.write.jdbc(jdbc_url, "already_detected_pat3", mode="append", properties=jdbc_properties)

        # Record the merchants only after the write succeeded, so a failed write
        # does not mark them as already reported.
        already_detected_pat3.update(merchants_detected)

    else:
        print("🚫 No new detections this batch.")




In [0]:
# Start the streaming query: every micro-batch of streaming_df is handed to
# foreach_batch_function, then this cell blocks until the query terminates.
# NOTE(review): no checkpointLocation option is set on the writer — confirm
# whether one is configured elsewhere (e.g. via Spark conf).
query = streaming_df.writeStream.outputMode("append").foreachBatch(foreach_batch_function).start()
query.awaitTermination()