In [1]:
import pandas as pd
import re
import os
from datetime import datetime

# -----------------------------
# 🧠 Simulated PostgreSQL Table
# -----------------------------
# Running per-(customer, merchant) totals, standing in for a PostgreSQL table.
# The numeric columns get an explicit float64 dtype: an untyped empty frame
# makes every column `object`, and filling the all-NaN columns produced by the
# first outer merge then relies on pandas' deprecated object downcasting.
customer_merchant_amount_df = pd.DataFrame({
    "customer": pd.Series(dtype="object"),
    "merchant": pd.Series(dtype="object"),
    "txn_count": pd.Series(dtype="float64"),
    "total_amount": pd.Series(dtype="float64"),
})

# Optional: Set to track already detected customer-merchant pairs
already_detected = set()

# -----------------------------
# 🔄 Update State from a Chunk
# -----------------------------
def update_postgres_sim(chunk_df):
    """Fold one chunk of transactions into the running customer-merchant totals.

    The chunk is aggregated per (customer, merchant) pair and outer-merged with
    the module-level ``customer_merchant_amount_df``, which is then rebound to
    the combined result.

    Args:
        chunk_df: DataFrame with ``customer``, ``merchant`` and ``amount``
            columns.

    Returns:
        None. The result is stored in the global ``customer_merchant_amount_df``
        with exactly the columns ``customer``, ``merchant``, ``txn_count`` and
        ``total_amount``; the two numeric columns are always float64.
    """
    global customer_merchant_amount_df

    key_cols = ['customer', 'merchant']
    value_cols = ['txn_count', 'total_amount']

    stats = chunk_df.groupby(key_cols).agg(
        txn_count=('amount', 'count'),
        total_amount=('amount', 'sum')
    ).reset_index()

    # Merge only the state columns we own, so any derived column that was
    # added to the shared frame elsewhere is not dragged through the merge.
    merged = pd.merge(
        customer_merchant_amount_df[key_cols + value_cols],
        stats,
        on=key_cols,
        how='outer',
        suffixes=('_prev', '_new'),
    )

    # A pair present on only one side has NaN on the other side. Fill just the
    # numeric columns (after coercing to float) instead of calling fillna(0)
    # on the whole frame: that would also rewrite missing keys to 0 and, for
    # the initially untyped empty state, depends on object-dtype downcasting.
    for col in value_cols:
        previous = merged.pop(f'{col}_prev').astype(float).fillna(0.0)
        incoming = merged.pop(f'{col}_new').astype(float).fillna(0.0)
        merged[col] = previous + incoming

    customer_merchant_amount_df = merged[key_cols + value_cols]

# -----------------------------
# 🔍 Detect Pattern 2
# -----------------------------
def detect_pattern2(min_txn_count=80, max_avg_amount=23):
    """Report customer-merchant pairs with many transactions of low average value.

    Reads the module-level ``customer_merchant_amount_df`` and flags every pair
    whose ``txn_count`` is at least ``min_txn_count`` and whose average amount
    (``total_amount / txn_count``) is strictly below ``max_avg_amount``. Each
    pair is reported at most once: reported keys are remembered in the
    module-level ``already_detected`` set.

    Side effects:
        Writes an ``avg_amount`` column onto the shared state frame (so later
        prints of that frame include it), adds reported pairs to
        ``already_detected`` and prints every new detection.

    Args:
        min_txn_count: Inclusive lower bound on the transaction count.
        max_avg_amount: Exclusive upper bound on the average amount.

    Returns:
        A DataFrame with one row per new detection (columns ``YStartTime``,
        ``detectionTime``, ``patternId``, ``ActionType``, ``customerName``,
        ``MerchantId``, ``txn_count``); an empty DataFrame when nothing new
        matched.
    """
    customer_merchant_amount_df['avg_amount'] = (
        customer_merchant_amount_df['total_amount'] / customer_merchant_amount_df['txn_count']
    )

    # Apply pattern logic
    is_candidate = (
        (customer_merchant_amount_df['txn_count'] >= min_txn_count) &
        (customer_merchant_amount_df['avg_amount'] < max_avg_amount)
    )
    candidates = customer_merchant_amount_df.loc[
        is_candidate, ['customer', 'merchant', 'txn_count']
    ]

    # One timestamp for the whole pass; it is used for both time fields.
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    detections = []
    # Plain tuples are enough here: only three fields per row are needed.
    for customer, merchant, txn_count in candidates.itertuples(index=False, name=None):
        key = (customer, merchant)
        if key in already_detected:
            continue  # Avoid duplicates

        detection = {
            "YStartTime": now,
            "detectionTime": now,
            "patternId": "PatId2",
            "ActionType": "CHILD",
            "customerName": customer,
            "MerchantId": merchant,
            "txn_count": int(txn_count),
        }
        detections.append(detection)
        already_detected.add(key)
        print("detection found")
        print(detection)
    return pd.DataFrame(detections)

# -----------------------------
# 💾 Write Detections
# -----------------------------
def write_detections(detections, output_dir="outputs/", file_prefix="detections_pat2", batch_size=50):
    """Write detections to CSV files of at most ``batch_size`` rows each.

    Args:
        detections: DataFrame of detections; nothing is written (and the
            output directory is not created) when it is empty.
        output_dir: Directory for the CSV files; created if missing.
        file_prefix: Leading part of every file name.
        batch_size: Maximum number of rows per file.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if detections.empty:
        return
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    os.makedirs(output_dir, exist_ok=True)

    for batch_no, start in enumerate(range(0, len(detections), batch_size)):
        batch = detections.iloc[start:start + batch_size]
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        # The batch index keeps names unique within one call even if the clock
        # returns the same timestamp for two consecutive batches, which would
        # otherwise overwrite the earlier file. os.path.join avoids the doubled
        # separator that "outputs/" + "/" produced.
        output_path = os.path.join(
            output_dir, f"{file_prefix}_{timestamp}_{batch_no:03d}.csv"
        )
        batch.to_csv(output_path, index=False)
        print(f"✅ Wrote {len(batch)} detections to {output_path}")
def chunk_key(filename):
    """Sort key for chunk files: the number found in ``chunk_<number>_``.

    Names without that pattern get ``float('inf')`` so they sort after every
    numbered chunk.
    """
    found = re.search(r"chunk_(\d+)_", filename)
    if not found:
        return float('inf')
    return int(found.group(1))

# -----------------------------
# 🚀 Main Function
# -----------------------------
def main(chunk_folder=r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\Chunks"):
    """Process every chunk CSV in ``chunk_folder`` in chunk-number order.

    For each file the running totals are updated, pattern 2 is evaluated
    against the full accumulated state, and any new detections are written
    out.

    Args:
        chunk_folder: Directory holding the chunk CSV files. Defaults to the
            location this notebook was originally run against; pass another
            path to run elsewhere.
    """
    # Only regular *.csv files are chunks; anything else in the folder
    # (sub-directories, stray files) would make read_csv fail or feed
    # unrelated data into the totals.
    chunk_files = sorted(
        (
            name for name in os.listdir(chunk_folder)
            if name.lower().endswith(".csv")
            and os.path.isfile(os.path.join(chunk_folder, name))
        ),
        key=chunk_key,
    )

    for file in chunk_files:
        print(f"📦 Processing: {file}")
        chunk_df = pd.read_csv(os.path.join(chunk_folder, file))

        chunk_df['amount'] = chunk_df['amount'].astype(float)

        update_postgres_sim(chunk_df)         # Update rolling stats
        detections_df = detect_pattern2()     # Detect based on full state
        write_detections(detections_df)       # Write in batches

# -----------------------------
# 🔁 Entry Point
# -----------------------------
# Start the full chunk-processing run when this code executes as the main
# module (it does not run when the code is imported).
if __name__ == "__main__":
    main()


📦 Processing: chunk_1_20250624_202535.csv
📦 Processing: chunk_2_20250624_202537.csv
📦 Processing: chunk_3_20250624_202539.csv
📦 Processing: chunk_4_20250624_202540.csv
📦 Processing: chunk_5_20250624_202542.csv
📦 Processing: chunk_6_20250624_202544.csv


  ).fillna(0)


📦 Processing: chunk_7_20250624_202546.csv
📦 Processing: chunk_8_20250624_202548.csv
📦 Processing: chunk_9_20250624_202550.csv
📦 Processing: chunk_10_20250624_202551.csv
📦 Processing: chunk_11_20250624_202553.csv
📦 Processing: chunk_12_20250624_202555.csv
📦 Processing: chunk_13_20250624_202557.csv
📦 Processing: chunk_14_20250624_202559.csv
📦 Processing: chunk_15_20250624_202601.csv
📦 Processing: chunk_16_20250624_202602.csv
📦 Processing: chunk_17_20250624_202608.csv
📦 Processing: chunk_18_20250624_202610.csv
📦 Processing: chunk_19_20250624_202612.csv
📦 Processing: chunk_20_20250624_202614.csv
📦 Processing: chunk_21_20250624_202616.csv
📦 Processing: chunk_22_20250624_202618.csv
📦 Processing: chunk_23_20250624_202619.csv
📦 Processing: chunk_24_20250624_202621.csv
📦 Processing: chunk_25_20250624_202623.csv
📦 Processing: chunk_26_20250624_202625.csv
📦 Processing: chunk_27_20250624_202627.csv
📦 Processing: chunk_28_20250624_202629.csv
detection found
{'YStartTime': '2025-07-05 23:16:23', 'de

In [2]:
# Show the accumulated pair statistics, highest transaction counts first.
print(customer_merchant_amount_df.sort_values(by="txn_count", ascending=False))


            customer       merchant  txn_count  total_amount  avg_amount
15852  'C1853381059'   'M348934600'      105.0       2762.00   26.304762
8405   'C1450562088'   'M348934600'      104.0       2807.74   26.997500
22794   'C300112558'   'M348934600'      103.0       2704.31   26.255437
7982   'C1424475955'   'M348934600'      103.0       2757.37   26.770583
20683  'C2110532720'   'M348934600'      102.0       3115.04   30.539608
...              ...            ...        ...           ...         ...
16      'C100045114'   'M547558035'        1.0        125.87  125.870000
11      'C100045114'  'M1873032707'        1.0         39.86   39.860000
10      'C100045114'  'M1649169323'        1.0         53.13   53.130000
9       'C100045114'  'M1535107174'        1.0        130.24  130.240000
5      'C1000148617'   'M840466850'        1.0         14.74   14.740000

[35160 rows x 5 columns]
