In [11]:
import pandas as pd
import os
from datetime import datetime

# -----------------------------
# 🧠 Simulated PostgreSQL Table
# -----------------------------
customer_merchant_amount_df = pd.DataFrame(columns=[
    "customer", "merchant", "txn_count", "total_amount"
])

# Optional: Set to track already detected customer-merchant pairs
already_detected = set()

# -----------------------------
# 🔄 Update State from a Chunk
# -----------------------------
def update_postgres_sim(chunk_df):
    global customer_merchant_amount_df

    stats = chunk_df.groupby(['customer', 'merchant']).agg(
        txn_count=('amount', 'count'),
        total_amount=('amount', 'sum')
    ).reset_index()

    customer_merchant_amount_df_combined = pd.merge(
        customer_merchant_amount_df,
        stats,
        on=['customer', 'merchant'],
        how='outer'
    ).fillna(0)

    customer_merchant_amount_df_combined['txn_count'] = (
        customer_merchant_amount_df_combined['txn_count_x'] + customer_merchant_amount_df_combined['txn_count_y']
    )

    customer_merchant_amount_df_combined['total_amount'] = (
        customer_merchant_amount_df_combined['total_amount_x'] + customer_merchant_amount_df_combined['total_amount_y']
    )

    customer_merchant_amount_df = customer_merchant_amount_df_combined[
        ['customer', 'merchant', 'txn_count', 'total_amount']
    ]

# -----------------------------
# 🔍 Detect Pattern 2
# -----------------------------
def detect_pattern2():
    detections = []

    customer_merchant_amount_df['avg_amount'] = (
        customer_merchant_amount_df['total_amount'] / customer_merchant_amount_df['txn_count']
    )

    # Apply pattern logic
    filtered = customer_merchant_amount_df[
        (customer_merchant_amount_df['txn_count'] >= 80) &
        (customer_merchant_amount_df['avg_amount'] < 23)
    ]

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for _, row in filtered.iterrows():
        key = (row['customer'], row['merchant'])
        if key in already_detected:
            continue  # Avoid duplicates

        detection = {
            "YStartTime": now,
            "detectionTime": now,
            "patternId": "PatId2",
            "ActionType": "CHILD",
            "customerName": row['customer'],
            "MerchantId": row['merchant'],
            "txn_count": int(row['txn_count']),
        }
        detections.append(detection)
        already_detected.add(key)

    return pd.DataFrame(detections)

# -----------------------------
# 💾 Write Detections
# -----------------------------
def write_detections(detections, output_dir="outputs/", file_prefix="detections_pat2"):
    if detections.empty:
        return

    os.makedirs(output_dir, exist_ok=True)

    for i in range(0, len(detections), 50):
        batch = detections.iloc[i:i+50]
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        output_path = f"{output_dir}/{file_prefix}_{timestamp}.csv"
        batch.to_csv(output_path, index=False)
        print(f"✅ Wrote {len(batch)} detections to {output_path}")

# -----------------------------
# 🚀 Main Function
# -----------------------------
def main():
    chunk_folder = r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\Chunks"
    all_files = sorted(os.listdir(chunk_folder))

    for file in all_files:
        print(f"📦 Processing: {file}")
        chunk_df = pd.read_csv(os.path.join(chunk_folder, file))

        chunk_df['amount'] = chunk_df['amount'].astype(float)

        update_postgres_sim(chunk_df)         # Update rolling stats
        detections_df = detect_pattern2()     # Detect based on full state
        write_detections(detections_df)       # Write in batches

# -----------------------------
# 🔁 Entry Point
# -----------------------------
if __name__ == "__main__":
    main()


📦 Processing: chunk_10_20250624_202551.csv
📦 Processing: chunk_11_20250624_202553.csv
📦 Processing: chunk_1_20250624_202535.csv
📦 Processing: chunk_2_20250624_202537.csv
📦 Processing: chunk_3_20250624_202539.csv
📦 Processing: chunk_4_20250624_202540.csv
📦 Processing: chunk_5_20250624_202542.csv
📦 Processing: chunk_6_20250624_202544.csv
📦 Processing: chunk_7_20250624_202546.csv
📦 Processing: chunk_8_20250624_202548.csv


  ).fillna(0)


📦 Processing: chunk_9_20250624_202550.csv


In [12]:
print(customer_merchant_amount_df.sort_values("txn_count", ascending=False))


            customer       merchant  txn_count  total_amount  avg_amount
5373   'C1598564853'   'M348934600'       40.0        924.00   23.100000
5688    'C164104645'  'M1823072687'       39.0       1098.08   28.155897
777    'C1087943403'  'M1823072687'       39.0       1013.65   25.991026
1295   'C1145304322'   'M348934600'       39.0        986.51   25.295128
10223  'C2144163136'   'M348934600'       39.0       1160.90   29.766667
...              ...            ...        ...           ...         ...
16     'C1002658784'   'M855959430'        1.0         47.58   47.580000
15     'C1002658784'   'M348934600'        1.0         18.09   18.090000
14     'C1002658784'  'M1888755466'        1.0         71.71   71.710000
13     'C1002658784'  'M1741626453'        1.0        197.30  197.300000
11     'C1001065306'    'M50039827'        1.0        188.94  188.940000

[16828 rows x 5 columns]
