In [1]:
import pandas as pd
import os
from datetime import datetime

# -----------------------------
# 🧠 Simulated Postgres Tables
# -----------------------------
merchant_transaction_count_df = pd.DataFrame(columns=["merchant", "total_txn"])
customer_merchant_stats_df = pd.DataFrame(columns=["customer", "merchant", "txn_count", "avg_weight"])

# -----------------------------
# 📥 Load Customer Importance
# -----------------------------
customer_df = pd.read_csv(r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\customer data\customer data.csv")

# -----------------------------
# 🔄 Process One Chunk
# -----------------------------
def process_chunk(chunk_df):
    global merchant_transaction_count_df
    global customer_merchant_stats_df

    # Step 1: Merge with customer importance to get Weight
    merged_df = chunk_df.merge(
        customer_df,
        how='inner',
        left_on=['customer', 'merchant', 'category', 'amount'],
        right_on=['Source', 'Target', 'typeTrans', 'Weight']
    )

    # Step 2: Update merchant-level transaction count
    merchant_txn = merged_df.groupby('merchant').size().reset_index(name='new_txn_count')
    merchant_transaction_count_df = pd.merge(
        merchant_transaction_count_df,
        merchant_txn,
        on='merchant',
        how='outer'
    ).fillna(0)
    merchant_transaction_count_df['total_txn'] = merchant_transaction_count_df['total_txn'] + merchant_transaction_count_df['new_txn_count']
    merchant_transaction_count_df.drop(columns=['new_txn_count'], inplace=True)

    # Step 3: Update customer-merchant level stats (txn_count + avg_weight)
    cust_merchant_stats = merged_df.groupby(['customer', 'merchant']).agg(
        txn_count=('step', 'count'),
        avg_weight=('Weight', 'mean')
    ).reset_index()

    customer_merchant_stats_df = pd.merge(
        customer_merchant_stats_df,
        cust_merchant_stats,
        on=['customer', 'merchant'],
        how='outer'
    ).fillna(0)

    customer_merchant_stats_df['txn_count'] = customer_merchant_stats_df['txn_count_x'] + customer_merchant_stats_df['txn_count_y']
    customer_merchant_stats_df['avg_weight'] = (
        (customer_merchant_stats_df['avg_weight_x'] * customer_merchant_stats_df['txn_count_x']) +
        (customer_merchant_stats_df['avg_weight_y'] * customer_merchant_stats_df['txn_count_y'])
    ) / customer_merchant_stats_df['txn_count']

    customer_merchant_stats_df = customer_merchant_stats_df[['customer', 'merchant', 'txn_count', 'avg_weight']]

# -----------------------------
# 🔍 Pattern 1 Detection Logic
# -----------------------------
def detect_patid1():
    detections = []

    for merchant in merchant_transaction_count_df.itertuples():
        if merchant.total_txn < 50000:
            continue

        merchant_name = merchant.merchant
        cust_subset = customer_merchant_stats_df[
            customer_merchant_stats_df['merchant'] == merchant_name
        ]

        if cust_subset.empty:
            continue

        txn_threshold = cust_subset['txn_count'].quantile(0.90)
        weight_threshold = cust_subset['avg_weight'].quantile(0.10)

        eligible = cust_subset[
            (cust_subset['txn_count'] >= txn_threshold) &
            (cust_subset['avg_weight'] <= weight_threshold)
        ]

        for row in eligible.itertuples():
            detections.append({
                "YStartTime": "",  # optional if you want to track start of ingestion
                "detectionTime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                "patternId": "PatId1",
                "ActionType": "UPGRADE",
                "customerName": row.customer,
                "MerchantId": row.merchant
            })

    return detections

# -----------------------------
# 💾 Write Detections (50 per file)
# -----------------------------
def write_detections(detections, output_dir="outputs/", file_prefix="detections_pat1"):
    if not detections:
        return

    os.makedirs(output_dir, exist_ok=True)

    for i in range(0, len(detections), 50):
        batch = detections[i:i+50]
        df = pd.DataFrame(batch)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        df.to_csv(f"{output_dir}/{file_prefix}_{timestamp}.csv", index=False)

# -----------------------------
# 🚀 Main Loop
# -----------------------------
def main():
    # Use raw string (r"") to avoid escape issues
    chunk_folder = r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\Chunks"
    all_files = sorted(os.listdir(chunk_folder))

    for file in all_files:
        print(f"Processing: {file}")
        chunk_df = pd.read_csv(os.path.join(chunk_folder, file))

        # Ensure numeric types are correct
        chunk_df['amount'] = chunk_df['amount'].astype(float)

        process_chunk(chunk_df)
        detections = detect_patid1()
        write_detections(detections)

# -----------------------------
# 📌 Entry Point
# -----------------------------
if __name__ == "__main__":
    main()


Processing: chunk_10_20250624_202551.csv


  ).fillna(0)
  ).fillna(0)


Processing: chunk_11_20250624_202553.csv
Processing: chunk_1_20250624_202535.csv
Processing: chunk_2_20250624_202537.csv
Processing: chunk_3_20250624_202539.csv
Processing: chunk_4_20250624_202540.csv
Processing: chunk_5_20250624_202542.csv
Processing: chunk_6_20250624_202544.csv
Processing: chunk_7_20250624_202546.csv
Processing: chunk_8_20250624_202548.csv
Processing: chunk_9_20250624_202550.csv
