In [22]:
import pandas as pd
import os
from datetime import datetime

# -----------------------------
# 🧠 Simulated PostgreSQL-like Tables
# -----------------------------
merchant_transaction_count_df = pd.DataFrame(columns=["merchant", "total_txn"])
customer_merchant_stats_df = pd.DataFrame(columns=["customer", "merchant", "txn_count", "avg_weight"])
customer_merchant_amount_df = pd.DataFrame(columns=["customer", "merchant", "txn_count", "total_amount"])

already_detected_pat2 = set()
already_detected_pat3 = set()

# -----------------------------
# 📥 Load Customer Importance
# -----------------------------
customer_df = pd.read_csv(r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\customer data\customer data.csv")

# -----------------------------
# 🔄 Process One Chunk (State Updates)
# -----------------------------
def process_chunk(chunk_df):
    global merchant_transaction_count_df
    global customer_merchant_stats_df
    global customer_merchant_amount_df

    # Step 1: Join with customer importance
    merged_df = chunk_df.merge(
        customer_df,
        how='inner',
        left_on=['customer', 'merchant', 'category', 'amount'],
        right_on=['Source', 'Target', 'typeTrans', 'Weight']
    )

    # Step 2: Update merchant transaction count
    merchant_txn = merged_df.groupby('merchant').size().reset_index(name='new_txn_count')

    if merchant_transaction_count_df.empty:
        merchant_transaction_count_df = merchant_txn.rename(columns={'new_txn_count': 'total_txn'}).copy()
    else:
        merchant_transaction_count_df = pd.merge(
            merchant_transaction_count_df,
            merchant_txn,
            on='merchant',
            how='outer'
        ).fillna(0)
        merchant_transaction_count_df['total_txn'] += merchant_transaction_count_df['new_txn_count']
        merchant_transaction_count_df.drop(columns=['new_txn_count'], inplace=True)

    # Step 3: Update Pattern 1 stats
    cust_merchant_stats = merged_df.groupby(['customer', 'merchant']).agg(
        txn_count=('step', 'count'),
        avg_weight=('Weight', 'mean')
    ).reset_index()

    if customer_merchant_stats_df.empty:
        customer_merchant_stats_df = cust_merchant_stats.copy()
    else:
        customer_merchant_stats_df = pd.merge(
            customer_merchant_stats_df,
            cust_merchant_stats,
            on=['customer', 'merchant'],
            how='outer'
        ).fillna(0)

        customer_merchant_stats_df['txn_count'] = (
            customer_merchant_stats_df['txn_count_x'] + customer_merchant_stats_df['txn_count_y']
        )
        customer_merchant_stats_df['avg_weight'] = (
            (customer_merchant_stats_df['avg_weight_x'] * customer_merchant_stats_df['txn_count_x']) +
            (customer_merchant_stats_df['avg_weight_y'] * customer_merchant_stats_df['txn_count_y'])
        ) / customer_merchant_stats_df['txn_count']

        customer_merchant_stats_df = customer_merchant_stats_df[[
            'customer', 'merchant', 'txn_count', 'avg_weight'
        ]].copy()

    # Step 4: Update Pattern 2 stats
    cust_merchant_amount_stats = chunk_df.groupby(['customer', 'merchant']).agg(
        txn_count=('amount', 'count'),
        total_amount=('amount', 'sum')
    ).reset_index()

    if customer_merchant_amount_df.empty:
        customer_merchant_amount_df = cust_merchant_amount_stats.copy()
    else:
        combined = pd.merge(
            customer_merchant_amount_df,
            cust_merchant_amount_stats,
            on=['customer', 'merchant'],
            how='outer'
        ).fillna(0)

        combined['txn_count'] = combined['txn_count_x'] + combined['txn_count_y']
        combined['total_amount'] = combined['total_amount_x'] + combined['total_amount_y']

        customer_merchant_amount_df = combined[[
            'customer', 'merchant', 'txn_count', 'total_amount'
        ]].copy()

# -----------------------------
# 🔍 Pattern 1
# -----------------------------
def detect_patid1():
    detections = []
    for merchant in merchant_transaction_count_df.itertuples():
        if merchant.total_txn < 50000:
            continue

        merchant_name = merchant.merchant
        cust_subset = customer_merchant_stats_df[
            customer_merchant_stats_df['merchant'] == merchant_name
        ]

        if cust_subset.empty:
            continue

        txn_threshold = cust_subset['txn_count'].quantile(0.90)
        weight_threshold = cust_subset['avg_weight'].quantile(0.10)

        eligible = cust_subset[
            (cust_subset['txn_count'] >= txn_threshold) &
            (cust_subset['avg_weight'] <= weight_threshold)
        ]

        for row in eligible.itertuples():
            detections.append({
                "YStartTime": "",
                "detectionTime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                "patternId": "PatId1",
                "ActionType": "UPGRADE",
                "customerName": row.customer,
                "MerchantId": row.merchant
            })

    return pd.DataFrame(detections)

# -----------------------------
# 🔍 Pattern 2
# -----------------------------
def detect_patid2():
    detections = []
    customer_merchant_amount_df['avg_amount'] = (
        customer_merchant_amount_df['total_amount'] / customer_merchant_amount_df['txn_count']
    )

    filtered = customer_merchant_amount_df[
        (customer_merchant_amount_df['txn_count'] >= 80) &
        (customer_merchant_amount_df['avg_amount'] < 23)
    ]

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for _, row in filtered.iterrows():
        key = (row['customer'], row['merchant'])
        if key in already_detected_pat2:
            continue

        detections.append({
            "YStartTime": now,
            "detectionTime": now,
            "patternId": "PatId2",
            "ActionType": "CHILD",
            "customerName": row['customer'],
            "MerchantId": row['merchant'],
            "txn_count": int(row['txn_count']),
        })
        already_detected_pat2.add(key)

    return pd.DataFrame(detections)

# -----------------------------
# 🔍 Pattern 3
# -----------------------------
def detect_patid3(transactions_df):
    detections = []

    filtered_df = transactions_df.dropna(subset=['merchant', 'gender'])
    filtered_df['gender'] = filtered_df['gender'].astype(str).str.strip().str.upper()

    gender_counts = filtered_df.groupby(['merchant', 'gender'])['customer'].nunique().reset_index(name='count')

    pivot_df = gender_counts.pivot(index='merchant', columns='gender', values='count').fillna(0).reset_index()
    pivot_df.columns.name = None

    if 'F' not in pivot_df.columns:
        pivot_df['F'] = 0
    if 'M' not in pivot_df.columns:
        pivot_df['M'] = 0

    eligible = pivot_df[(pivot_df['F'] > 100) & (pivot_df['F'] < pivot_df['M'])]

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for _, row in eligible.iterrows():
        merchant_id = row['merchant']
        if merchant_id in already_detected_pat3:
            continue

        detections.append({
            "YStartTime": now,
            "detectionTime": now,
            "patternId": "PatId3",
            "ActionType": "DEI-NEEDED",
            "customerName": "",
            "MerchantId": merchant_id
        })
        already_detected_pat3.add(merchant_id)

    return pd.DataFrame(detections)

# -----------------------------
# 💾 Write Detections
# -----------------------------
def write_detections(detections, output_dir="outputs/", file_prefix="detections"):
    if detections.empty:
        return

    os.makedirs(output_dir, exist_ok=True)
    for i in range(0, len(detections), 50):
        batch = detections.iloc[i:i+50]
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        output_path = f"{output_dir}/{file_prefix}_{timestamp}.csv"
        batch.to_csv(output_path, index=False)
        print(f"✅ Wrote {len(batch)} detections to {output_path}")

# -----------------------------
# 🚀 Main
# -----------------------------
def main():
    chunk_folder = r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\Chunks"
    all_files = sorted(os.listdir(chunk_folder))

    for file in all_files:
        print(f"\n📦 Processing: {file}")
        chunk_df = pd.read_csv(os.path.join(chunk_folder, file))
        chunk_df['amount'] = chunk_df['amount'].astype(float)
        chunk_df['gender'] = chunk_df['gender'].astype(str).str.strip().str.upper()

        process_chunk(chunk_df)

        detections1 = detect_patid1()
        detections2 = detect_patid2()
        detections3 = detect_patid3(chunk_df)

        write_detections(detections1, file_prefix="detections_pat1")
        write_detections(detections2, file_prefix="detections_pat2")
        write_detections(detections3, file_prefix="detections_pat3")

# -----------------------------
# 🔁 Entry Point
# -----------------------------
if __name__ == "__main__":
    main()



📦 Processing: chunk_10_20250624_202551.csv

📦 Processing: chunk_11_20250624_202553.csv

📦 Processing: chunk_12_20250624_202555.csv

📦 Processing: chunk_13_20250624_202557.csv

📦 Processing: chunk_14_20250624_202559.csv

📦 Processing: chunk_15_20250624_202601.csv

📦 Processing: chunk_16_20250624_202602.csv

📦 Processing: chunk_17_20250624_202608.csv

📦 Processing: chunk_18_20250624_202610.csv

📦 Processing: chunk_19_20250624_202612.csv

📦 Processing: chunk_1_20250624_202535.csv
✅ Wrote 4 detections to outputs//detections_pat1_20250627_043847_634903.csv

📦 Processing: chunk_20_20250624_202614.csv
✅ Wrote 3 detections to outputs//detections_pat1_20250627_043847_833818.csv

📦 Processing: chunk_21_20250624_202616.csv
✅ Wrote 16 detections to outputs//detections_pat1_20250627_043848_044608.csv

📦 Processing: chunk_22_20250624_202618.csv
✅ Wrote 13 detections to outputs//detections_pat1_20250627_043848_268870.csv

📦 Processing: chunk_23_20250624_202619.csv
✅ Wrote 16 detections to outputs//d