In [None]:
import pandas as pd
import os
from datetime import datetime

# Module-level running state, updated in place by process_chunk() after every chunk.

# Per-merchant running count of transactions that matched the customer reference data.
merchant_transaction_count_df = pd.DataFrame(columns=["merchant", "total_txn"])
# Per (customer, merchant) running transaction count and count-weighted mean of "Weight"
# (built from the joined rows only); read by detect_patid1().
customer_merchant_stats_df = pd.DataFrame(columns=["customer", "merchant", "txn_count", "avg_weight"])
# Per (customer, merchant) running transaction count and summed amount
# (built from the raw, un-joined chunk); read by detect_patid2().
customer_merchant_amount_df = pd.DataFrame(columns=["customer", "merchant", "txn_count", "total_amount"])
# (customer, merchant) pairs already reported by detect_patid2(), so each pair is emitted once.
already_detected_pat2 = set()

# Reference data loaded once when the module/cell runs. process_chunk() joins against its
# "Source", "Target", "typeTrans" and "Weight" columns.
customer_df = pd.read_csv(r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\customer data\customer data.csv")

def process_chunk(chunk_df):
    """Fold one transaction chunk into the three module-level running aggregates.

    The chunk is inner-joined with ``customer_df`` and the joined rows feed the
    per-merchant transaction counts and the per (customer, merchant)
    count / mean-``Weight`` statistics.  The raw chunk (not the joined rows)
    feeds the per (customer, merchant) count / total-amount statistics.

    Args:
        chunk_df: DataFrame holding at least the ``customer``, ``merchant``,
            ``category``, ``amount`` and ``step`` columns.

    Returns:
        None.  ``merchant_transaction_count_df``, ``customer_merchant_stats_df``
        and ``customer_merchant_amount_df`` are rebound to the updated frames.
    """
    global merchant_transaction_count_df, customer_merchant_stats_df, customer_merchant_amount_df

    pair_keys = ['customer', 'merchant']

    # Step 1: attach the reference rows; only exact matches on all four keys survive.
    enriched = chunk_df.merge(
        customer_df,
        how='inner',
        left_on=['customer', 'merchant', 'category', 'amount'],
        right_on=['Source', 'Target', 'typeTrans', 'Weight'],
    )

    # Step 2: running number of matched transactions per merchant.
    per_merchant = enriched.groupby('merchant').size().reset_index(name='new_txn_count')

    if merchant_transaction_count_df.empty:
        merchant_transaction_count_df = per_merchant.rename(
            columns={'new_txn_count': 'total_txn'}
        ).copy()
    else:
        # Outer merge keeps merchants seen on only one side; their missing count becomes 0.
        running = merchant_transaction_count_df.merge(
            per_merchant, on='merchant', how='outer'
        ).fillna(0)
        running['total_txn'] = running['total_txn'] + running['new_txn_count']
        merchant_transaction_count_df = running.drop(columns=['new_txn_count'])

    # Step 3: running count and mean Weight per (customer, merchant) -- Pattern 1 input.
    pair_weight = enriched.groupby(pair_keys).agg(
        txn_count=('step', 'count'),
        avg_weight=('Weight', 'mean'),
    ).reset_index()

    if customer_merchant_stats_df.empty:
        customer_merchant_stats_df = pair_weight.copy()
    else:
        both = customer_merchant_stats_df.merge(
            pair_weight, on=pair_keys, how='outer'
        ).fillna(0)

        old_n = both['txn_count_x']
        new_n = both['txn_count_y']
        both['txn_count'] = old_n + new_n
        # Combine the two means weighted by how many transactions each one covers.
        weighted_sum = (both['avg_weight_x'] * old_n) + (both['avg_weight_y'] * new_n)
        both['avg_weight'] = weighted_sum / both['txn_count']

        customer_merchant_stats_df = both[
            ['customer', 'merchant', 'txn_count', 'avg_weight']
        ].copy()

    # Step 4: running count and total amount per (customer, merchant) -- Pattern 2 input.
    pair_amount = chunk_df.groupby(pair_keys).agg(
        txn_count=('amount', 'count'),
        total_amount=('amount', 'sum'),
    ).reset_index()

    if customer_merchant_amount_df.empty:
        customer_merchant_amount_df = pair_amount.copy()
    else:
        totals = customer_merchant_amount_df.merge(
            pair_amount, on=pair_keys, how='outer'
        ).fillna(0)

        totals['txn_count'] = totals['txn_count_x'] + totals['txn_count_y']
        totals['total_amount'] = totals['total_amount_x'] + totals['total_amount_y']

        customer_merchant_amount_df = totals[
            ['customer', 'merchant', 'txn_count', 'total_amount']
        ].copy()

def detect_patid1(min_merchant_txn=50000, txn_quantile=0.90, weight_quantile=0.10):
    """Detect Pattern 1 (``PatId1`` / ``UPGRADE``) from the running aggregates.

    For every merchant in ``merchant_transaction_count_df`` whose ``total_txn``
    is at least ``min_merchant_txn``, the merchant's rows in
    ``customer_merchant_stats_df`` are examined.  A customer is reported when
    its ``txn_count`` is at or above the ``txn_quantile`` quantile and its
    ``avg_weight`` is at or below the ``weight_quantile`` quantile, both
    computed over that merchant's customers.

    Args:
        min_merchant_txn: Minimum merchant transaction total before the
            merchant is evaluated at all.
        txn_quantile: Quantile (0-1) of ``txn_count`` a customer must reach.
        weight_quantile: Quantile (0-1) of ``avg_weight`` a customer must not exceed.

    Returns:
        DataFrame with one row per detection and the columns ``YStartTime``,
        ``detectionTime``, ``patternId``, ``ActionType``, ``customerName`` and
        ``MerchantId``.  The columns are present even when nothing is detected.
    """
    columns = [
        "YStartTime", "detectionTime", "patternId",
        "ActionType", "customerName", "MerchantId",
    ]
    # One timestamp per call so every row of the same detection run agrees.
    detection_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    detections = []

    for merchant in merchant_transaction_count_df.itertuples():
        if merchant.total_txn < min_merchant_txn:
            continue

        cust_subset = customer_merchant_stats_df[
            customer_merchant_stats_df['merchant'] == merchant.merchant
        ]
        if cust_subset.empty:
            continue

        txn_threshold = cust_subset['txn_count'].quantile(txn_quantile)
        weight_threshold = cust_subset['avg_weight'].quantile(weight_quantile)

        eligible = cust_subset[
            (cust_subset['txn_count'] >= txn_threshold) &
            (cust_subset['avg_weight'] <= weight_threshold)
        ]

        detections.extend(
            {
                "YStartTime": "",
                "detectionTime": detection_time,
                "patternId": "PatId1",
                "ActionType": "UPGRADE",
                "customerName": row.customer,
                "MerchantId": row.merchant,
            }
            for row in eligible.itertuples()
        )

    return pd.DataFrame(detections, columns=columns)

def detect_patid2(min_txn_count=80, max_avg_amount=23):
    """Detect Pattern 2 (``PatId2`` / ``CHILD``) from the running amount aggregates.

    A (customer, merchant) pair in ``customer_merchant_amount_df`` is reported
    when its ``txn_count`` is at least ``min_txn_count`` and its average amount
    (``total_amount / txn_count``) is strictly below ``max_avg_amount``.  Pairs
    already present in ``already_detected_pat2`` are skipped, and every newly
    reported pair is added to that set.

    The average is computed on a local Series, so the shared aggregate frame
    is not modified by this function.

    Args:
        min_txn_count: Minimum number of transactions for the pair.
        max_avg_amount: Exclusive upper bound on the pair's average amount.

    Returns:
        DataFrame with one row per new detection and the columns ``YStartTime``,
        ``detectionTime``, ``patternId``, ``ActionType``, ``customerName``,
        ``MerchantId`` and ``txn_count``.  The columns are present even when
        nothing is detected.
    """
    columns = [
        "YStartTime", "detectionTime", "patternId", "ActionType",
        "customerName", "MerchantId", "txn_count",
    ]

    stats = customer_merchant_amount_df
    avg_amount = stats['total_amount'] / stats['txn_count']
    filtered = stats[
        (stats['txn_count'] >= min_txn_count) &
        (avg_amount < max_avg_amount)
    ]

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    detections = []

    for row in filtered.itertuples(index=False):
        key = (row.customer, row.merchant)
        if key in already_detected_pat2:
            continue

        already_detected_pat2.add(key)
        detections.append({
            "YStartTime": now,
            "detectionTime": now,
            "patternId": "PatId2",
            "ActionType": "CHILD",
            "customerName": row.customer,
            "MerchantId": row.merchant,
            "txn_count": int(row.txn_count),
        })

    return pd.DataFrame(detections, columns=columns)

def write_detections(detections, output_dir="outputs/", file_prefix="detections", batch_size=50):
    """Write detections to CSV files of at most ``batch_size`` rows each.

    Each batch goes to ``<output_dir>/<file_prefix>_<timestamp>.csv`` where the
    timestamp has microsecond resolution.  Nothing is written (and the output
    directory is not created) when ``detections`` is empty.

    Args:
        detections: DataFrame of detection rows.
        output_dir: Directory to write into; created if missing.  An empty
            string means the current working directory.
        file_prefix: Leading part of every output file name.
        batch_size: Maximum number of rows per file; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if detections.empty:
        return

    if output_dir:  # os.makedirs("") raises, and "" already means the cwd
        os.makedirs(output_dir, exist_ok=True)

    for start in range(0, len(detections), batch_size):
        batch = detections.iloc[start:start + batch_size]
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        base_name = f"{file_prefix}_{timestamp}"
        # os.path.join avoids the doubled separator when output_dir ends with "/".
        output_path = os.path.join(output_dir, f"{base_name}.csv")

        # A coarse system clock can hand two batches the same timestamp; never
        # overwrite a batch that was already written.
        attempt = 0
        while os.path.exists(output_path):
            attempt += 1
            output_path = os.path.join(output_dir, f"{base_name}_{attempt}.csv")

        batch.to_csv(output_path, index=False)
        print(f"✅ Wrote {len(batch)} detections to {output_path}")

def main(chunk_folder=r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\Chunks"):
    """Process every CSV chunk in ``chunk_folder`` and write detections after each one.

    Files are handled in natural (numeric-aware) name order, so ``chunk_2``
    comes before ``chunk_10``; a plain lexicographic sort would feed the
    chunks to the running aggregates out of sequence.  Entries that do not
    end in ``.csv`` are ignored.

    For each chunk the ``amount`` column is cast to float, the running
    aggregates are updated through ``process_chunk`` and both pattern
    detectors are run, with their results written via ``write_detections``.

    Args:
        chunk_folder: Directory containing the chunk CSV files.
    """
    import re  # local import: only needed for the natural sort key below

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so the
        # same list position always holds the same type and keys stay comparable.
        parts = re.split(r'(\d+)', name.lower())
        return [int(part) if part.isdigit() else part for part in parts]

    chunk_files = sorted(
        (entry for entry in os.listdir(chunk_folder) if entry.lower().endswith('.csv')),
        key=natural_key,
    )

    for file in chunk_files:
        print(f"📦 Processing: {file}")
        chunk_df = pd.read_csv(os.path.join(chunk_folder, file))
        chunk_df['amount'] = chunk_df['amount'].astype(float)

        process_chunk(chunk_df)

        detections1 = detect_patid1()
        detections2 = detect_patid2()

        write_detections(detections1, file_prefix="detections_pat1")
        write_detections(detections2, file_prefix="detections_pat2")

if __name__ == "__main__":
    main()


📦 Processing: chunk_10_20250624_202551.csv
📦 Processing: chunk_11_20250624_202553.csv
📦 Processing: chunk_1_20250624_202535.csv
📦 Processing: chunk_2_20250624_202537.csv
📦 Processing: chunk_3_20250624_202539.csv
📦 Processing: chunk_4_20250624_202540.csv
📦 Processing: chunk_5_20250624_202542.csv
📦 Processing: chunk_6_20250624_202544.csv
📦 Processing: chunk_7_20250624_202546.csv
✅ Wrote 9 detections to outputs//detections_pat1_20250627_041230_295591.csv
📦 Processing: chunk_8_20250624_202548.csv
✅ Wrote 15 detections to outputs//detections_pat1_20250627_041230_475892.csv
📦 Processing: chunk_9_20250624_202550.csv
✅ Wrote 12 detections to outputs//detections_pat1_20250627_041230_663966.csv
