In [32]:
import pandas as pd
import os
from datetime import datetime
import re

# Running (merchant, customer, gender) table; main() appends each chunk's rows
# to it and detect_patid3() rebinds it after removing conflicting pairs.
gender_summary_df = pd.DataFrame(columns=['merchant', 'customer', 'gender'])
# Merchant ids that detect_patid3() has already emitted a detection for, so
# each merchant is reported at most once.
already_detected_pat3 = set()


def print_gender_conflicts(df):
    """Print every (merchant, customer) pair recorded with more than one gender.

    Args:
        df: DataFrame with ``merchant``, ``customer`` and ``gender`` columns.

    Returns:
        None. A summary is written to stdout: either a "no conflicts" line or
        a count followed by one line per conflicting pair listing the distinct
        gender values seen for it.
    """
    genders_by_pair = df.groupby(['merchant', 'customer'])['gender']
    # nunique() ignores missing values, so a pair only counts as a conflict
    # when it has two or more distinct non-null genders.
    is_conflict = genders_by_pair.nunique() > 1
    conflict_total = int(is_conflict.sum())

    if conflict_total == 0:
        print("\n✅ No gender conflicts found.")
        return

    print(f"\n⚠️ Found {conflict_total} gender conflicts across chunks:")
    # One grouped pass yields the distinct genders for every pair; the boolean
    # mask shares the same group index, so no per-conflict re-scan of ``df``
    # is needed.
    conflicting = genders_by_pair.unique()[is_conflict]
    for (merchant, customer), all_genders in conflicting.items():
        print(f" - Merchant: {merchant}, Customer: {customer}, Genders: {list(all_genders)}")

def detect_patid3():
    """Detect PatId3 ("DEI-NEEDED") merchants from the accumulated gender data.

    Reads and rebinds the module-level ``gender_summary_df``: every
    (merchant, customer) pair recorded with more than one distinct gender is
    removed from it before counting. Distinct customers are then counted per
    merchant and gender, and a merchant is flagged when it has more than 100
    female customers and strictly fewer female than male customers.

    Merchants already present in the module-level ``already_detected_pat3``
    set are skipped; newly flagged merchants are added to it.

    Returns:
        pandas.DataFrame with one row per newly flagged merchant and the
        columns ``YStartTime``, ``detectionTime``, ``patternId``,
        ``ActionType``, ``customerName`` and ``MerchantId``. When nothing new
        is flagged the frame is empty but still carries those columns.
    """
    global gender_summary_df

    detection_columns = [
        "YStartTime",
        "detectionTime",
        "patternId",
        "ActionType",
        "customerName",
        "MerchantId",
    ]

    # Drop conflicting gender entries. transform() gives a per-row distinct
    # gender count for the row's (merchant, customer) pair, so no temporary
    # key column has to be written onto the shared frame. The copy keeps the
    # rebound global independent of the frame it was filtered from.
    genders_per_pair = (
        gender_summary_df
        .groupby(['merchant', 'customer'])['gender']
        .transform('nunique')
    )
    gender_summary_df = gender_summary_df[~(genders_per_pair > 1)].copy()

    # Group by merchant & gender: distinct customers per (merchant, gender),
    # reshaped to one row per merchant with one column per gender value.
    customers_per_gender = (
        gender_summary_df
        .groupby(['merchant', 'gender'])['customer']
        .nunique()
        .reset_index(name='count')
    )
    pivot_df = (
        customers_per_gender
        .pivot(index='merchant', columns='gender', values='count')
        .fillna(0)
        .reset_index()
    )
    pivot_df.columns.name = None

    # Make sure both compared columns exist even if a gender never occurred.
    for gender_code in ('F', 'M'):
        if gender_code not in pivot_df.columns:
            pivot_df[gender_code] = 0

    print("\n📟 Gender distribution per merchant (top 10):")
    print(pivot_df[['merchant', 'F', 'M']].sort_values(by='F', ascending=False).head(10))

    eligible = pivot_df[(pivot_df['F'] > 100) & (pivot_df['F'] < pivot_df['M'])]

    print("\n🌟 Eligible DEI-NEEDED merchants:")
    print(eligible[['merchant', 'F', 'M']])

    # A single timestamp is shared by every detection produced in this call.
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    detections = []
    for merchant_id in eligible['merchant']:
        if merchant_id in already_detected_pat3:
            continue

        detections.append({
            "YStartTime": now,
            "detectionTime": now,
            "patternId": "PatId3",
            "ActionType": "DEI-NEEDED",
            "customerName": "",
            "MerchantId": merchant_id,
        })
        already_detected_pat3.add(merchant_id)

    # Passing the columns explicitly keeps the schema stable when the list is
    # empty (an empty list alone would produce a frame with no columns).
    return pd.DataFrame(detections, columns=detection_columns)

def write_detections(detections, output_dir="outputs/", file_prefix="detections_pat3", batch_size=50):
    """Write detections to CSV files of at most ``batch_size`` rows each.

    Args:
        detections: DataFrame of detections; nothing is written (and the
            output directory is not created) when it is empty.
        output_dir: Directory that receives the files; created if missing.
        file_prefix: Leading part of every file name.
        batch_size: Maximum number of rows per file (default 50).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Each file is named ``<file_prefix>_<timestamp>.csv``. If that name is
    already taken (the clock may not advance between two fast writes), a
    numeric suffix is appended so an earlier batch is never overwritten.
    """
    if detections.empty:
        return
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    os.makedirs(output_dir, exist_ok=True)

    for start in range(0, len(detections), batch_size):
        batch = detections.iloc[start:start + batch_size]
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        stem = f"{file_prefix}_{timestamp}"
        # os.path.join avoids the doubled separator that string concatenation
        # produced with the default "outputs/" directory.
        output_path = os.path.join(output_dir, f"{stem}.csv")
        collision = 1
        while os.path.exists(output_path):
            output_path = os.path.join(output_dir, f"{stem}_{collision}.csv")
            collision += 1
        batch.to_csv(output_path, index=False)
        print(f"✅ Wrote {len(batch)} detections to {output_path}")

def chunk_key(filename):
    """Return the sort key for a chunk file name.

    The key is the integer that follows ``chunk_`` and precedes the next
    underscore. Names without that pattern get ``float('inf')`` so they sort
    after every numbered chunk.
    """
    found = re.search(r"chunk_(\d+)_", filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

def main(chunk_folder=r"C:\\Users\\kusha\\OneDrive\\Desktop\\Projects\\DevDolphins\\Blob files\\Chunks"):
    """Process every chunk CSV in ``chunk_folder`` in chunk-number order.

    For each file the gender column is normalised (quotes removed, whitespace
    stripped, upper-cased), its (merchant, customer, gender) rows are merged
    into the module-level ``gender_summary_df`` without duplicates, gender
    conflicts are printed, and newly detected PatId3 merchants are written
    out via ``write_detections``.

    Args:
        chunk_folder: Directory holding the chunk CSV files. Defaults to the
            location that was previously hard-coded.
    """
    global gender_summary_df

    all_files = []
    for entry in os.listdir(chunk_folder):
        entry_path = os.path.join(chunk_folder, entry)
        # Only regular CSV files can be read as chunks; sub-directories and
        # other files would make read_csv fail or feed in unrelated data.
        if os.path.isfile(entry_path) and entry.lower().endswith('.csv'):
            all_files.append(entry)
        else:
            print(f"\nSkipping non-CSV entry: {entry}")
    all_files.sort(key=chunk_key)

    for file in all_files:
        print(f"\n📦 Processing: {file}")
        chunk_df = pd.read_csv(os.path.join(chunk_folder, file))

        raw_gender = chunk_df['gender']
        cleaned_gender = (
            raw_gender
            .astype(str)
            .str.replace("'", "", regex=False)
            .str.strip()
            .str.upper()
        )
        # astype(str) turns missing values into the text 'nan'; restore them
        # as real missing values so the dropna() below removes those rows
        # instead of treating 'NAN' as a gender.
        chunk_df['gender'] = cleaned_gender.where(raw_gender.notna())

        chunk_gender_df = chunk_df[['merchant', 'customer', 'gender']].dropna()
        gender_summary_df = pd.concat([gender_summary_df, chunk_gender_df], ignore_index=True)
        gender_summary_df = gender_summary_df.drop_duplicates(subset=['merchant', 'customer', 'gender'])

        print_gender_conflicts(gender_summary_df)

        detections = detect_patid3()
        write_detections(detections)

# Run the chunk-processing pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()



📦 Processing: chunk_1_20250624_202535.csv

✅ No gender conflicts found.

📟 Gender distribution per merchant (top 10):
         merchant       F      M
26   'M348934600'  1108.0  911.0
15  'M1823072687'   318.0  263.0
39   'M855959430'   110.0   93.0
8    'M151143676'   109.0   85.0
41    'M85975013'   100.0   83.0
30   'M480139044'    54.0   46.0
23   'M209847108'    54.0   50.0
9   'M1535107174'    47.0   38.0
0   'M1053599405'    37.0   29.0
10  'M1600850729'    33.0   21.0

🌟 Eligible DEI-NEEDED merchants:
Empty DataFrame
Columns: [merchant, F, M]
Index: []

📦 Processing: chunk_2_20250624_202537.csv

✅ No gender conflicts found.

📟 Gender distribution per merchant (top 10):
         merchant       F      M
27   'M348934600'  1170.0  965.0
15  'M1823072687'   412.0  336.0
40   'M855959430'   225.0  163.0
8    'M151143676'   206.0  185.0
42    'M85975013'   175.0  141.0
24   'M209847108'   122.0  101.0
31   'M480139044'   109.0   75.0
9   'M1535107174'    82.0   68.0
10  'M1600850729