In [8]:
import pandas as pd
import os
from datetime import datetime

# -----------------------------
# 🧠 Simulated Postgres-like Table
# -----------------------------
already_detected_pat3 = set()

# -----------------------------
# 🔍 Pattern 3 Detection Logic
# -----------------------------
def detect_patid3(transactions_df):
    detections = []

    # Step 1: Drop rows where merchant or gender is missing
    filtered_df = transactions_df.dropna(subset=['merchant', 'gender'])

    # Step 2: Normalize gender values
    filtered_df['gender'] = filtered_df['gender'].astype(str).str.strip().str.upper()

    # Step 3: Count unique customers by gender and merchant
    gender_counts = filtered_df.groupby(['merchant', 'gender'])['customer'].nunique().reset_index(name='count')

    # Step 4: Pivot to get male/female side by side
    pivot_df = gender_counts.pivot(index='merchant', columns='gender', values='count').fillna(0).reset_index()
    pivot_df.columns.name = None

    # Ensure F and M columns exist
    if 'F' not in pivot_df.columns:
        pivot_df['F'] = 0
    if 'M' not in pivot_df.columns:
        pivot_df['M'] = 0

    # Step 5: Apply filter: F > 100 AND F < M
    eligible = pivot_df[(pivot_df['F'] > 100) & (pivot_df['F'] < pivot_df['M'])]

    # Step 6: Mark detections
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for _, row in eligible.iterrows():
        merchant_id = row['merchant']
        if merchant_id in already_detected_pat3:
            continue

        detection = {
            "YStartTime": now,
            "detectionTime": now,
            "patternId": "PatId3",
            "ActionType": "DEI-NEEDED",
            "customerName": "",
            "MerchantId": merchant_id
        }
        detections.append(detection)
        already_detected_pat3.add(merchant_id)

    return pd.DataFrame(detections)

# -----------------------------
# 💾 Write Detections (50 per file)
# -----------------------------
def write_detections(detections, output_dir="outputs/", file_prefix="detections_pat3"):
    if detections.empty:
        return

    os.makedirs(output_dir, exist_ok=True)

    for i in range(0, len(detections), 50):
        batch = detections.iloc[i:i+50]
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        output_path = f"{output_dir}/{file_prefix}_{timestamp}.csv"
        batch.to_csv(output_path, index=False)
        print(f"✅ Wrote {len(batch)} detections to {output_path}")

# -----------------------------
# 🚀 Main Function
# -----------------------------
def main():
    chunk_folder = r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\Chunks"
    all_files = sorted(os.listdir(chunk_folder))

    for file in all_files:
        print(f"📦 Processing: {file}")
        chunk_df = pd.read_csv(os.path.join(chunk_folder, file))
        chunk_df['gender'] = chunk_df['gender'].astype(str).str.strip().str.upper()

        detections3 = detect_patid3(chunk_df)
        write_detections(detections3)

# -----------------------------
# 🔁 Entry Point
# -----------------------------
if __name__ == "__main__":
    main()


📦 Processing: chunk_10_20250624_202551.csv
📦 Processing: chunk_11_20250624_202553.csv
📦 Processing: chunk_12_20250624_202555.csv
📦 Processing: chunk_13_20250624_202557.csv
📦 Processing: chunk_14_20250624_202559.csv
📦 Processing: chunk_15_20250624_202601.csv
📦 Processing: chunk_16_20250624_202602.csv
📦 Processing: chunk_17_20250624_202608.csv
📦 Processing: chunk_18_20250624_202610.csv
📦 Processing: chunk_19_20250624_202612.csv
📦 Processing: chunk_1_20250624_202535.csv
📦 Processing: chunk_20_20250624_202614.csv
📦 Processing: chunk_21_20250624_202616.csv
📦 Processing: chunk_22_20250624_202618.csv
📦 Processing: chunk_23_20250624_202619.csv
📦 Processing: chunk_24_20250624_202621.csv
📦 Processing: chunk_25_20250624_202623.csv
📦 Processing: chunk_26_20250624_202625.csv
📦 Processing: chunk_27_20250624_202627.csv
📦 Processing: chunk_28_20250624_202629.csv
📦 Processing: chunk_29_20250624_202631.csv
📦 Processing: chunk_2_20250624_202537.csv
📦 Processing: chunk_30_20250624_202633.csv
📦 Processing: