In [102]:
# %pip install pandas

In [103]:
import pandas as pd
from datetime import datetime
import os

# Directory that holds the transaction chunk CSV files.
# NOTE(review): this and the customer file below are machine-specific absolute
# paths — consider moving them to a config cell before sharing the notebook.
folder_path = r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\Chunks"

# Collect every CSV in the folder. os.listdir() returns entries in arbitrary
# order, so sort the paths to make the concatenated row order reproducible.
csv_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.csv')
)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list; fail early with the same exception type but a message naming the folder.
if not csv_files:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Read and concatenate all chunks into one DataFrame with a fresh 0..n-1 index
transactions_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Customer reference data; the last expression displays it as the cell output
customer_df = pd.read_csv(r"C:\Users\kusha\OneDrive\Desktop\Projects\DevDolphins\Blob files\customer data\customer data.csv")
customer_df


Unnamed: 0,Source,Target,Weight,typeTrans,fraud
0,'C1093826151','M348934600',4.55,'es_transportation',0
1,'C352968107','M348934600',39.68,'es_transportation',0
2,'C2054744914','M1823072687',26.89,'es_transportation',0
3,'C1760612790','M348934600',17.25,'es_transportation',0
4,'C757503768','M348934600',35.72,'es_transportation',0
...,...,...,...,...,...
594638,'C1753498738','M1823072687',20.53,'es_transportation',0
594639,'C650108285','M1823072687',50.73,'es_transportation',0
594640,'C123623130','M349281107',22.44,'es_fashion',0
594641,'C1499363341','M1823072687',14.46,'es_transportation',0


In [104]:
# Inner-join each transaction row to the customer rows that agree on all four
# key columns: customer/merchant/category/amount on the transaction side must
# equal Source/Target/typeTrans/Weight on the customer side.
# NOTE(review): the key includes the amount/Weight pair, so matching relies on
# those values being exactly equal in both files — confirm they are stored identically.
# NOTE(review): if a key combination repeats on both sides the join emits every
# pairing — verify the merged row count is what you expect.
transaction_keys = ['customer', 'merchant', 'category', 'amount']
customer_keys = ['Source', 'Target', 'typeTrans', 'Weight']

merged_df = transactions_df.merge(customer_df, how='inner', left_on=transaction_keys, right_on=customer_keys)
merged_df

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud_x,Source,Target,Weight,typeTrans,fraud_y
0,33,'C407936671','2','F','28007','M348934600','28007','es_transportation',46.22,0,'C407936671','M348934600',46.22,'es_transportation',0
1,33,'C957771834','4','M','28007','M1823072687','28007','es_transportation',4.76,0,'C957771834','M1823072687',4.76,'es_transportation',0
2,33,'C188991095','4','F','28007','M1823072687','28007','es_transportation',37.10,0,'C188991095','M1823072687',37.10,'es_transportation',0
3,33,'C2035341190','2','M','28007','M348934600','28007','es_transportation',40.04,0,'C2035341190','M348934600',40.04,'es_transportation',0
4,33,'C2035341190','2','M','28007','M348934600','28007','es_transportation',40.04,0,'C2035341190','M348934600',40.04,'es_transportation',0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101178,33,'C608244416','1','F','28007','M348934600','28007','es_transportation',37.18,0,'C608244416','M348934600',37.18,'es_transportation',0
101179,33,'C608244416','1','F','28007','M1823072687','28007','es_transportation',25.87,0,'C608244416','M1823072687',25.87,'es_transportation',0
101180,33,'C1031698099','0','M','28007','M1823072687','28007','es_transportation',49.90,0,'C1031698099','M1823072687',49.90,'es_transportation',0
101181,33,'C602544155','4','M','28007','M348934600','28007','es_transportation',17.03,0,'C602544155','M348934600',17.03,'es_transportation',0


In [105]:
import pandas as pd
from datetime import datetime

def detect_pattern1(
    merged_df: pd.DataFrame,
    min_merchant_txns: int = 50000,
    weight_quantile: float = 0.10,
    txn_quantile: float = 0.90,
) -> pd.DataFrame:
    """
    Detects pattern1: customers with low average transaction weight but high transaction
    frequency within high-volume merchants.

    A (merchant, customer) pair matches when, among the customers of the same eligible
    merchant, its mean 'Weight' is at or below the `weight_quantile` quantile of the
    per-customer means AND its transaction count is at or above the `txn_quantile`
    quantile of the per-customer counts.

    Args:
        merged_df (pd.DataFrame): DataFrame containing 'merchant', 'customer', 'Weight' columns.
        min_merchant_txns (int): A merchant is eligible only when it has strictly more
            than this many transactions (rows with a non-null 'customer'). Defaults to 50000.
        weight_quantile (float): Quantile in [0, 1] used as the per-merchant upper bound on
            a customer's average weight. Defaults to 0.10.
        txn_quantile (float): Quantile in [0, 1] used as the per-merchant lower bound on
            a customer's transaction count. Defaults to 0.90.

    Returns:
        pd.DataFrame: One row per matching (merchant, customer) pair with the columns
        merchant, customer, Weight (the customer's mean weight), weight_thresh, txn_count,
        txn_thresh, patternId, actionType, YStartTime, detectionTime, MerchantId,
        customerName. Empty (with the same columns) when nothing matches.

    Raises:
        KeyError: If `merged_df` lacks any of the required columns.
    """
    required_columns = ('merchant', 'customer', 'Weight')
    missing = [col for col in required_columns if col not in merged_df.columns]
    if missing:
        raise KeyError(f"merged_df is missing required column(s): {missing}")

    output_columns = [
        'merchant', 'customer', 'Weight', 'weight_thresh', 'txn_count', 'txn_thresh',
        'patternId', 'actionType', 'YStartTime', 'detectionTime', 'MerchantId', 'customerName',
    ]

    # Steps 1-3: keep only rows of merchants with more than `min_merchant_txns` transactions
    merchant_txn_counts = merged_df.groupby('merchant')['customer'].count()
    eligible_merchants = merchant_txn_counts[merchant_txn_counts > min_merchant_txns].index
    df_eligible = merged_df[merged_df['merchant'].isin(eligible_merchants)]

    # No eligible merchant: return the empty result directly instead of running
    # the quantile aggregations on an empty frame.
    if df_eligible.empty:
        return pd.DataFrame(columns=output_columns)

    per_customer = df_eligible.groupby(['merchant', 'customer'])

    # Step 4: Average weight per customer per merchant
    avg_weight = per_customer['Weight'].mean().reset_index()

    # Step 5: Per-merchant weight threshold (quantile of the per-customer averages)
    weight_threshold = (
        avg_weight.groupby('merchant')['Weight']
        .quantile(weight_quantile)
        .reset_index(name='weight_thresh')
    )

    # Step 6: Transaction count per customer per merchant
    txn_count_per_customer = per_customer.size().reset_index(name='txn_count')

    # Step 7: Per-merchant transaction-count threshold
    txn_count_threshold = (
        txn_count_per_customer.groupby('merchant')['txn_count']
        .quantile(txn_quantile)
        .reset_index(name='txn_thresh')
    )

    # Step 8: Attach both thresholds and the count to every (merchant, customer) row.
    # The merge order fixes the column order of the result.
    df1 = (
        avg_weight
        .merge(weight_threshold, on='merchant')
        .merge(txn_count_per_customer, on=['merchant', 'customer'])
        .merge(txn_count_threshold, on='merchant')
    )

    # Step 9: Apply pattern filter (both bounds are inclusive)
    is_match = (df1['Weight'] <= df1['weight_thresh']) & (df1['txn_count'] >= df1['txn_thresh'])
    matches = df1[is_match]

    # Step 10: Add metadata. assign() returns a new frame, so the filtered slice
    # is never mutated; YStartTime and detectionTime share the same timestamp.
    detection_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return matches.assign(
        patternId='PatId1',
        actionType='UPGRADE',
        YStartTime=detection_time,
        detectionTime=detection_time,
        MerchantId=matches['merchant'],
        customerName=matches['customer'],
    )


In [106]:
# Run the pattern1 detector on the merged transactions and report the outcome.
pattern1_df = detect_pattern1(merged_df)
if pattern1_df.empty:
    print("No pattern1 matches found.")
else:
    # Print the summary and the frame as separate statements: nesting print()
    # inside the f-string printed the frame first and then interpolated its
    # return value, None, into the message.
    print(f"Found {len(pattern1_df)} pattern1 matches.")
    print(pattern1_df)


          merchant       customer     Weight  weight_thresh  txn_count  \
81    'M348934600'  'C1056476734'  21.333636      21.516111         33   
439   'M348934600'   'C129566217'  20.243889      21.516111         36   
799   'M348934600'  'C1546864044'  19.331389      21.516111         36   
1011  'M348934600'  'C1687722078'  20.616061      21.516111         33   
1270  'M348934600'  'C1855338494'  21.287273      21.516111         33   
1356  'M348934600'  'C1910153505'  21.406765      21.516111         34   
1701  'M348934600'   'C215334224'  20.006970      21.516111         33   
1778  'M348934600'    'C27152267'  18.679118      21.516111         34   
2003  'M348934600'   'C433139335'  18.990909      21.516111         33   
2008  'M348934600'   'C436675602'  21.157647      21.516111         34   
2589  'M348934600'   'C868096672'  20.222353      21.516111         34   

      txn_thresh patternId actionType           YStartTime  \
81          33.0    PatId1    UPGRADE  2025-06-27

In [107]:
# Show the detected pattern1 matches as the cell's rich output
pattern1_df

Unnamed: 0,merchant,customer,Weight,weight_thresh,txn_count,txn_thresh,patternId,actionType,YStartTime,detectionTime,MerchantId,customerName
81,'M348934600','C1056476734',21.333636,21.516111,33,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C1056476734'
439,'M348934600','C129566217',20.243889,21.516111,36,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C129566217'
799,'M348934600','C1546864044',19.331389,21.516111,36,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C1546864044'
1011,'M348934600','C1687722078',20.616061,21.516111,33,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C1687722078'
1270,'M348934600','C1855338494',21.287273,21.516111,33,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C1855338494'
1356,'M348934600','C1910153505',21.406765,21.516111,34,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C1910153505'
1701,'M348934600','C215334224',20.00697,21.516111,33,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C215334224'
1778,'M348934600','C27152267',18.679118,21.516111,34,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C27152267'
2003,'M348934600','C433139335',18.990909,21.516111,33,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C433139335'
2008,'M348934600','C436675602',21.157647,21.516111,34,33.0,PatId1,UPGRADE,2025-06-27 01:41:44,2025-06-27 01:41:44,'M348934600','C436675602'
