<a href="https://colab.research.google.com/github/AsmaaYassinDev/Context-Aware-Fraud-Detection/blob/main/benchmark_complexity_o1_entity_resolution_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from google.colab import files
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

# =========================================================
# 1. ROBUST DATA LOADING (Creates 'df')
# =========================================================
def force_download():
    if os.path.exists('train_transaction.csv'):
        print("✅ Data found.")
        return
    if not os.path.exists('/root/.kaggle/kaggle.json'):
        print("Please upload 'kaggle.json':")
        files.upload()
        os.system('mkdir -p ~/.kaggle')
        os.system('cp kaggle.json ~/.kaggle/')
        os.system('chmod 600 ~/.kaggle/kaggle.json')
    os.system('kaggle competitions download -c ieee-fraud-detection')
    os.system('unzip -o ieee-fraud-detection.zip')
    os.system('unzip -o train_transaction.csv.zip')
    os.system('unzip -o train_identity.csv.zip')

force_download()

def reduce_mem_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: df[col] = df[col].astype(np.float32)
    return df

print("Loading and Optimizing Data...")
train_trans = reduce_mem_usage(pd.read_csv('train_transaction.csv'))
train_id = reduce_mem_usage(pd.read_csv('train_identity.csv'))

# This line defines 'df', fixing your NameError
df = pd.merge(train_trans, train_id, on='TransactionID', how='left')

del train_trans, train_id
gc.collect()

# =========================================================
# 2. O(1) EFFICIENCY BENCHMARK
# =========================================================

print("\nStarting Entity Resolution Benchmark...")

# 1. Select a sample (10,000 rows is enough for statistical significance)
sample_df = df[['card1', 'addr1', 'P_emaildomain']].head(10000).copy()

# 2. Handle missing values (Imputation strategy from your methodology)
sample_df['card1'] = sample_df['card1'].fillna(-1)
sample_df['addr1'] = sample_df['addr1'].fillna(-1)
sample_df['P_emaildomain'] = sample_df['P_emaildomain'].fillna('unknown')

# 3. Define the Entity Resolution Function (The deterministic logic)
def generate_uid(row):
    # O(1) String Concatenation
    return str(row['card1']) + str(row['addr1']) + str(row['P_emaildomain'])

# 4. Measure Execution Time
start_time = time.time()
sample_df['UID'] = sample_df.apply(generate_uid, axis=1)
end_time = time.time()

# 5. Calculate Statistics
total_time = end_time - start_time
total_rows = len(sample_df)
avg_time_per_transaction_sec = total_time / total_rows
avg_time_ms = avg_time_per_transaction_sec * 1000  # Convert to milliseconds

# 6. Output Results
print("-" * 40)
print(f"BENCHMARK RESULTS (O(1) Proof)")
print("-" * 40)
print(f"Total rows processed:       {total_rows}")
print(f"Total execution time:       {total_time:.4f} seconds")
print(f"Average latency per txn:    {avg_time_ms:.5f} milliseconds")
print("-" * 40)

Please upload 'kaggle.json':


Saving kaggle.json to kaggle.json
Loading and Optimizing Data...


  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].as


Starting Entity Resolution Benchmark...
----------------------------------------
BENCHMARK RESULTS (O(1) Proof)
----------------------------------------
Total rows processed:       10000
Total execution time:       0.0923 seconds
Average latency per txn:    0.00923 milliseconds
----------------------------------------


In [2]:
import os
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns

# =========================================================
# 1. ROBUST DATA LOADING
# =========================================================
def force_download():
    if os.path.exists('train_transaction.csv'):
        print("✅ Data found.")
        return
    if not os.path.exists('/root/.kaggle/kaggle.json'):
        print("Please upload 'kaggle.json':")
        files.upload()
        os.system('mkdir -p ~/.kaggle && cp kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json')
    os.system('kaggle competitions download -c ieee-fraud-detection')
    os.system('unzip -o ieee-fraud-detection.zip && unzip -o train_transaction.csv.zip && unzip -o train_identity.csv.zip')

force_download()

def reduce_mem_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: df[col] = df[col].astype(np.float32)
    return df

print("Loading and Optimizing Data...")
# Load only columns needed for UID to save memory if running on all rows
train_trans = reduce_mem_usage(pd.read_csv('train_transaction.csv', usecols=['TransactionID', 'card1', 'addr1', 'P_emaildomain']))
train_id = reduce_mem_usage(pd.read_csv('train_identity.csv', usecols=['TransactionID']))

df = pd.merge(train_trans, train_id, on='TransactionID', how='left')

del train_trans, train_id
gc.collect()

# =========================================================
# 2. VECTORIZED O(1) EFFICIENCY BENCHMARK (ALL ROWS)
# =========================================================

print(f"\nStarting Entity Resolution Benchmark on ALL {len(df):,} rows...")

# 1. Prepare copy to avoid fragmentation
# Consolidating memory prevents performance warnings
benchmark_df = df[['card1', 'addr1', 'P_emaildomain']].copy()

# 2. Impute missing values
benchmark_df['card1'] = benchmark_df['card1'].fillna(-1)
benchmark_df['addr1'] = benchmark_df['addr1'].fillna(-1)
benchmark_df['P_emaildomain'] = benchmark_df['P_emaildomain'].fillna('unknown')

# 3. Measure Execution Time with Vectorization
# This logic is O(1) per row: concatenation cost is constant
start_time = time.time()

benchmark_df['UID'] = (benchmark_df['card1'].astype(str) + "_" +
                       benchmark_df['addr1'].astype(str) + "_" +
                       benchmark_df['P_emaildomain'].astype(str))

end_time = time.time()

# 4. Calculate Statistics
total_time = end_time - start_time
total_rows = len(benchmark_df)
avg_latency_us = (total_time / total_rows) * 1_000_000 # Convert to microseconds

# 5. Output Results
print("-" * 50)
print(f"BENCHMARK RESULTS (O(1) Scalability Proof)")
print("-" * 50)
print(f"Total rows processed:       {total_rows:,}")
print(f"Total execution time:       {total_time:.4f} seconds")
print(f"Average latency per txn:    {avg_latency_us:.2f} microseconds (μs)")
print("-" * 50)

print("\nDEFENSE JUSTIFICATION:")
print("The latency per transaction is measured in microseconds, proving the system")
print("is capable of real-time identity resolution at production scale.")

Please upload 'kaggle.json':


Saving kaggle.json to kaggle.json
Loading and Optimizing Data...

Starting Entity Resolution Benchmark on ALL 590,540 rows...
--------------------------------------------------
BENCHMARK RESULTS (O(1) Scalability Proof)
--------------------------------------------------
Total rows processed:       590,540
Total execution time:       0.9416 seconds
Average latency per txn:    1.59 microseconds (μs)
--------------------------------------------------

DEFENSE JUSTIFICATION:
The latency per transaction is measured in microseconds, proving the system
is capable of real-time identity resolution at production scale.
