# Step 0: Regenerate Synthetic Data with Noisy Duplicates and Feature Leakage Check
This notebook generates synthetic customer records together with noisy duplicates, in which key identifiers (such as phone and insurance number) are slightly modified. It then builds labelled record pairs and ends with a quick feature-leakage analysis.

In [None]:
import pandas as pd
import random
from faker import Faker
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Shared Faker instance; generate_record draws every synthetic field from it.
fake = Faker()
# Directory the generated files are written to; exist_ok makes re-runs safe.
Path('data').mkdir(exist_ok=True)

def perturb_number(val, delta=50):
    """Return ``val`` shifted by a random integer offset in ``[-delta, delta]``.

    Three cases are handled:

    * ``val`` converts with ``int()`` (ints, floats, plain digit strings):
      the result is ``int(val) + offset`` and is returned as an ``int``.
    * ``val`` is a string that does not convert (for example a formatted
      phone number such as ``"(555) 123-4567"``): the last run of ASCII
      digits is shifted by the offset, wrapping modulo ``10 ** width`` so
      that the run keeps its width and leading zeros. Everything else in
      the string is left untouched and a ``str`` is returned.
    * anything else (``None``, strings without digits, ...) is returned
      unchanged.

    Args:
        val: Value to perturb.
        delta: Maximum absolute offset; must be a non-negative integer.

    Returns:
        The perturbed value, or ``val`` itself when it holds no number.

    Raises:
        ValueError: If ``delta`` is negative (raised by ``random.randint``)
            and ``val`` contains a number to perturb.
    """
    # Only the conversion is guarded, and only for the errors int() raises;
    # a bare except would also hide KeyboardInterrupt and real bugs.
    try:
        number = int(val)
    except (TypeError, ValueError, OverflowError):
        number = None
    if number is not None:
        return number + random.randint(-delta, delta)

    if not isinstance(val, str):
        return val

    # Locate the trailing run of digits, e.g. "4567" in "(555) 123-4567".
    digits = '0123456789'
    end = len(val)
    while end > 0 and val[end - 1] not in digits:
        end -= 1
    if end == 0:
        return val  # no digits at all: nothing to perturb
    start = end
    while start > 0 and val[start - 1] in digits:
        start -= 1

    width = end - start
    # Wrap instead of growing/shrinking so the original formatting survives.
    shifted = (int(val[start:end]) + random.randint(-delta, delta)) % 10 ** width
    return f"{val[:start]}{shifted:0{width}d}{val[end:]}"

def generate_record(entity_id, customer_type, similar_to=None):
    """Create one synthetic customer record as a dict.

    A complete record is always drawn first from the shared ``fake``
    instance and the ``random`` module. When ``similar_to`` is a non-empty
    record, some fields are then overwritten so the new record resembles it:
    ``Last Name`` and ``Job`` are copied, ``City``, ``Email`` and ``Phone``
    are each either derived from the reference or replaced by a fresh fake
    value depending on a random draw, and ``Insurance No`` is the
    reference's number passed through ``perturb_number``.

    Args:
        entity_id: Value stored under ``'Entity ID'``.
        customer_type: Value stored under ``'Customer Type'``.
        similar_to: Optional reference record to resemble.

    Returns:
        dict mapping column name to value.
    """
    record = {
        'Entity ID': entity_id,
        'First Name': fake.first_name(),
        'Last Name': fake.last_name(),
        'Birthdate': fake.date_of_birth(minimum_age=18, maximum_age=80).strftime('%Y-%m-%d'),
        'Gender': random.choice(['Male', 'Female']),
        'Job': fake.job(),
        'Email': fake.email(),
        'Phone': fake.phone_number(),
        'Address': fake.address().replace('\n', ' '),
        'City': fake.city(),
        'Country': fake.country(),
        'ID Number': fake.unique.random_int(min=100000, max=999999),
        'Insurance No': fake.unique.random_int(min=100000, max=999999),
        'Marital Status': random.choice(['Single', 'Married', 'Divorced']),
        'Nationality': fake.country(),
        'Notes': fake.sentence(),
        'Customer Type': customer_type
    }

    # Nothing to imitate: the freshly drawn record is the result.
    if not similar_to:
        return record

    # Fields that are always shared with the reference record.
    record['Last Name'] = similar_to['Last Name']
    record['Job'] = similar_to['Job']

    # Each random draw happens before the corresponding fake value is
    # generated, so the fallback is only drawn when it is actually needed.
    if random.random() > 0.5:
        record['City'] = similar_to['City']
    else:
        record['City'] = fake.city()

    if random.random() > 0.6:
        record['Email'] = similar_to['Email']
    else:
        record['Email'] = fake.email()

    if random.random() > 0.5:
        record['Phone'] = perturb_number(similar_to['Phone'], 20)
    else:
        record['Phone'] = fake.phone_number()

    record['Insurance No'] = perturb_number(similar_to['Insurance No'], 100)
    return record

In [None]:
def generate_dataset(seed, n_unique, n_duplicates_per_unique, n_similar):
    """Build a seeded synthetic customer table as a DataFrame.

    The table contains, in order:

    * for each entity id ``0 .. n_unique - 1``: one ``'unique'`` record
      followed by ``n_duplicates_per_unique`` ``'duplicate'`` records that
      share its entity id and are generated to resemble it;
    * ``n_similar`` ``'similar'`` records, each resembling a randomly chosen
      ``'unique'`` record but carrying its own new entity id.

    Args:
        seed: Seed applied to both Faker and the ``random`` module.
        n_unique: Number of distinct base entities.
        n_duplicates_per_unique: Noisy duplicates generated per base entity.
        n_similar: Number of look-alike records with distinct entity ids.

    Returns:
        pandas.DataFrame with one row per generated record.

    Raises:
        IndexError: If ``n_similar > 0`` while ``n_unique == 0`` (there is
            no base record to imitate).
    """
    Faker.seed(seed)
    random.seed(seed)
    # Reset Faker's uniqueness memory so repeated calls start from scratch.
    fake.unique.clear()

    # `records` interleaves each original with its duplicates, so the
    # originals are tracked separately; slicing records[:n_unique] would
    # pick up duplicates and only cover the first few entities.
    originals = []
    records = []
    for entity_id in range(n_unique):
        original = generate_record(entity_id, 'unique')
        originals.append(original)
        records.append(original)
        records.extend(
            generate_record(entity_id, 'duplicate', similar_to=original)
            for _ in range(n_duplicates_per_unique)
        )

    # Similar records get ids past the range used above, so they never
    # share an Entity ID with the record they imitate.
    first_similar_id = n_unique * (1 + n_duplicates_per_unique)
    for offset in range(n_similar):
        reference = random.choice(originals)
        records.append(
            generate_record(first_similar_id + offset, 'similar', similar_to=reference)
        )
    return pd.DataFrame(records)

def generate_pairs(df, negative_ratio=0.7):
    """Build labelled record pairs for duplicate detection.

    Positive pairs (``is_duplicate == 1``) join every ``'unique'`` row with
    every ``'duplicate'`` row that has the same ``'Entity ID'``. Negative
    pairs (``is_duplicate == 0``) are random pairs of rows with different
    entity ids, sampled until negatives make up ``negative_ratio`` of all
    pairs. Negatives are sampled independently, so the same pair can appear
    more than once.

    Args:
        df: DataFrame with ``'Entity ID'`` and ``'Customer Type'`` columns.
            Pair indices refer to ``df.index`` labels.
        negative_ratio: Target share of negative pairs, ``0 <= ratio < 1``.

    Returns:
        pandas.DataFrame with columns ``record1_index``, ``record2_index``,
        ``record1_id``, ``record2_id`` and ``is_duplicate``. The columns are
        present even when no pair was generated.

    Raises:
        ValueError: If ``negative_ratio`` is outside ``[0, 1)``, or if
            negatives are required but ``df`` holds fewer than two distinct
            entity ids (sampling could never succeed).
    """
    if not 0 <= negative_ratio < 1:
        raise ValueError(
            f"negative_ratio must satisfy 0 <= negative_ratio < 1, got {negative_ratio!r}"
        )

    columns = ['record1_index', 'record2_index', 'record1_id', 'record2_id', 'is_duplicate']
    pairs = []
    for entity_id, group in df.groupby('Entity ID'):
        uniques = group[group['Customer Type'] == 'unique'].index
        duplicates = group[group['Customer Type'] == 'duplicate'].index
        pairs.extend(
            {
                'record1_index': i,
                'record2_index': j,
                'record1_id': entity_id,
                'record2_id': entity_id,
                'is_duplicate': 1,
            }
            for i in uniques
            for j in duplicates
        )

    pos_count = len(pairs)
    neg_count = int(pos_count * negative_ratio / (1 - negative_ratio))
    if neg_count <= 0:
        return pd.DataFrame(pairs, columns=columns)

    # Without two distinct entities the rejection loop below cannot finish.
    if df['Entity ID'].nunique(dropna=False) < 2:
        raise ValueError(
            "cannot sample negative pairs: need at least two distinct 'Entity ID' values"
        )

    # One dict lookup per row instead of repeated df.loc calls in the loop.
    entity_of = df['Entity ID'].to_dict()
    all_indices = df.index.tolist()
    target = pos_count + neg_count
    while len(pairs) < target:
        i, j = random.sample(all_indices, 2)
        if entity_of[i] != entity_of[j]:
            pairs.append({
                'record1_index': i,
                'record2_index': j,
                'record1_id': entity_of[i],
                'record2_id': entity_of[j],
                'is_duplicate': 0,
            })
    return pd.DataFrame(pairs, columns=columns)

In [None]:
# Build the train and test customer tables. generate_dataset reseeds both
# Faker and `random` with the given seed on every call.
train_df = generate_dataset(seed=42, n_unique=300, n_duplicates_per_unique=3, n_similar=100)
test_df = generate_dataset(seed=99, n_unique=60, n_duplicates_per_unique=2, n_similar=20)
# Persist the raw records (index dropped) into the data/ directory.
train_df.to_excel('data/train_customers.xlsx', index=False)
test_df.to_excel('data/test_customers.xlsx', index=False)
print('Noisy datasets saved.')

# Negative pairs are sampled with the module-level `random`, which is not
# reseeded here: its state is whatever the seed=99 generation left behind,
# so the order of the calls above and below affects the sampled pairs.
train_pairs = generate_pairs(train_df)
test_pairs = generate_pairs(test_df)
train_pairs.to_csv('data/train_pairs.csv', index=False)
test_pairs.to_csv('data/test_pairs.csv', index=False)
# NOTE(review): generate_pairs defaults to negative_ratio=0.7, so the pairs
# are skewed towards negatives rather than strictly balanced.
print('Balanced pairs saved.')

In [None]:
# Quick leakage analysis: for each binary match feature, show how the label
# is distributed per feature value. A feature value whose row is (almost)
# entirely one label separates the classes on its own.
print("\n🔍 Checking potential leakage in binary features...")
import seaborn as sns
from matplotlib import pyplot as plt
# Feature columns to inspect; they must exist in the feature matrix below.
leak_check_cols = ['insurance_match', 'phone_match', 'gender_match']
# NOTE(review): output/feature_matrix.csv is not written by any cell visible
# here — presumably produced by a later feature-engineering step; confirm it
# exists before running this cell.
df = pd.read_csv('output/feature_matrix.csv')
for col in leak_check_cols:
    # normalize='index' turns each row into label shares for that feature value.
    ct = pd.crosstab(df[col], df['is_duplicate'], normalize='index')
    print(f"\nFeature: {col}")
    print(ct)
    # Stacked histogram of the feature, coloured by label.
    sns.histplot(data=df, x=col, hue='is_duplicate', multiple='stack', bins=3)
    plt.title(f"{col} by is_duplicate")
    plt.show()