In [30]:
import random
import re
from faker import Faker
import pandas as pd
from collections import defaultdict

# ── CONFIG ───────────────────────────────────────────────────────────
TOTAL_INVOICES   = 3000
TOTAL_PAIRS      = 5000
DUP_RATIO        = 0.08      # 8% true duplicates
N_DUP            = int(TOTAL_PAIRS * DUP_RATIO)
N_LEGIT          = TOTAL_PAIRS - N_DUP
N_BORDERLINE     = int(N_LEGIT * 0.6)   # 60% of negatives are borderline
N_EASY_NEG       = N_LEGIT - N_BORDERLINE

SEV_LOW, SEV_HIGH = 0.3, 0.6   # narrower noise band
P_ALIAS, P_DROP   = 0.1, 0.05  # alias and field‐drop probs

fake = Faker(); Faker.seed(42); random.seed(42)

# ── BUILD A SMALL VENDOR POOL + ALIASES ─────────────────────────────
# generate 500 unique vendor names
vendor_pool = [fake.company() for _ in range(500)]
# simple alias map: append “Inc.”, “Co.” or abbreviate
alias_map = {}
for v in vendor_pool:
    alias_map[v] = random.choice([
        v,
        v.replace("Corporation", "Corp"),
        v + " Inc",
        v[:min(10, len(v))] + " Co"
    ])

# ── HELPERS ─────────────────────────────────────────────────────────
def sample_severity():
    return random.random() * (SEV_HIGH - SEV_LOW) + SEV_LOW

def jitter_amount(x, sev):
    return round(x * (1 + random.uniform(-0.03*sev, 0.03*sev)), 2)

def inject_typo(s, sev):
    out = list(s)
    for i in range(len(out)-1):
        if random.random() < 0.05 * sev:
            out[i], out[i+1] = out[i+1], out[i]
    return "".join(out)

def inject_ocr(s, sev):
    if random.random() < 0.1*sev: s = re.sub(r'O','0',s)
    if random.random() < 0.1*sev: s = re.sub(r'0','O',s)
    return s

def format_date(dt, sev):
    fmts = ["%Y-%m-%d","%d/%m/%Y","%b %d, %Y","%m-%d-%Y"]
    return dt.strftime(random.choice(fmts))

# ── INVOICE SCHEMA ─────────────────────────────────────────────────
def generate_invoice(i):
    vendor = random.choice(vendor_pool)
    return {
        # no DOC_NO field now
        'VENDOR_NAME'   : vendor,
        'VENDOR_ID'     : f"VE{vendor_pool.index(vendor)+1000}",
        'AMOUNT'        : round(random.uniform(100,3000),2),
        'CURRENCY'      : random.choice(['USD','EUR','GBP']),
        'INVOICE_DATE'  : fake.date_between('-1y','today'),
        'DESCRIPTION'   : fake.sentence(nb_words=6),
        'PURCHASE_ORDER': f"PO{random.randint(1000,9999)}",
        'COMPANY_CODE'  : random.choice(['1000','2000']),
        'COST_CENTER'   : f"CC{random.randint(100,999)}",
        'TAX_CODE'      : random.choice(['A0','B1']),
        'PAYMENT_TERMS' : random.choice(['NET30','NET60'])
    }

# ── MAKE A NOISY DUPLICATE ───────────────────────────────────────────
def make_duplicate(orig, sev):
    dup = orig.copy()
    # 1) amount jitter
    dup['AMOUNT'] = jitter_amount(orig['AMOUNT'], sev)
    # 2) vendor alias or noise
    if random.random()<P_ALIAS:
        dup['VENDOR_NAME'] = alias_map[orig['VENDOR_NAME']]
    else:
        dup['VENDOR_NAME'] = inject_ocr(inject_typo(orig['VENDOR_NAME'], sev), sev)
    # 3) description typo
    if random.random()<0.5*sev:
        dup['DESCRIPTION'] = inject_typo(orig['DESCRIPTION'], sev)
    # 4) date format
    dup['INVOICE_DATE'] = format_date(orig['INVOICE_DATE'], sev)
    # 5) maybe drop cost center
    if random.random()<P_DROP:
        dup['COST_CENTER'] = None
    return dup

# ── MAKE BORDERLINE NEGATIVE ────────────────────────────────────────
def make_borderline_negative(orig, pool, sev):
    # same vendor, amount ±(1%×sev), diff PO/date
    candidates = [
        inv for inv in pool
        if inv['VENDOR_NAME']==orig['VENDOR_NAME']
           and abs(inv['AMOUNT']-orig['AMOUNT'])<orig['AMOUNT']*0.01*sev
           and inv['PURCHASE_ORDER']!=orig['PURCHASE_ORDER']
    ]
    if candidates:
        cand = random.choice(candidates)
    else:
        cand = random.choice(pool)
    return make_duplicate(cand, sev*0.7)

# ── GENERATE INVOICES & PAIRS ───────────────────────────────────────
invoices = [generate_invoice(i) for i in range(TOTAL_INVOICES)]

pairs, labels = [], []

# 1) True duplicates
for _ in range(N_DUP):
    inv1 = random.choice(invoices)
    sev  = sample_severity()
    pairs.append((inv1, make_duplicate(inv1, sev)))
    labels.append(1)

# 2) Easy negatives
for _ in range(N_EASY_NEG):
    a,b = random.choice(invoices), random.choice(invoices)
    while a['VENDOR_NAME']==b['VENDOR_NAME'] and abs(a['AMOUNT']-b['AMOUNT'])<5:
        b = random.choice(invoices)
    pairs.append((a,b)); labels.append(0)

# 3) Borderline negatives
for _ in range(N_BORDERLINE):
    inv1 = random.choice(invoices)
    sev  = sample_severity()
    inv2 = make_borderline_negative(inv1, invoices, sev)
    pairs.append((inv1,inv2)); labels.append(0)

# ── FLATTEN & SAVE ─────────────────────────────────────────────────
df_pairs = pd.DataFrame([
    {
      **{f'INV1_{k}':v for k,v in i1.items()},
      **{f'INV2_{k}':v for k,v in i2.items()},
      'label':lbl
    }
    for (i1,i2),lbl in zip(pairs,labels)
])

df_pairs.to_csv('synthetic_invoice_pairs.csv', index=False)
print("Done:", df_pairs.shape)


Done: (5000, 23)


In [31]:
df_pairs.head(5)

Unnamed: 0,INV1_VENDOR_NAME,INV1_VENDOR_ID,INV1_AMOUNT,INV1_CURRENCY,INV1_INVOICE_DATE,INV1_DESCRIPTION,INV1_PURCHASE_ORDER,INV1_COMPANY_CODE,INV1_COST_CENTER,INV1_TAX_CODE,...,INV2_AMOUNT,INV2_CURRENCY,INV2_INVOICE_DATE,INV2_DESCRIPTION,INV2_PURCHASE_ORDER,INV2_COMPANY_CODE,INV2_COST_CENTER,INV2_TAX_CODE,INV2_PAYMENT_TERMS,label
0,Cooper Ltd,VE1489,633.22,USD,2025-01-31,Heavy town money.,PO9276,1000,CC525,B1,...,626.6,USD,2025-01-31,Heavy town money.,PO9276,1000,CC525,B1,NET30,1
1,"Diaz, Anderson and Browning",VE1316,1369.83,GBP,2024-09-16,Military place edge environmental even eye mes...,PO8146,1000,CC275,A0,...,1368.64,GBP,16/09/2024,Military place edge environmental even eye mes...,PO8146,1000,CC275,A0,NET60,1
2,Patton-Jenkins,VE1110,2460.07,EUR,2024-12-27,Perhaps lawyer interest star his difficult.,PO2577,2000,CC862,B1,...,2464.33,EUR,27/12/2024,Perhaps lawyer interest star his difficult.,PO2577,2000,CC862,B1,NET60,1
3,Graham Group,VE1404,1904.09,GBP,2024-11-08,Feeling surface girl expert report include.,PO7972,1000,CC882,A0,...,1922.51,GBP,11-08-2024,Feeling surface girl expert report include.,PO7972,1000,CC882,A0,NET60,1
4,Sandoval-Cunningham,VE1081,1850.76,EUR,2024-06-30,Service throughout spring film look during mys...,PO4325,2000,CC382,A0,...,1825.69,EUR,06-30-2024,Service throughout spring film look during mys...,PO4325,2000,CC382,A0,NET30,1
