In [1]:
import invoice_generator
import preprocess
from sklearn import model_selection

# 1. Data Generation


In [2]:
# Settings for the synthetic invoice run, gathered in one place so they are
# easy to scan and tweak. The seed is fixed so reruns use the same settings.
# NOTE(review): the meaning of `test=True` is defined inside invoice_generator
# and is not visible from this notebook — confirm before changing it.
dataset_options = {
    "num_invoices": 10000,
    "num_merchants": 1000,
    "general_anomaly_rate": 0.3,
    "line_anomaly_rate": 0.05,
    "seed": 42,
    "test": True,
}
data = invoice_generator.generate_dataset(**dataset_options)

Generating 10000 synthetic invoices across a set of 1000 merchants (general anomaly rate: 30.0%, line anomaly rate: 5.0%)...
Analysis of 10525 invoices:
- Unique merchants: 916
- Total line items: 36267
- Average line items per invoice: 3.45

Field frequency:
- merchant: 10525 (100.0%)
- invoice_date: 10525 (100.0%)
- merchant_branch: 10525 (100.0%)
- merchant_chain: 10525 (100.0%)
- due_date: 10525 (100.0%)
- payment_terms: 10525 (100.0%)
- grand_total: 10525 (100.0%)
- tax: 10525 (100.0%)
- po_number: 10525 (100.0%)
- merchant_address: 10525 (100.0%)
- payment_method: 10525 (100.0%)
- country: 10525 (100.0%)
- currency: 10525 (100.0%)
- line_details: 10525 (100.0%)
Generated 10525 synthetic invoices and saved to synthetic_invoices.json
Saved invoices in JSONL format to synthetic_invoices.jsonl


In [3]:
# Show the first generated record to inspect the structure of one invoice.
data[0]

{'extractions': [{'field': 'merchant', 'value': 'Write Laboratory'},
  {'field': 'invoice_date', 'value': '12/10/2024'},
  {'field': 'merchant_branch', 'value': 'Write Laboratory'},
  {'field': 'merchant_chain', 'value': 'Write Laboratory'},
  {'field': 'due_date', 'value': '01/09/2025'},
  {'field': 'payment_terms', 'value': 'NET 30 DAYS'},
  {'field': 'grand_total', 'value': '105688.50'},
  {'field': 'tax', 'value': '5979.88'},
  {'field': 'po_number', 'value': '226656'},
  {'field': 'merchant_address',
   'value': '79163 Rebecca Forks Suite 246 West Thomastown VA 02369 USA'},
  {'field': 'payment_method', 'value': 'Check'},
  {'field': 'country', 'value': 'US'},
  {'field': 'currency', 'value': 'USD'},
  {'field': 'line_details',
   'value': [{'line_count': '1',
     'line_description': 'X-6906 Compact Chemical',
     'line_qty': '50',
     'line_tax': '3106.62',
     'line_total': '51776.94',
     'model': 'X-6906'},
    {'line_count': '2',
     'line_description': 'M-26501 Lightwe

# 2. Feature Engineering


In [4]:
# Hold out 20% of the generated invoices for testing; the fixed random_state
# keeps the partition the same from run to run.
train, test = model_selection.train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Report both split sizes, one per line.
print(
    f"Training set size: {len(train)}",
    f"Test set size: {len(test)}",
    sep="\n",
)

Training set size: 8420
Test set size: 2105


In [5]:
# Run the same preprocessing call over each split, training split first and
# test split second, and bind the results in that order.
# NOTE(review): what `test=True` toggles lives inside preprocess.process_invoice
# and is not visible from this notebook — confirm before changing it.
train_df, test_df = [
    preprocess.process_invoice(invoice_split, test=True)
    for invoice_split in (train, test)
]

In [6]:
# Preview the first rows of the processed training split.
train_df.head()

Unnamed: 0,_ANOMALY_TYPES_DROP_BEFORE_TRAINING_,is_anomalous,merchant,invoice_date,merchant_branch,merchant_chain,due_date,payment_terms,grand_total,tax,...,merchant_mismatch_flag,duplicate_invoice_flag,invoice_age,actual_tax_rate,expected_tax_rate,expected_tax,avg_description_similarity,invoice_similarity,payment_terms_numeric,state
0,[],0,Pope GmbH,2025-03-31,Pope GmbH,Pope GmbH,2025-04-30,NET 30 DAYS,3890.83,0.0,...,False,False,30,0.0,0.0,0.0,0.015204,0.653181,30,DE
1,[merchant_mismatch],1,Gonzalez Solutions,2025-04-06,Gonzalez Solutions,Lee & Norton Enterprises,2025-05-06,NET 30 DAYS,599.88,0.0,...,True,False,30,0.0,0.0,0.0,0.014831,0.636066,30,VI
2,[state_tax_mismatch],1,AcceptWalk,2025-03-09,AcceptWalk,AcceptWalk,2025-04-08,NET 30 DAYS,16225.74,622.85,...,False,False,30,0.039919,0.06,936.1734,0.017363,0.640247,30,SC
3,[merchant_mismatch],1,DemocraticHit,2025-03-11,DemocraticHit,Gonzales Co.,2025-02-15,DUE ON RECEIPT,7191.03,405.85,...,True,True,-24,0.059814,0.06,407.1108,0.022414,0.994365,0,WV
4,[tax_calc],1,Harrington-Vang Inc.,2025-02-25,Harrington-Vang Inc.,Harrington-Vang Inc.,2025-04-11,NET 45 DAYS,499712.17,41883.85,...,False,False,45,0.091484,0.061,27927.52752,0.023669,0.66134,45,UT


In [7]:
# Preview the first rows of the processed test split.
test_df.head()

Unnamed: 0,_ANOMALY_TYPES_DROP_BEFORE_TRAINING_,is_anomalous,merchant,invoice_date,merchant_branch,merchant_chain,due_date,payment_terms,grand_total,tax,...,merchant_mismatch_flag,duplicate_invoice_flag,invoice_age,actual_tax_rate,expected_tax_rate,expected_tax,avg_description_similarity,invoice_similarity,payment_terms_numeric,state
0,[state_tax_mismatch],1,American Electronics Systems,2025-02-11,American Electronics Systems,American Electronics Systems,2025-02-11,DUE ON RECEIPT,335.31,16.93,...,False,False,0,0.053175,0.05,15.919,0.013686,0.631467,0,WI
1,[],0,Washington GmbH,2025-03-29,Washington GmbH,Washington GmbH,2025-04-28,NET 30 DAYS,13671.68,850.07,...,False,False,30,0.0663,0.0663,850.072743,0.024055,0.652495,30,NJ
2,[],0,Imagine Medical Co.,2025-02-08,Imagine Medical Co.,Imagine Medical Co.,2025-03-10,NET 30 DAYS,3305.79,187.12,...,False,False,30,0.06,0.06,187.1202,0.019909,0.656128,30,VT
3,[],1,Hayes-Bradley,2025-04-01,Hayes-Bradley,Hayes-Bradley,2025-04-18,NET 30 DAYS,958.49,62.71,...,False,False,17,0.070006,0.07,62.7046,0.013981,0.571268,30,MS
4,[],0,Gordon,2025-01-09,Gordon,Gordon,2025-02-23,NET 45 DAYS,3297.13,184.23,...,False,False,45,0.059183,0.06,186.774,0.01533,0.560855,45,WV


In [8]:
# Persist both processed splits as Parquet files, training split first.
# The paths are relative, so the files are written to the current working directory.
for processed_frame, parquet_path in (
    (train_df, "train_df.parquet"),
    (test_df, "test_df.parquet"),
):
    processed_frame.to_parquet(parquet_path)