In [1]:
import invoice_generator
import preprocess
from sklearn import model_selection

# 1. Data Generation


In [2]:
# Settings for the synthetic invoice dataset, gathered in one mapping so they
# can be reviewed and tuned together.
generation_settings = {
    "num_invoices": 10000,
    "num_merchants": 1000,
    "general_anomaly_rate": 0.3,
    "line_anomaly_rate": 0.05,
    "seed": 42,
    "test": True,
}

# Per the log printed below, this call reports an analysis of the generated
# invoices and writes synthetic_invoices.json and synthetic_invoices.jsonl.
# NOTE(review): the log reports 10541 invoices for num_invoices=10000 --
# presumably the generator adds extra (e.g. duplicate) records; confirm in
# invoice_generator.
data = invoice_generator.generate_dataset(**generation_settings)

Generating 10000 synthetic invoices across a set of 1000 merchants (general anomaly rate: 30.0%, line anomaly rate: 5.0%)...
Analysis of 10541 invoices:
- Unique merchants: 912
- Total line items: 36601
- Average line items per invoice: 3.47

Field frequency:
- merchant: 10541 (100.0%)
- invoice_date: 10541 (100.0%)
- merchant_branch: 10541 (100.0%)
- merchant_chain: 10541 (100.0%)
- due_date: 10541 (100.0%)
- payment_terms: 10541 (100.0%)
- grand_total: 10541 (100.0%)
- tax: 10541 (100.0%)
- po_number: 10541 (100.0%)
- merchant_address: 10541 (100.0%)
- payment_method: 10541 (100.0%)
- country: 10541 (100.0%)
- currency: 10541 (100.0%)
- line_details: 10541 (100.0%)
Generated 10541 synthetic invoices and saved to synthetic_invoices.json
Saved invoices in JSONL format to synthetic_invoices.jsonl


In [3]:
# Peek at the first generated record to see the raw schema: per the output
# below, a dict whose 'extractions' entry is a list of field/value pairs.
data[0]

{'extractions': [{'field': 'merchant', 'value': 'Write Laboratory'},
  {'field': 'invoice_date', 'value': '12/10/2024'},
  {'field': 'merchant_branch', 'value': 'Write Laboratory'},
  {'field': 'merchant_chain', 'value': 'Write Laboratory'},
  {'field': 'due_date', 'value': '01/09/2025'},
  {'field': 'payment_terms', 'value': 'NET 30 DAYS'},
  {'field': 'grand_total', 'value': '105688.50'},
  {'field': 'tax', 'value': '5979.88'},
  {'field': 'po_number', 'value': '226656'},
  {'field': 'merchant_address',
   'value': '79163 Rebecca Forks Suite 246 West Thomastown VA 02369 USA'},
  {'field': 'payment_method', 'value': 'Check'},
  {'field': 'country', 'value': 'US'},
  {'field': 'currency', 'value': 'USD'},
  {'field': 'line_details',
   'value': [{'line_count': '1',
     'line_description': 'X-6906 Compact Chemical',
     'line_qty': '50',
     'line_tax': '3106.62',
     'line_total': '51776.94',
     'model': 'X-6906'},
    {'line_count': '2',
     'line_description': 'M-26501 Lightwe

# 2. Train/Test Split and Feature Engineering


In [4]:
# Hold out a fixed fraction of the raw invoice records for evaluation.
TEST_FRACTION = 0.2
SPLIT_SEED = 42  # fixed so the shuffle, and therefore the split, is reproducible

train, test = model_selection.train_test_split(
    data, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Sanity check: the two subsets must partition the dataset exactly.
assert len(train) + len(test) == len(data), "train/test split lost or duplicated records"

# NOTE(review): records are split individually at random. The engineered
# features include duplicate_invoice_flag, so if the generator injects
# duplicate invoices, a pair may end up straddling the split -- confirm
# whether a grouped split is needed.
print(f"Training set size: {len(train)}")
print(f"Test set size: {len(test)}")

Training set size: 8432
Test set size: 2109


In [5]:
# Run the same preprocessing over each split, training split first. The
# results are used as DataFrames below (head(), to_parquet()).
# NOTE(review): `test=True` presumably keeps diagnostic columns such as
# `_ANOMALY_TYPES_DROP_BEFORE_TRAINING_` (visible in the head() output) --
# confirm in preprocess.
# NOTE(review): each split is processed independently, so cross-invoice
# features (e.g. duplicate_invoice_flag, invoice_similarity) presumably only
# see invoices from the same split -- confirm this is intended.
train_df, test_df = (
    preprocess.process_invoice(split, test=True) for split in (train, test)
)

In [6]:
# Preview the engineered training features. As its name says, the
# `_ANOMALY_TYPES_DROP_BEFORE_TRAINING_` column shown here must be dropped
# before the frame is used to fit a model.
train_df.head()

Unnamed: 0,_ANOMALY_TYPES_DROP_BEFORE_TRAINING_,is_anomalous,merchant,invoice_date,merchant_branch,merchant_chain,due_date,payment_terms,grand_total,tax,...,merchant_mismatch_flag,duplicate_invoice_flag,invoice_age,actual_tax_rate,expected_tax_rate,expected_tax,avg_description_similarity,invoice_similarity,payment_terms_numeric,state
0,[],0,Espinoza,2025-03-16,Espinoza,Espinoza,2025-04-15,NET 30 DAYS,1762.16,83.91,...,False,False,30,0.049999,0.05,83.9125,0.021412,0.716714,30,ND
1,[due_date_anomaly],1,Norris & Morris,2025-03-30,Norris & Morris,Norris & Morris,2025-04-21,NET 30 DAYS,3943.87,169.83,...,False,False,22,0.045,0.045,169.8318,0.019552,0.643982,30,OK
2,[varied_wordings],1,Adams Automotive,2024-11-11,Adams Automotive,Adams Automotive,2024-11-11,DUE ON RECEIPT,15527.86,0.0,...,False,False,0,0.0,0.0,0.0,0.018506,0.645085,0,PR
3,[],0,SourceA,2024-10-23,SourceA,SourceA,2024-10-23,DUE ON RECEIPT,18515.3,881.68,...,False,False,0,0.05,0.05,881.681,0.018009,0.65389,0,WI
4,[],0,AllowWhile Electronics Enterprises,2025-04-03,AllowWhile Electronics Enterprises,AllowWhile Electronics Enterprises,2025-05-03,NET 30 DAYS,1354.73,76.69,...,False,False,30,0.060006,0.06,76.6824,0.011894,0.757758,30,KY


In [7]:
# Preview the engineered test features; the columns should match the training
# frame previewed above.
test_df.head()

Unnamed: 0,_ANOMALY_TYPES_DROP_BEFORE_TRAINING_,is_anomalous,merchant,invoice_date,merchant_branch,merchant_chain,due_date,payment_terms,grand_total,tax,...,merchant_mismatch_flag,duplicate_invoice_flag,invoice_age,actual_tax_rate,expected_tax_rate,expected_tax,avg_description_similarity,invoice_similarity,payment_terms_numeric,state
0,[due_date_anomaly],1,EmployeeRole,2025-01-30,EmployeeRole,EmployeeRole,2025-02-27,NET 30 DAYS,22644.06,0.0,...,False,False,28,0.0,0.0,0.0,0.01922,0.669004,30,FM
1,[],1,FieldEstablish,2025-05-10,FieldEstablish,FieldEstablish,2025-05-27,NET 45 DAYS,33774.79,2016.64,...,False,True,17,0.0635,0.0635,2016.642525,0.024218,0.994874,45,CT
2,[due_date_anomaly],1,Gregory-Sanders,2025-02-15,Gregory-Sanders,Gregory-Sanders,2025-03-01,NET 15 DAYS,974605.56,37483.43,...,False,False,14,0.039998,0.04,37484.8852,0.023637,0.67396,15,GA
3,[],0,ChildUs Technology,2025-04-01,ChildUs Technology,ChildUs Technology,2025-04-01,DUE ON RECEIPT,26746.65,1746.83,...,False,False,0,0.069874,0.07,1749.9874,0.016722,0.572401,0,TN
4,"[phantom_item, due_date_anomaly]",1,Small Automotive Inc.,2025-04-02,Small Automotive Inc.,Small Automotive Inc.,2025-05-09,NET 30 DAYS,108371.78,7089.75,...,False,False,37,0.07,0.07,7089.7421,0.016923,0.630189,30,TN


In [8]:
# Persist both engineered feature tables as Parquet files, using paths
# relative to the current working directory; training frame first.
for frame, destination in (
    (train_df, "train_df.parquet"),
    (test_df, "test_df.parquet"),
):
    frame.to_parquet(destination)