In [1]:
import invoice_generator
import preprocess
from sklearn import model_selection

# 1. Data Generation


In [2]:
# Settings for the synthetic invoice generator, gathered in one mapping so they
# are easy to scan and tweak. In the recorded output below, this configuration
# yielded 10536 invoices (more than the 10000 requested) from 898 distinct
# merchants, and the generator reported saving synthetic_invoices.json and
# synthetic_invoices.jsonl.
generation_kwargs = {
    "num_invoices": 10000,
    "num_merchants": 1000,
    "general_anomaly_rate": 0.3,
    "line_anomaly_rate": 0.05,
    "seed": 42,  # fixed seed passed to the generator
    "output_types": True,
}
data = invoice_generator.generate_dataset(**generation_kwargs)

Generating 10000 synthetic invoices across a set of 1000 merchants (general anomaly rate: 30.0%, line anomaly rate: 5.0%)...
Analysis of 10536 invoices:
- Unique merchants: 898
- Total line items: 36548
- Average line items per invoice: 3.47

Field frequency:
- merchant: 10536 (100.0%)
- invoice_date: 10536 (100.0%)
- merchant_branch: 10536 (100.0%)
- merchant_chain: 10536 (100.0%)
- due_date: 10536 (100.0%)
- payment_terms: 10536 (100.0%)
- grand_total: 10536 (100.0%)
- tax: 10536 (100.0%)
- po_number: 10536 (100.0%)
- merchant_address: 10536 (100.0%)
- payment_method: 10536 (100.0%)
- country: 10536 (100.0%)
- currency: 10536 (100.0%)
- line_details: 10536 (100.0%)
Generated 10536 synthetic invoices and saved to synthetic_invoices.json
Saved invoices in JSONL format to synthetic_invoices.jsonl


In [3]:
# Peek at the first generated record to see its structure. As the output shows,
# a record is a dict with an 'extractions' list of {'field', 'value'} pairs;
# the 'line_details' field carries a list of per-line dicts.
data[0]

{'extractions': [{'field': 'merchant', 'value': 'Write Laboratory'},
  {'field': 'invoice_date', 'value': '12/12/2024'},
  {'field': 'merchant_branch', 'value': 'Write Laboratory'},
  {'field': 'merchant_chain', 'value': 'Write Laboratory'},
  {'field': 'due_date', 'value': '01/11/2025'},
  {'field': 'payment_terms', 'value': 'NET 30 DAYS'},
  {'field': 'grand_total', 'value': '105688.50'},
  {'field': 'tax', 'value': '5979.88'},
  {'field': 'po_number', 'value': '226656'},
  {'field': 'merchant_address',
   'value': '79163 Rebecca Forks Suite 246 West Thomastown VA 02369 USA'},
  {'field': 'payment_method', 'value': 'Check'},
  {'field': 'country', 'value': 'US'},
  {'field': 'currency', 'value': 'USD'},
  {'field': 'line_details',
   'value': [{'line_count': '1',
     'line_description': 'X-6906 Compact Chemical',
     'line_qty': '50',
     'line_tax': '3106.62',
     'line_total': '51776.94'},
    {'line_count': '2',
     'line_description': 'M-26501 Lightweight Glassware',
     'l

# 2. Feature Engineering


In [4]:
# Hold out 20% of the raw invoice records for evaluation. The fixed
# random_state keeps the partition repeatable between runs.
# NOTE(review): no `stratify=` is passed, so the share of anomalous invoices
# in each split is left to chance — confirm that is acceptable.
train, test = model_selection.train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

# Report both split sizes in one call; the printed text is unchanged.
print(
    f"Training set size: {len(train)}",
    f"Test set size: {len(test)}",
    sep="\n",
)

Training set size: 8428
Test set size: 2108


In [5]:
# First pass, over the training split: no vectorizers are supplied, and the
# call hands back the feature frame plus two (vectorizer, vectors) pairs —
# one for invoices and one for line items — which are unpacked for reuse.
(
    train_df,
    (invoice_vectorizer, invoice_train_vecs),
    (line_vectorizer, line_normal_vecs),
) = preprocess.process_invoice(train, output_types=True)

# Second pass, over the held-out split: is_train=False and the vectorizers and
# vectors returned by the training call are passed back in, so the test frame
# is built from the training-time artefacts.
#
# Only the frame is needed here. The two remaining return values are bound to
# named throwaways rather than `_`: IPython uses `_` for the most recent cell
# output and stops refreshing it once user code assigns to that name.
(
    test_df,
    _test_invoice_artifacts,
    _test_line_artifacts,
) = preprocess.process_invoice(
    test,
    is_train=False,
    invoice_vectorizer=invoice_vectorizer,
    invoice_train_vecs=invoice_train_vecs,
    line_vectorizer=line_vectorizer,
    line_normal_vecs=line_normal_vecs,
    output_types=True,
)

In [6]:
# Preview the first five engineered training rows. The frame still carries the
# `_ANOMALY_TYPES_DROP_BEFORE_TRAINING_` column next to `is_anomalous`; going
# by its name, it should be removed before any model is fitted.
train_df.head(n=5)

Unnamed: 0,_ANOMALY_TYPES_DROP_BEFORE_TRAINING_,is_anomalous,merchant,invoice_date,merchant_branch,merchant_chain,due_date,payment_terms,grand_total,tax,...,duplicate_invoice_flag,line_total,line_qty,invoice_age,payment_terms_numeric,state,expected_tax_rate,expected_tax,line_description_similarity,invoice_similarity
0,[varied_wordings],1,Smith & Robinson Industrial,2025-01-09,Smith & Robinson Industrial,Smith & Robinson Industrial,2024-12-31,DUE ON RECEIPT,19819.71,943.8,...,True,18875.91,61,-9,0,ND,0.05,943.7955,1.0,0.991496
1,[varied_wordings],1,AdultDecade,2025-03-14,AdultDecade,AdultDecade,2025-05-13,NET 60 DAYS,622.26,35.22,...,True,587.04,1,60,60,WV,0.06,35.2224,1.0,0.994511
2,[due_date_anomaly],1,Lee & Wallace,2025-04-10,Lee & Wallace,Lee & Wallace,2025-04-13,DUE ON RECEIPT,99670.82,0.0,...,False,99670.82,31,3,0,MT,0.0,0.0,1.0,0.73951
3,"[duplicate_products, currency_anomaly, varied_...",1,OfficerThan,2025-04-29,OfficerThan,OfficerThan,2025-05-01,NET 30 DAYS,965.74,56.81,...,True,908.93,5,2,30,MA,0.0625,56.808125,1.0,0.996823
4,[],0,Todd-Ochoa,2025-02-18,Todd-Ochoa,Todd-Ochoa,2025-02-18,DUE ON RECEIPT,3184.94,0.0,...,False,3152.89,3,0,0,MH,0.0,0.0,1.0,0.76607


In [7]:
# Preview the first five engineered test rows; the output shows the same
# columns as the training frame.
test_df.head(n=5)

Unnamed: 0,_ANOMALY_TYPES_DROP_BEFORE_TRAINING_,is_anomalous,merchant,invoice_date,merchant_branch,merchant_chain,due_date,payment_terms,grand_total,tax,...,duplicate_invoice_flag,line_total,line_qty,invoice_age,payment_terms_numeric,state,expected_tax_rate,expected_tax,line_description_similarity,invoice_similarity
0,"[due_date_anomaly, varied_wordings, varied_wor...",1,Richardson & Brown Solutions,2025-03-17,Richardson & Brown Solutions,Richardson & Brown Solutions,2025-04-21,NET 60 DAYS,3385.85,182.81,...,True,3179.14,5,35,60,OH,0.0575,182.80055,1.0,0.996251
1,"[tax_calc, merchant_mismatch]",1,Leach-Christian GmbH,2025-02-22,Leach-Christian GmbH,Goodman-Carter,2025-03-24,NET 30 DAYS,5659.5,217.89,...,False,5441.61,38,30,30,WY,0.04,217.6644,1.0,0.688298
2,[],0,Frazier Corp.,2024-12-28,Frazier Corp.,Frazier Corp.,2025-01-27,NET 30 DAYS,184970.71,10467.79,...,False,174463.26,61,30,30,ID,0.06,10467.7956,1.0,0.73747
3,[],0,Johnston Industries,2025-02-20,Johnston Industries,Johnston Industries,2025-03-07,NET 15 DAYS,9261.53,0.0,...,False,9261.53,43,15,15,DE,0.0,0.0,1.0,0.705116
4,[],0,White & Jackson Industrial,2025-02-16,White & Jackson Industrial,White & Jackson Industrial,2025-03-18,2/10 NET 30,156202.98,0.0,...,False,156202.98,112,30,30,FM,0.0,0.0,1.0,0.827569


In [8]:
# Persist both engineered frames as parquet files in the working directory.
for split_df, parquet_name in (
    (train_df, "train_df.parquet"),
    (test_df, "test_df.parquet"),
):
    split_df.to_parquet(parquet_name)