In [1]:
import pandas as pd
import invoice_generator



import preprocess



from sklearn import model_selection

# 1. Data Generation


In [2]:
# Build the synthetic invoice dataset used by the rest of the notebook.
# seed=42 presumably pins the generator's randomness so the run is repeatable
# (confirm in invoice_generator).
# Per the generator's own log output, this call also writes
# synthetic_invoices.json and synthetic_invoices.jsonl as a side effect.
# NOTE(review): the recorded output reports 10525 invoices and 916 unique
# merchants for num_invoices=10000 / num_merchants=1000 — presumably anomaly
# injection adds extra invoices; confirm against invoice_generator.
data = invoice_generator.generate_dataset(
    num_invoices=10000,
    num_merchants=1000,
    general_anomaly_rate=0.3,
    line_anomaly_rate=0.05,
    seed=42,
)

Generating 10000 synthetic invoices across a set of 1000 merchants (general anomaly rate: 30.0%, line anomaly rate: 5.0%)...
Analysis of 10525 invoices:
- Unique merchants: 916
- Total line items: 36267
- Average line items per invoice: 3.45

Field frequency:
- merchant: 10525 (100.0%)
- invoice_date: 10525 (100.0%)
- merchant_branch: 10525 (100.0%)
- merchant_chain: 10525 (100.0%)
- due_date: 10525 (100.0%)
- payment_terms: 10525 (100.0%)
- grand_total: 10525 (100.0%)
- tax: 10525 (100.0%)
- po_number: 10525 (100.0%)
- merchant_address: 10525 (100.0%)
- payment_method: 10525 (100.0%)
- country: 10525 (100.0%)
- currency: 10525 (100.0%)
- line_details: 10525 (100.0%)
Generated 10525 synthetic invoices and saved to synthetic_invoices.json
Saved invoices in JSONL format to synthetic_invoices.jsonl


In [3]:
# Inspect the first raw record before preprocessing: a dict whose 'extractions'
# entry is a list of {'field', 'value'} pairs, with 'line_details' holding a
# nested list of per-line dicts.
data[0]

{'extractions': [{'field': 'merchant', 'value': 'Write Laboratory'},
  {'field': 'invoice_date', 'value': '12/06/2024'},
  {'field': 'merchant_branch', 'value': 'Write Laboratory'},
  {'field': 'merchant_chain', 'value': 'Write Laboratory'},
  {'field': 'due_date', 'value': '01/05/2025'},
  {'field': 'payment_terms', 'value': 'NET 30 DAYS'},
  {'field': 'grand_total', 'value': '105688.50'},
  {'field': 'tax', 'value': '5979.88'},
  {'field': 'po_number', 'value': '226656'},
  {'field': 'merchant_address',
   'value': '79163 Rebecca Forks Suite 246 West Thomastown VA 02369 USA'},
  {'field': 'payment_method', 'value': 'Check'},
  {'field': 'country', 'value': 'US'},
  {'field': 'currency', 'value': 'USD'},
  {'field': 'line_details',
   'value': [{'line_count': '1',
     'line_description': 'X-6906 Compact Chemical',
     'line_qty': '50',
     'line_tax': '3106.62',
     'line_total': '51776.94',
     'model': 'X-6906'},
    {'line_count': '2',
     'line_description': 'M-26501 Lightwe

# 2. Feature Engineering


In [4]:
# Hold out 20% of the raw invoice records for evaluation. random_state pins the
# shuffle so the same split is produced on every run.
train, test = model_selection.train_test_split(data, test_size=0.2, random_state=42)

# Sanity check: the two parts together must account for every generated record.
assert len(train) + len(test) == len(data), "train/test sizes do not add up to the dataset size"

print(f"Training set size: {len(train)}")
print(f"Test set size: {len(test)}")

Training set size: 8420
Test set size: 2105


In [5]:
# Turn each split of raw invoice records into a feature DataFrame.
# NOTE(review): process_invoice is applied to each split independently. If it
# derives cross-invoice features (the output shows columns such as
# duplicate_invoice_flag and invoice_similarity), those would be computed
# within each split only — confirm this is intended.
train_df = preprocess.process_invoice(train)
test_df = preprocess.process_invoice(test)

In [6]:
# Preview the first rows of the engineered training features (label column: is_anomalous).
train_df.head()

Unnamed: 0,merchant,merchant_branch,merchant_chain,po_number,payment_method,payment_terms_numeric,invoice_age,invoice_age_mismatch,country,currency,...,duplicate_invoice_flag,avg_description_similarity,invoice_similarity,line_total,line_qty,invoice_date,due_date,merchant_address,state,is_anomalous
0,Pope GmbH,Pope GmbH,Pope GmbH,PO-39821,USPS Priority,30,30,False,US,USD,...,False,0.015204,0.655544,3880.8,41,2025-03-27,2025-04-26,693 Stephens Unions Suite 040 Halltown DE 4419...,DE,0
1,Gonzalez Solutions,Gonzalez Solutions,Lee & Norton Enterprises,PO-73581,FedEx Express,30,30,False,US,USD,...,False,0.014831,0.650988,599.88,3,2025-04-02,2025-05-02,290 Hess Ridges Snyderport VI 80247 USA,VI,1
2,AcceptWalk,AcceptWalk,AcceptWalk,2504-9921,FedEx Express,30,30,False,US,USD,...,False,0.017363,0.638144,15571.34,7,2025-03-05,2025-04-04,35133 Chambers Crossing West Debbieland SC 217...,SC,1
3,DemocraticHit,DemocraticHit,Gonzales Co.,2504-8763,FedEx Express,0,-24,True,US,USD,...,True,0.022414,0.99396,6764.13,12,2025-03-07,2025-02-11,437 Kathryn Street Janeborough WV 59510 USA,WV,1
4,Harrington-Vang Inc.,Harrington-Vang Inc.,Harrington-Vang Inc.,27850-569,Check,45,45,False,US,USD,...,False,0.023669,0.665603,457828.32,166,2025-02-21,2025-04-07,315 Joel Loaf Wolfborough UT 05176 USA,UT,1


In [7]:
# Preview the first rows of the engineered test features; columns should match the training frame.
test_df.head()

Unnamed: 0,merchant,merchant_branch,merchant_chain,po_number,payment_method,payment_terms_numeric,invoice_age,invoice_age_mismatch,country,currency,...,duplicate_invoice_flag,avg_description_similarity,invoice_similarity,line_total,line_qty,invoice_date,due_date,merchant_address,state,is_anomalous
0,American Electronics Systems,American Electronics Systems,American Electronics Systems,2504-3141,FedEx Ground,0,0,False,US,USD,...,False,0.013686,0.632587,282.05,3,2025-02-07,2025-02-07,626 Abigail Stravenue Suite 394 South Michelle...,WI,1
1,Washington GmbH,Washington GmbH,Washington GmbH,572867,Net Banking,30,30,False,US,USD,...,False,0.024055,0.656696,12821.61,26,2025-03-25,2025-04-24,138 Perry Station Suite 737 South Jacqueline N...,NJ,0
2,Imagine Medical Co.,Imagine Medical Co.,Imagine Medical Co.,720486,DHL Express,30,30,False,US,USD,...,False,0.019909,0.667253,3118.67,2,2025-02-04,2025-03-06,0833 Scott Stream Suite 125 Emmamouth VT 01384...,VT,0
3,Hayes-Bradley,Hayes-Bradley,Hayes-Bradley,49944-760,USPS Priority,30,17,True,US,USD,...,False,0.013981,0.578341,895.78,55,2025-03-28,2025-04-14,244 Tracy Shoal Griffithmouth MS 42792 USA,MS,1
4,Gordon,Gordon,Gordon,2504-3430,FedEx Express,45,45,False,US,USD,...,False,0.01533,0.56797,3070.54,6,2025-01-05,2025-02-19,094 Price Ranch Thomasstad WV 70869 USA,WV,0


In [8]:
# Persist both engineered frames as Parquet files in the working directory so
# they can be read back later without re-running preprocessing.
train_df.to_parquet(path="train_df.parquet")
test_df.to_parquet(path="test_df.parquet")

## 2.1 Reload Data

In [9]:
# Optional shortcut: uncomment to reload the frames written by the to_parquet
# cell instead of regenerating and preprocessing the data.
# train_df = pd.read_parquet("train_df.parquet")
# test_df = pd.read_parquet("test_df.parquet")