Generate test dataset

In [20]:
from invoice_generator import *

generate_dataset(num_invoices=20000, line_anomaly_rate=0.05,general_anomaly_rate=0.2,filename="synthetic_invoices_train",seed=89)

Analysis of 20752 invoices:
- Unique merchants: 957
- Total line items: 71838
- Average line items per invoice: 3.46
- Fraudulent invoices: 6739 (32.47%)

Field frequency:
- merchant: 20752 (100.0%)
- invoice_date: 20752 (100.0%)
- merchant_branch: 20752 (100.0%)
- merchant_chain: 20752 (100.0%)
- due_date: 20752 (100.0%)
- payment_terms: 20752 (100.0%)
- grand_total: 20752 (100.0%)
- tax: 20752 (100.0%)
- po_number: 20752 (100.0%)
- merchant_address: 20752 (100.0%)
- payment_method: 20752 (100.0%)
- country: 20752 (100.0%)
- currency: 20752 (100.0%)
- line_details: 20752 (100.0%)
Generated 20752 synthetic invoices and saved to synthetic_invoices_train.json
Saved invoices in JSONL format to synthetic_invoices_train.jsonl


[{'extractions': [{'field': 'merchant', 'value': 'TvSeveral Laboratory'},
   {'field': 'invoice_date', 'value': '02/08/2025'},
   {'field': 'merchant_branch', 'value': 'TvSeveral Laboratory'},
   {'field': 'merchant_chain', 'value': 'Data Laboratory Industries'},
   {'field': 'due_date', 'value': '02/08/2025'},
   {'field': 'payment_terms', 'value': 'DUE ON RECEIPT'},
   {'field': 'grand_total', 'value': '19586.95'},
   {'field': 'tax', 'value': '0.00'},
   {'field': 'po_number', 'value': '2503-5567'},
   {'field': 'merchant_address',
    'value': '17665 Ramos Square Ferrellport PR 81642 USA'},
   {'field': 'payment_method', 'value': 'USPS Priority'},
   {'field': 'country', 'value': 'US'},
   {'field': 'currency', 'value': 'USD'},
   {'field': 'line_details',
    'value': [{'line_count': '1',
      'line_description': 'CE-210 Industrial Analysis Equipment',
      'line_qty': '10',
      'line_tax': '0.00',
      'line_total': '16886.05',
      'model': 'CE-210'},
     {'line_count': '

In [35]:
import json
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from invoice_generator import *
from preprocess import *
def load_dataset(filename):
    with open(filename, "r") as f:
        return json.load(f)
filename = "synthetic_invoices_train.json"
invoices = load_dataset(filename)
df = process_invoice(invoices)
df["is_fraud"] = df["is_fraud"].astype(int)
df["invoice_date"] = pd.to_datetime(df["invoice_date"], errors="coerce")
df["due_date"] = pd.to_datetime(df["due_date"], errors="coerce")
df["days_to_due"] = (df["due_date"] - df["invoice_date"]).dt.days
df["invoice_weekday"] = df["invoice_date"].dt.weekday  
df["invoice_month"] = df["invoice_date"].dt.month
df["due_on_weekend"] = df["due_date"].dt.weekday > 4  # Boolean feature
df = df.drop(columns=["invoice_date", "due_date"])
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Convert to string and encode
    label_encoders[col] = le  # Store encoder for later use
X = df.drop(columns=["is_fraud"])  # Use encoded features
y = df["is_fraud"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = xgb.XGBClassifier(
    max_depth=3,              
    learning_rate=0.05,       
    n_estimators=800,        
    min_child_weight=3,      
    subsample=0.8,                     
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Model Accuracy: {acc:.4f}")
print("Classification Report:")
print(report)
model.save_model("xgb_fraud_model.json")
print("Model saved as 'xgb_fraud_model.json'")


Model Accuracy: 0.8678
Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.91      9287
           1       1.00      0.63      0.77      5081

    accuracy                           0.87     14368
   macro avg       0.91      0.81      0.84     14368
weighted avg       0.89      0.87      0.86     14368

Model saved as 'xgb_fraud_model.json'


In [18]:
generate_dataset(num_invoices=10000, line_anomaly_rate=0.05,general_anomaly_rate=0.2,filename="synthetic_invoices_test",seed=44)

Analysis of 10352 invoices:
- Unique merchants: 914
- Total line items: 35884
- Average line items per invoice: 3.47
- Fraudulent invoices: 3219 (31.10%)

Field frequency:
- merchant: 10352 (100.0%)
- invoice_date: 10352 (100.0%)
- merchant_branch: 10352 (100.0%)
- merchant_chain: 10352 (100.0%)
- due_date: 10352 (100.0%)
- payment_terms: 10352 (100.0%)
- grand_total: 10352 (100.0%)
- tax: 10352 (100.0%)
- po_number: 10352 (100.0%)
- merchant_address: 10352 (100.0%)
- payment_method: 10352 (100.0%)
- country: 10352 (100.0%)
- currency: 10352 (100.0%)
- line_details: 10352 (100.0%)
Generated 10352 synthetic invoices and saved to synthetic_invoices_test.json
Saved invoices in JSONL format to synthetic_invoices_test.jsonl


[{'extractions': [{'field': 'merchant', 'value': 'Artist Industries'},
   {'field': 'invoice_date', 'value': '03/12/2025'},
   {'field': 'merchant_branch', 'value': 'Artist Industries'},
   {'field': 'merchant_chain', 'value': 'Artist Industries'},
   {'field': 'due_date', 'value': '05/11/2025'},
   {'field': 'payment_terms', 'value': 'NET 60 DAYS'},
   {'field': 'grand_total', 'value': '10618.56'},
   {'field': 'tax', 'value': '717.81'},
   {'field': 'po_number', 'value': '2503-2156'},
   {'field': 'merchant_address',
    'value': '88409 Berg Trace Apt. 237 Sanchezmouth CA 61245 USA'},
   {'field': 'payment_method', 'value': 'UPS Ground'},
   {'field': 'country', 'value': 'US'},
   {'field': 'currency', 'value': 'EUR'},
   {'field': 'line_details',
    'value': [{'line_count': '1',
      'line_description': 'CV-290 Economy Sensor',
      'line_qty': '3',
      'line_tax': '21.48',
      'line_total': '296.26',
      'model': 'CV-290'},
     {'line_count': '2',
      'line_description'

In [36]:
import json
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from invoice_generator import *
from preprocess import *
model = xgb.XGBClassifier()
model.load_model("xgb_fraud_model.json")  
test_filename = "synthetic_invoices_test.json"
test_invoices = load_dataset(test_filename)
df_test = process_invoice(test_invoices)
df_test["is_fraud"] = df_test["is_fraud"].astype(int)
df_test["invoice_date"] = pd.to_datetime(df_test["invoice_date"], errors="coerce")
df_test["due_date"] = pd.to_datetime(df_test["due_date"], errors="coerce")
df_test["days_to_due"] = (df_test["due_date"] - df_test["invoice_date"]).dt.days
df_test["invoice_weekday"] = df_test["invoice_date"].dt.weekday
df_test["invoice_month"] = df_test["invoice_date"].dt.month
df_test["due_on_weekend"] = df_test["due_date"].dt.weekday > 4  
df_test.drop(columns=["invoice_date", "due_date"], errors="ignore", inplace=True)
categorical_cols = df_test.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_test[col] = le.fit_transform(df_test[col].astype(str))  
    label_encoders[col] = le 
train_features = model.get_booster().feature_names
if train_features is None:
    train_features = X_test.columns.tolist() 
X_test = df_test.drop(columns=["is_fraud"], errors='ignore')
X_test = X_test.reindex(columns=train_features, fill_value=0)
y_test = df_test["is_fraud"]
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Test Data Model Accuracy: {acc:.4f}")
print("Classification Report on Test Data:")
print(report)

Test Data Model Accuracy: 0.8004
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     23677
           1       0.73      0.66      0.69     12207

    accuracy                           0.80     35884
   macro avg       0.78      0.77      0.77     35884
weighted avg       0.80      0.80      0.80     35884

