In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def train_baseline_model(df, categorical_features=None, test_size=0.3, random_state=42):
    """Train and evaluate a random-forest baseline on a feature table.

    The rows are split into train/test partitions *before* any preprocessing
    is fitted, so the one-hot encoder only learns categories from the
    training rows. Categories that appear only in the test rows are encoded
    as all-zero vectors (``handle_unknown="ignore"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a ``"label"`` column (the target), the
        categorical columns named in ``categorical_features`` and any number
        of numeric columns.
    categorical_features : sequence of str, optional
        Columns to one-hot encode. Defaults to
        ``["payment_method", "payment_terms", "currency", "state"]``.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and the random forest.

    Returns
    -------
    (RandomForestClassifier, OneHotEncoder)
        The fitted classifier and the encoder fitted on the training rows.
        At inference time the model expects the numeric columns followed by
        the encoder's output, in that order.

    Raises
    ------
    KeyError
        If ``"label"`` or one of the categorical columns is missing.

    Side effects
    ------------
    Prints the classification report for the held-out rows.
    """
    if categorical_features is None:
        categorical_features = ["payment_method", "payment_terms", "currency", "state"]
    categorical_features = list(categorical_features)

    X = df.drop(columns=["label"])
    y = df["label"]

    # Every numeric column left after removing the label is used as-is.
    numeric_features = X.select_dtypes(include=["number"]).columns.tolist()

    # Split the raw rows first so that no preprocessing step is fitted on
    # data that is later used for evaluation.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the encoder on the training rows only; reuse it for the test rows.
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_train = np.hstack([
        X_train_raw[numeric_features].values,
        encoder.fit_transform(X_train_raw[categorical_features]),
    ])
    X_test = np.hstack([
        X_test_raw[numeric_features].values,
        encoder.transform(X_test_raw[categorical_features]),
    ])

    # Train classifier
    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    return model, encoder


In [7]:
def _to_float(value, default=0.0):
    """Convert ``value`` to float, returning ``default`` when it cannot be parsed."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _state_from_address(address):
    """Return the third-from-last whitespace-separated token of ``address``.

    Returns ``""`` when ``address`` is not a string or has fewer than three
    tokens.
    """
    if not isinstance(address, str):
        return ""
    tokens = address.split()
    return tokens[-3] if len(tokens) >= 3 else ""


def extract_features(invoices):
    """Flatten invoice records into a one-row-per-invoice feature table.

    Parameters
    ----------
    invoices : iterable of dict
        Each invoice must have an ``"extractions"`` list whose items are
        dicts with ``"field"`` and ``"value"`` keys, and may have a
        ``"label"`` key (defaults to 0).

    Returns
    -------
    pandas.DataFrame
        Columns: ``num_line_items``, ``line_qty_sum``, ``line_total_mean``,
        ``total_tax``, ``grand_total``, ``payment_method``,
        ``payment_terms``, ``currency``, ``state``, ``label``.

    Notes
    -----
    * A line item contributes to ``line_qty_sum`` / ``line_total_mean`` only
      when both its ``line_qty`` and ``line_total`` parse as floats, so the
      two aggregates are always computed over the same lines.
      ``num_line_items`` still counts every entry in ``line_details``.
    * ``total_tax`` and ``grand_total`` are parsed independently; an
      unparsable or missing value becomes 0.0 without affecting the other.
    * ``state`` is the third-from-last token of ``merchant_address``.
      NOTE(review): this presumably relies on addresses ending in
      "<state> <zip> <country>" -- confirm against the data generator.
    """
    rows = []
    for invoice in invoices:
        features = {}
        extractions = {item["field"]: item["value"] for item in invoice["extractions"]}

        # A missing, null or non-sequence "line_details" is treated as no lines.
        line_items = extractions.get("line_details") or []
        if not isinstance(line_items, (list, tuple)):
            line_items = []
        features["num_line_items"] = len(line_items)

        qtys, totals = [], []
        for line in line_items:
            try:
                qty = float(line["line_qty"])
                total = float(line["line_total"])
            except (KeyError, TypeError, ValueError):
                # Skip the whole line so qtys and totals stay aligned.
                continue
            qtys.append(qty)
            totals.append(total)

        features["line_qty_sum"] = sum(qtys)
        features["line_total_mean"] = np.mean(totals) if totals else 0.0

        features["total_tax"] = _to_float(extractions.get("tax", 0))
        features["grand_total"] = _to_float(extractions.get("grand_total", 0))

        features["payment_method"] = extractions.get("payment_method", "")
        features["payment_terms"] = extractions.get("payment_terms", "")
        features["currency"] = extractions.get("currency", "")
        features["state"] = _state_from_address(extractions.get("merchant_address", ""))

        features["label"] = invoice.get("label", 0)

        rows.append(features)

    return pd.DataFrame(rows)


In [8]:
# NOTE(review): the star import hides where names come from. `load_dataset`
# is presumably provided by invoice_generator -- confirm, then import the
# needed names explicitly (ideally in the notebook's top import cell).
from invoice_generator import *
# Load the invoice records. Presumably an iterable of dicts with an
# "extractions" list and an optional "label" (that is what extract_features
# reads) -- TODO confirm against load_dataset.
invoices = load_dataset("synthetic_invoices_test.json")
# One row of engineered features (plus "label") per invoice.
df = extract_features(invoices)
# Trains on part of `df`, prints a classification report for the held-out
# part, and returns the fitted classifier and one-hot encoder.
model, encoder = train_baseline_model(df)


              precision    recall  f1-score   support

           0       0.73      0.95      0.83      2154
           1       0.65      0.22      0.33       952

    accuracy                           0.72      3106
   macro avg       0.69      0.58      0.58      3106
weighted avg       0.71      0.72      0.67      3106

