### **1. Define Inference Helpers**

In [1]:
import json
import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# 2.1) Load your models
# Model identifiers live in named constants so they can be changed in one place.
ENCODER_NAME    = 'all-MiniLM-L6-v2'
CLASSIFIER_PATH = "dup_detector_lr.joblib"

# Sentence encoder used by make_features() to embed the invoice text.
encoder = SentenceTransformer(ENCODER_NAME)
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load classifier artifacts that come from a trusted source.
clf     = joblib.load(CLASSIFIER_PATH)

# 2.2) Text‐formatter (dict → string)
def invoice_to_text(inv):
    """Render an invoice mapping as one pipe-separated line of text.

    Args:
        inv: Mapping providing the keys VENDOR_NAME, VENDOR_ID, AMOUNT,
            CURRENCY, INVOICE_DATE, PURCHASE_ORDER and DESCRIPTION.

    Returns:
        A string of the form
        ``Vendor: <name> (<id>) | Amount: <amount> <currency> | Date: <date> | PO: <po> | Desc: <description>``.

    Raises:
        KeyError: if ``inv`` is a dict and one of the keys above is missing.
    """
    raw_date = inv["INVOICE_DATE"]
    # Anything exposing strftime is formatted as YYYY-MM-DD; every other value
    # (for example a date that is still a plain string) goes through str() as-is.
    if hasattr(raw_date, "strftime"):
        date_str = raw_date.strftime("%Y-%m-%d")
    else:
        date_str = str(raw_date)

    segments = [
        f"Vendor: {inv['VENDOR_NAME']} ({inv['VENDOR_ID']})",
        f"Amount: {inv['AMOUNT']} {inv['CURRENCY']}",
        f"Date: {date_str}",
        f"PO: {inv['PURCHASE_ORDER']}",
        f"Desc: {inv['DESCRIPTION']}",
    ]
    return " | ".join(segments)

# 2.3) Feature‐builder
def make_features(inv1, inv2):
    """Build the single-row feature matrix for one invoice pair.

    The row is the concatenation of:
      * ``|e1 - e2|`` and ``e1 * e2``, where ``e1`` / ``e2`` are the encoder
        embeddings of ``invoice_to_text(inv1)`` / ``invoice_to_text(inv2)``;
      * two engineered amount flags: exact amount match, and amounts within
        5% of each other relative to ``inv1``'s amount.

    Args:
        inv1: First invoice mapping (must be accepted by ``invoice_to_text``
            and carry a numeric ``AMOUNT``).
        inv2: Second invoice mapping, same requirements as ``inv1``.

    Returns:
        A 2-D numpy array with exactly one row (``reshape(1, -1)``).
    """
    # SBERT embeddings
    t1  = invoice_to_text(inv1)
    t2  = invoice_to_text(inv2)
    e1  = encoder.encode([t1])[0]
    e2  = encoder.encode([t2])[0]
    sb  = np.hstack([np.abs(e1 - e2), e1 * e2])

    # Engineered
    amt1 = inv1["AMOUNT"]
    amt2 = inv2["AMOUNT"]
    same_amt = int(amt1 == amt2)
    # Divide by the magnitude of the reference amount: with a signed
    # denominator a negative amount produced a negative ratio, which always
    # passed the <= 5% check. A zero amount falls back to 1 to avoid dividing
    # by zero. Results for positive amounts are unchanged.
    denom    = abs(amt1) or 1
    pct_diff = abs(amt1 - amt2) / denom
    within_5 = int(pct_diff <= 0.05)
    eng      = np.array([same_amt, within_5])
    return np.hstack([sb, eng]).reshape(1, -1)

# 2.4) Single‐pair predictor
def predict_pair(inv1, inv2):
    """Score one invoice pair with the loaded classifier.

    Args:
        inv1: First invoice mapping, forwarded to ``make_features``.
        inv2: Second invoice mapping, forwarded to ``make_features``.

    Returns:
        dict with two entries:
          * ``prob_duplicate`` -- float taken from column 1 of the
            classifier's ``predict_proba`` output for the pair;
          * ``is_duplicate`` -- int label returned by ``predict``.
    """
    features = make_features(inv1, inv2)
    # Column 1 of predict_proba is read as the duplicate-class probability.
    duplicate_prob = float(clf.predict_proba(features)[0, 1])
    label = int(clf.predict(features)[0])
    return dict(prob_duplicate=duplicate_prob, is_duplicate=label)


  from .autonotebook import tqdm as notebook_tqdm


### **2. Load a Sample of Data**

In [2]:
# paths may vary
# Read the invoice-pair CSV; both invoice-date columns are requested as dates.
# NOTE(review): in the recorded preview INV2_INVOICE_DATE contains both
# YYYY-MM-DD and DD/MM/YYYY values, and the later JSON dump prints it without
# a time component while INV1_INVOICE_DATE has one -- so the INV2 column
# appears to remain plain strings despite parse_dates. Confirm this is intended.
# NOTE(review): the first column shows up as "Unnamed: 0" in the preview --
# presumably an index written out with the CSV; consider index_col=0.
df = pd.read_csv(
    "synthetic_invoice_pairs.csv",
    parse_dates=["INV1_INVOICE_DATE","INV2_INVOICE_DATE"]
)

# Show a few rows
df.head(3)


Unnamed: 0,INV1_VENDOR_NAME,INV1_VENDOR_ID,INV1_AMOUNT,INV1_CURRENCY,INV1_INVOICE_DATE,INV1_DESCRIPTION,INV1_PURCHASE_ORDER,INV1_COMPANY_CODE,INV1_COST_CENTER,INV1_TAX_CODE,...,INV2_AMOUNT,INV2_CURRENCY,INV2_INVOICE_DATE,INV2_DESCRIPTION,INV2_PURCHASE_ORDER,INV2_COMPANY_CODE,INV2_COST_CENTER,INV2_TAX_CODE,INV2_PAYMENT_TERMS,label
0,Cooper Ltd,VE1489,633.22,USD,2025-01-31,Heavy town money.,PO9276,1000,CC525,B1,...,626.6,USD,2025-01-31,Heavy town money.,PO9276,1000,CC525,B1,NET30,1
1,"Diaz, Anderson and Browning",VE1316,1369.83,GBP,2024-09-16,Military place edge environmental even eye mes...,PO8146,1000,CC275,A0,...,1368.64,GBP,16/09/2024,Military place edge environmental even eye mes...,PO8146,1000,CC275,A0,NET60,1
2,Patton-Jenkins,VE1110,2460.07,EUR,2024-12-27,Perhaps lawyer interest star his difficult.,PO2577,2000,CC862,B1,...,2464.33,EUR,27/12/2024,Perhaps lawyer interest star his difficult.,PO2577,2000,CC862,B1,NET60,1


### **3. Smoke-Test on 5 Examples**

In [4]:
def _invoice_from_row(row, prefix):
    """Return the entries of `row` whose key starts with `prefix`, keyed without it."""
    # Slice the prefix off rather than using str.replace, which would also
    # remove any later occurrence of the tag inside a column name.
    return {
        col[len(prefix):]: val
        for col, val in row.items()
        if col.startswith(prefix)
    }


# Score the first five pairs, keeping each pair's inputs next to its prediction.
results = []
for idx, row in df.head(5).iterrows():
    inv1 = _invoice_from_row(row, "INV1_")
    inv2 = _invoice_from_row(row, "INV2_")
    res  = predict_pair(inv1, inv2)
    results.append({
        "invoice1": inv1,
        "invoice2": inv2,
        **res
    })

# Pretty-print JSON
# default=str stringifies values json cannot encode natively (e.g. timestamps).
for r in results:
    print(json.dumps(r, indent=2, default=str))


{
  "invoice1": {
    "VENDOR_NAME": "Cooper Ltd",
    "VENDOR_ID": "VE1489",
    "AMOUNT": 633.22,
    "CURRENCY": "USD",
    "INVOICE_DATE": "2025-01-31 00:00:00",
    "DESCRIPTION": "Heavy town money.",
    "PURCHASE_ORDER": "PO9276",
    "COMPANY_CODE": 1000,
    "COST_CENTER": "CC525",
    "TAX_CODE": "B1",
    "PAYMENT_TERMS": "NET30"
  },
  "invoice2": {
    "VENDOR_NAME": "Cooper Ltd",
    "VENDOR_ID": "VE1489",
    "AMOUNT": 626.6,
    "CURRENCY": "USD",
    "INVOICE_DATE": "2025-01-31",
    "DESCRIPTION": "Heavy town money.",
    "PURCHASE_ORDER": "PO9276",
    "COMPANY_CODE": 1000,
    "COST_CENTER": "CC525",
    "TAX_CODE": "B1",
    "PAYMENT_TERMS": "NET30"
  },
  "prob_duplicate": 0.9768155035566964,
  "is_duplicate": 1
}
{
  "invoice1": {
    "VENDOR_NAME": "Diaz, Anderson and Browning",
    "VENDOR_ID": "VE1316",
    "AMOUNT": 1369.83,
    "CURRENCY": "GBP",
    "INVOICE_DATE": "2024-09-16 00:00:00",
    "DESCRIPTION": "Military place edge environmental even eye message