In [11]:
# Install dependencies (run once per environment)
%pip install -q dspy pandas python-dotenv

# dataset comes from here: https://www.kaggle.com/datasets/nikitpatel/invoice-ner-dataset?resource=download


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Basic imports and environment setup
import os
import json
import dspy
import pandas as pd
from dotenv import load_dotenv

# Load API keys from .env (OPENAI_API_KEY is expected)
load_dotenv()

# Configure DSPy default LM similar to other notebooks
lm = dspy.LM("openai/gpt-5-mini", api_key=os.getenv("OPENAI_API_KEY"), temperature=1, max_tokens=16000)
dspy.configure(lm=lm)

print("DSPy configured for invoice extraction.")


DSPy configured for invoice extraction.


In [20]:
# Load dataset and preview a few rows
# Prefer absolute path next to this notebook; fallback to project root
nb_dir = os.path.dirname(os.path.abspath("__file__")) if "__file__" in globals() else os.path.abspath("./dspy")
candidates = [
    os.path.join(nb_dir, "invoice_ner_dataset.csv"),
    os.path.abspath("./dspy/invoice_ner_dataset.csv"),
    os.path.abspath("invoice_ner_dataset.csv"),
]
for p in candidates:
    if os.path.exists(p):
        csv_path = p
        break
else:
    raise FileNotFoundError("invoice_ner_dataset.csv not found in expected locations")

raw_df = pd.read_csv(csv_path)
print("Rows:", len(raw_df))
print(raw_df.head(2))

# Parse Final_Output JSON strings into dicts
raw_df["Final_Output"] = raw_df["Final_Output"].apply(lambda s: json.loads(s))

# Build small train/test splits for quick iteration
# Keep it tiny for a simple example; adjust as needed
train_df = raw_df.iloc[:30].copy()
valid_df = raw_df.iloc[30:40].copy()

def to_examples(df):
    examples = []
    for _, row in df.iterrows():
        examples.append(dspy.Example(text=row["Input"], target=row["Final_Output"]).with_inputs("text"))
    return examples

train_examples = to_examples(train_df)
valid_examples = to_examples(valid_df)

print(f"Train examples: {len(train_examples)}, Valid examples: {len(valid_examples)}")


Rows: 67
                                               Input  \
0  Cream and White Simple Minimalist Catering Ser...   
1  Beige Elegant Professional Business Invoice\n\...   

                                        Final_Output  
0  {"TOTAL_AMOUNT": "$1000", "DUE_AMOUNT": "$550"...  
1  {"INVOICE_NUMBER": "#01234", "BILLED_TO": "Est...  
Train examples: 30, Valid examples: 10


In [21]:
# Define a signature for extracting a JSON dict of invoice fields
class InvoiceExtraction(dspy.Signature):
    """
    Extract key-value invoice fields as a JSON dict from free-form invoice text.
    The output must be valid JSON, parseable into a Python dict[str, str].
    Do not invent fields; prefer fields present in the text.
    """
    text: str = dspy.InputField(description="Raw invoice text")
    rationale: str = dspy.OutputField(description="Brief reasoning, list detected fields")
    extracted: dict = dspy.OutputField(description="JSON dict of extracted fields (keys and string values)")

# Base module (simple Predict)
extractor = dspy.Predict(InvoiceExtraction)

print("Signature and extractor ready.")


Signature and extractor ready.


In [22]:
# Define a per-field accuracy metric: correct_fields / total_gold_fields
from dspy.evaluate import Evaluate


def normalize_dict(d: dict) -> dict:
    if d is None:
        return {}
    def norm_key(k: str) -> str:
        # Uppercase keys to align with dataset convention
        return str(k).strip().upper()
    def norm_val(v: str) -> str:
        s = str(v).strip()
        s = s.replace(",", "")
        s = s.replace("$ ", "$")
        s = " ".join(s.split())  # collapse internal whitespace
        return s
    return {norm_key(k): norm_val(v) for k, v in d.items()}


def field_accuracy_metric(example: dspy.Example, pred: dspy.Prediction, trace=None) -> float:
    gold = normalize_dict(example.target)
    got = normalize_dict(getattr(pred, "extracted", {}))
    if not gold:
        return 0.0
    correct = 0
    for k, v in gold.items():
        if k in got and got[k] == v:
            correct += 1
    return correct / len(gold)


# Wrap a callable for Evaluate
class ExtractCallable:
    def __call__(self, text: str):
        return extractor(text=text)


# Provide the devset at construction per latest API
evaluate = Evaluate(devset=valid_examples, metric=field_accuracy_metric, ordered=True)

initial_score = evaluate(ExtractCallable())
print("Initial field accuracy on valid:", initial_score)


2025/09/16 12:35:59 INFO dspy.evaluate.evaluate: Average Metric: 2.8251373626373626 / 10 (28.3%)


Initial field accuracy on valid: EvaluationResult(score=28.25, results=<list of 10 results>)


In [18]:
# Compare base vs optimized predictions on a sample
sample = valid_examples[0]
print("Sample comparison:")
print("\nINPUT TEXT:")
print("-" * 80)
print(sample.text[:300], "...")
print("-" * 80)

print("\nBASE MODEL PREDICTION:")
base_pred = extractor(text=sample.text)
print(f"Rationale: {base_pred.rationale}")
print("\nExtracted fields:")
for k,v in (base_pred.extracted or {}).items():
    print(f"{k:20s}: {v}")

print("\nGROUND TRUTH:")
print("\nExtracted fields:")
for k,v in sample.target.items():
    print(f"{k:20s}: {v}")


Sample comparison:

INPUT TEXT:
--------------------------------------------------------------------------------
Green Blue Pink Vintage Retro Freelance Invoice

NO. DESCRIPTION RATE QTY TOTAL
01 Graphic design for website 45 20 900.00
02 Graphic design for social content 45 8 360.00
03 Account management fee 280 1 280.00
04 Two days onsite 640 2 1280.00
SUB TOTAL $ 2820.00
TAX $ 225.60
S&H nil
TOTAL $ 3045.60 ...
--------------------------------------------------------------------------------

BASE MODEL PREDICTION:
Rationale: Detected invoice title, line items (descriptions, rates, quantities, totals), subtotal, tax, shipping & handling, grand total. Detected sender (Reese Miller / Timmerman Industries) and recipient (Rosa Maria Aguado / Aldenaire & Partners) contact blocks, invoice number (001) and date (02.05.2024), bank/account details, payment instructions, and invoice notes.

Extracted fields:
invoice_title       : Green Blue Pink Vintage Retro Freelance Invoice
invoice_number  

In [None]:
# Optimize with MiPROv2 (recommended in tutorial)
from dspy.optimizers import MIPROv2

mipro = MIPROv2(metric=field_accuracy_metric, auto="medium")

# Use ChainOfThought wrapper around the signature for better reasoning
program = dspy.ChainOfThought(InvoiceExtraction)

optimized = mipro.compile(
    program=program,
    trainset=train_examples,
    valset=valid_examples,
)

# Evaluate optimized program
class OptimizedCallable:
    def __call__(self, text: str):
        return optimized(text=text)

opt_score = evaluate(OptimizedCallable())
print("Optimized field accuracy on valid:", opt_score)


ModuleNotFoundError: No module named 'dspy.optimizers'

In [None]:
# Quick demo on a random validation example
sample = valid_examples[0]
print("INPUT (truncated):\n", sample.text[:600], "...\n")

base_pred = extractor(text=sample.text)
print("Base extracted keys:", list((base_pred.extracted or {}).keys()))

opt_pred = optimized(text=sample.text)
print("Optimized extracted keys:", list((opt_pred.extracted or {}).keys()))

print("\nGold keys:", list(sample.target.keys()))

# Show the last interaction for transparency
print("\nInspecting last prompt:")
dspy.inspect_history(n=1)
