In [4]:
# Install dependencies (run once per environment).
# NOTE(review): versions are unpinned. A later cell does `from dspy import GEPA`,
# so the installed DSPy release must provide GEPA — consider pinning known-good versions.
%pip install -q dspy pandas python-dotenv

# Dataset source (the loading cell looks for `invoice_ner_clean.csv` in the current working directory):
# dataset comes from here: https://www.kaggle.com/datasets/nikitpatel/invoice-ner-dataset?resource=download


Note: you may need to restart the kernel to use updated packages.


In [5]:
# Basic imports and environment setup
import os
import json
import dspy
import pandas as pd
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment
# (OPENAI_API_KEY is the one consumed below).
load_dotenv()

# Default language model for DSPy modules in this notebook.
# NOTE(review): temperature=1 / max_tokens=16000 look chosen to satisfy DSPy's
# constraints for OpenAI reasoning models — confirm before changing them.
lm = dspy.LM(
    "openai/gpt-5-mini",
    temperature=1,
    max_tokens=16000,
    api_key=os.getenv("OPENAI_API_KEY"),
)
dspy.configure(lm=lm)

print("DSPy configured for invoice extraction.")


DSPy configured for invoice extraction.


In [17]:
# Load dataset and preview a few rows.
# The CSV is looked up in the current working directory first, then in the
# notebook directory (`nb_dir`).
CSV_NAME = "invoice_ner_clean.csv"
N_TRAIN = 30  # rows used as the GEPA training set
N_VALID = 10  # maximum rows used as the validation set (fewer if the CSV is short)

# `__file__` is only defined when this code runs as a script; inside a notebook
# kernel it is absent, so fall back to the ./dspy directory.
nb_dir = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.path.abspath("./dspy")
candidates = [
    os.path.abspath(CSV_NAME),
    os.path.join(nb_dir, CSV_NAME),
]
csv_path = next((p for p in candidates if os.path.exists(p)), None)
if csv_path is None:
    # List every path that was tried so the failure is actionable.
    raise FileNotFoundError(
        "invoice_ner_clean.csv not found in expected locations: " + ", ".join(candidates)
    )

raw_df = pd.read_csv(csv_path)
print("Rows:", len(raw_df))
print(raw_df.head(2))

# Parse Final_Output JSON strings into dicts.
# (The CSV is re-read above on every run, so re-executing this cell is safe.)
raw_df["Final_Output"] = raw_df["Final_Output"].apply(json.loads)

# Build small train/validation splits for quick iteration.
# Keep it tiny for a simple example; adjust N_TRAIN / N_VALID as needed.
train_df = raw_df.iloc[:N_TRAIN].copy()
valid_df = raw_df.iloc[N_TRAIN:N_TRAIN + N_VALID].copy()

# Slicing past the end of the frame silently yields fewer (or zero) rows;
# fail here rather than later inside evaluation/optimisation.
if train_df.empty or valid_df.empty:
    raise ValueError(
        f"Dataset has {len(raw_df)} rows; need more than {N_TRAIN} to build non-empty train and validation splits."
    )

def to_examples(df):
    """Turn every row of `df` into a `dspy.Example`.

    The row's `Input` value becomes the example's `text` field (declared as
    the only input via `with_inputs("text")`) and its `Final_Output` value
    becomes the `target` field. Returns the examples as a list in row order.
    """
    return [
        dspy.Example(text=record["Input"], target=record["Final_Output"]).with_inputs("text")
        for _, record in df.iterrows()
    ]

# Convert both splits into lists of DSPy examples.
train_examples = to_examples(train_df)
valid_examples = to_examples(valid_df)

# Report the actual split sizes; the validation slice can be shorter than
# requested when the CSV has fewer rows than the slice end.
print(f"Train examples: {len(train_examples)}, Valid examples: {len(valid_examples)}")


Rows: 39
                                               Input  \
0  Beige Elegant Professional Business Invoice\n\...   
1  Black and White Clean Modern Invoice\n\nConsul...   

                                        Final_Output  
0  {"invoice_number":"1234","invoice_date":"2030-...  
1  {"invoice_number":"INV-01234","invoice_date":"...  
Train examples: 30, Valid examples: 9


In [11]:
# Define a signature for extracting a JSON dict of invoice fields.
# NOTE: DSPy uses the class docstring as the task instructions, and the
# `description` strings below appear in the prompt (see the prompt printed by
# the final cell), so editing either changes what the LM is asked to do.
class InvoiceExtraction(dspy.Signature):
    """
    Extract key-value invoice fields as a JSON dict from free-form invoice text.
    """
    # Input: the invoice text as a single string.
    text: str = dspy.InputField(description="Raw invoice text")
    # Output 1: free-text reasoning; it is listed before `extracted` in the prompt.
    rationale: str = dspy.OutputField(description="Brief reasoning, list detected fields")
    # Output 2: the extracted fields. Typed as a plain `dict`, so the key set is
    # not constrained by the schema shown in the prompt.
    extracted: dict = dspy.OutputField(description="JSON dict with dataset keys and string values")

# Base (un-optimized) module: a plain `dspy.Predict` over the signature.
extractor = dspy.Predict(InvoiceExtraction)

print("Signature and extractor ready.")


Signature and extractor ready.


In [23]:
# Define a per-field partial-credit metric: +0.5 for key presence, +0.5 for exact value
from dspy.evaluate import Evaluate


def field_accuracy_metric(example: "dspy.Example", pred: "dspy.Prediction", trace=None, pred_name=None, pred_trace=None) -> "float | dspy.Prediction":
    """
    Per-field partial-credit metric (no normalization of keys or values).

    For every key in the gold dict (`example.target`):
    - +0.5 if the key is present exactly in `pred.extracted`
    - +0.5 more if the predicted value equals the gold value exactly
    The sum is divided by the number of gold keys, giving a score in [0, 1].
    Extra predicted keys are ignored.

    Args:
        example: object whose `target` attribute holds the gold dict.
        pred: object whose `extracted` attribute holds the predicted dict.
            A missing attribute, None, or any non-dict value (e.g. a string or
            list produced by a malformed LM output) is scored as an empty dict
            instead of raising.
        trace: accepted for compatibility with the metric call signature; unused.
        pred_name: when None, a plain float is returned; otherwise a
            `dspy.Prediction(score=..., feedback=...)` is returned for GEPA.
        pred_trace: accepted for compatibility with the metric call signature; unused.

    Returns:
        float score, or `dspy.Prediction` with `score` and textual `feedback`.
    """
    gold = example.target or {}
    raw = getattr(pred, "extracted", None)
    # Only a dict can be compared key-by-key. Anything else (None, str, list, ...)
    # counts as "no fields extracted"; a str/list would otherwise pass the
    # `k in got` membership test and then fail on `got[k]`.
    got = raw if isinstance(raw, dict) else {}

    if not gold:
        return 0.0 if pred_name is None else dspy.Prediction(score=0.0, feedback="No gold fields present.")

    total = len(gold)
    present = 0
    correct = 0
    missing_keys = []
    mismatched_keys = []

    for k, v in gold.items():
        if k in got:
            present += 1
            if got[k] == v:
                correct += 1
            else:
                mismatched_keys.append(k)
        else:
            missing_keys.append(k)

    score = (0.5 * present + 0.5 * correct) / total

    if pred_name is None:
        return score

    # Build concise feedback for GEPA (key lists are capped at 8 entries).
    lines = [
        f"Keys present: {present}/{total}. Values correct: {correct}/{total}.",
    ]
    if raw is not None and not isinstance(raw, dict):
        lines.append(f"`extracted` must be a JSON object (dict) but was {type(raw).__name__}.")
    if missing_keys:
        lines.append(f"Missing keys: {', '.join(map(str, missing_keys[:8]))}.")
    if mismatched_keys:
        lines.append(f"Mismatched keys: {', '.join(map(str, mismatched_keys[:8]))}.")

    feedback = "\n".join(lines)
    return dspy.Prediction(score=score, feedback=feedback)


# Provide the devset at construction per latest API.
# The same `evaluate` object is reused later for the optimized program.
evaluate = Evaluate(devset=valid_examples, metric=field_accuracy_metric, ordered=True)

# Standard usage: pass the module directly.
# Note: the call returns an EvaluationResult object (see the printed output),
# not a bare float; the percentage is its `score` attribute.
initial_score = evaluate(extractor)
print("Initial field accuracy on valid:", initial_score)


2025/09/17 09:43:16 INFO dspy.evaluate.evaluate: Average Metric: 2.375 / 9 (26.4%)


Initial field accuracy on valid: EvaluationResult(score=26.39, results=<list of 9 results>)


In [24]:
# Baseline check: run the un-optimized extractor on the first validation
# example and print its output next to the ground truth.
# (The optimized program does not exist yet at this point; it is compared in a later cell.)
sample = valid_examples[0]
print("Sample comparison:")
print("\nINPUT TEXT:")
print("-" * 80)
# Show only the first 300 characters of the invoice text.
print(sample.text[:300], "...")
print("-" * 80)

print("\nBASE MODEL PREDICTION:")
base_pred = extractor(text=sample.text)
print(f"Rationale: {base_pred.rationale}")
print("\nExtracted fields:")
# `or {}` keeps the loop safe when `extracted` is None or empty.
for k,v in (base_pred.extracted or {}).items():
    print(f"{k:20s}: {v}")

print("\nGROUND TRUTH:")
print("\nExtracted fields:")
for k,v in sample.target.items():
    print(f"{k:20s}: {v}")


Sample comparison:

INPUT TEXT:
--------------------------------------------------------------------------------
Yellow and Black Professional Company Invoice

No Items Qty Price Total
1 Cement 50 $20.00 $1000.00
2 Pvc Pipe 10 $10.00 $100.00
3 Brick 10 $10.00 $100.00
4 Wood Board 10 $10.00 $100.00
Taylor Alonso
Ginyard International Bank
Bank Code (123-456-7890)
Total $1200.00
INVOICE
Ingoude
Company
Jonathan  ...
--------------------------------------------------------------------------------

BASE MODEL PREDICTION:
Rationale: Parsed the invoice text and extracted identifiable fields. Detected:
- Vendor: "Yellow and Black Professional Company"
- Invoice number: "#123456789"
- Invoice date: "3rd January 2024" (ISO: 2024-01-03)
- Line items (4): Cement, Pvc Pipe, Brick, Wood Board with qty, unit price, line totals
- Calculated subtotal from line items: $1300.00
- Total shown on invoice: $1200.00 (mismatch vs calculated subtotal)
- Bank: "Ginyard International Bank" and "Bank Code (123-4

In [34]:
# Optimize with GEPA (similar to comedian-agent).
# This cell is slow and makes many LM calls (see the timestamps in its log output).
from dspy import GEPA

optimizer = GEPA(
    # The metric returns score + textual feedback when GEPA passes `pred_name`.
    metric=field_accuracy_metric,
    # Budget: the log reports ~117 metric calls, i.e. 3 full passes over train+val (39 examples).
    max_full_evals=3,
    num_threads=8,
    track_stats=True,
    use_merge=False,
    # Separate LM passed as GEPA's reflection model.
    # NOTE(review): unlike the default LM, this model string has no "openai/" prefix
    # and no explicit api_key — presumably resolved from the environment; confirm.
    reflection_lm=dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000),
)

# NOTE(review): `valid_examples` is used here as GEPA's valset AND below for the
# reported score, so the "optimized" number is measured on data the optimizer
# selected against and is likely optimistic. A held-out test split would be needed
# for an unbiased estimate.
optimized_program = optimizer.compile(
    extractor,
    trainset=train_examples,
    valset=valid_examples,
)

# Standard usage: evaluate the optimized program directly
opt_score = evaluate(optimized_program)
print("Optimized field accuracy on valid:", opt_score)


2025/09/17 09:57:37 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 117 metric calls of the program. This amounts to 3.00 full evals on the train+val set.
2025/09/17 09:57:37 INFO dspy.teleprompt.gepa.gepa: Using 9 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/117 [00:00<?, ?rollouts/s]2025/09/17 09:57:37 INFO dspy.evaluate.evaluate: Average Metric: 2.375 / 9 (26.4%)
2025/09/17 09:57:37 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.2638888888888889
2025/09/17 09:57:37 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.2638888888888889


Average Metric: 0.94 / 3 (31.2%): 100%|██████████| 3/3 [00:00<00:00, 3479.79it/s]

2025/09/17 09:57:37 INFO dspy.evaluate.evaluate: Average Metric: 0.9375 / 3 (31.2%)
2025/09/17 09:57:37 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for self: Task:
From free-form invoice text, extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.

Output format:
- Return a single JSON object (not wrapped, no comments, no rationale).
- Include exactly these 8 keys (all keys must be present, even if the value is unknown):
  1) company
  2) billed_to
  3) invoice_number
  4) date
  5) total_amount
  6) bank_name
  7) account_name
  8) account_number
- Use string values for all fields.
- If a field is not present in the text, set it to "" (empty string).
- Do not include any extra keys (no addresses, emails, line items, notes, etc. unless they are part of the 8 required keys).

Field definitions and extraction rules:
- company:
  - The issuing/seller entity (the party sending the invoice).
  - Look for labels/contexts like: “From”,


Average Metric: 1.94 / 3 (64.6%): 100%|██████████| 3/3 [00:00<00:00, 4845.17it/s]

2025/09/17 09:57:37 INFO dspy.evaluate.evaluate: Average Metric: 1.9375 / 3 (64.6%)
2025/09/17 09:57:37 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for self: Task
From free-form invoice text (plain text), extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.

Output format
- Return a single JSON object (not wrapped, no comments, no rationale).
- Include exactly these 8 keys (all must be present, even if unknown), with string values:
  1) "company"
  2) "billed_to"
  3) "invoice_number"
  4) "date"
  5) "total_amount"
  6) "bank_name"
  7) "account_name"
  8) "account_number"
- If a field is not present in the text, set it to "" (empty string).
- Do not include any extra keys or metadata. Do NOT rename any keys (e.g., do not use "invoice_date"; the correct key is "date").

General extraction strategy
- Parse the text for clearly labeled fields first. Prefer values explicitly tied to labels over inferred or stylistic text.
- When


Average Metric: 1.94 / 3 (64.6%): 100%|██████████| 3/3 [00:26<00:00,  8.71s/it]

2025/09/17 09:58:03 INFO dspy.evaluate.evaluate: Average Metric: 1.9375 / 3 (64.6%)





2025/09/17 09:59:30 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for self: Task:
From free-form invoice text, extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.

Output format (strict):
- Return a single JSON object only (no wrapper, no comments, no rationale).
- Include exactly these 8 keys (all 8 must be present, even if unknown):
  1) company
  2) billed_to
  3) invoice_number
  4) invoice_date
  5) total_amount
  6) bank_name
  7) account_name
  8) account_number
- Use string values for all fields.
- If a field is not present in the text, set it to "" (empty string).
- Do not include any extra keys (no addresses, emails, line items, notes, etc.).

Field definitions and extraction rules:

- company (issuer/seller sending the invoice):
  - Look for clear issuer blocks such as “From”, “FROM:”, “Seller”, “Vendor”, “Company”, “Service Provider”, “PAY TO:” (issuer’s payee details).
  - Also consider the prominent brand/header n

Average Metric: 2.12 / 3 (70.8%): 100%|██████████| 3/3 [00:24<00:00,  8.15s/it]

2025/09/17 10:01:35 INFO dspy.evaluate.evaluate: Average Metric: 2.125 / 3 (70.8%)





2025/09/17 10:06:03 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for self: Task:
From free-form invoice text, extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.

Strict output format:
- Return a single JSON object only (no wrapper, no comments, no rationale).
- Include exactly these 8 keys (all 8 must be present, even if unknown), with string values only, in this exact order:
  1) company
  2) billed_to
  3) invoice_number
  4) invoice_date
  5) total_amount
  6) bank_name
  7) account_name
  8) account_number
- If a field is not present in the text, set it to "" (empty string).
- Do not include any extra keys (no addresses, emails, line items, notes, etc.).

General extraction principles:
- Be case-insensitive for label detection (e.g., “INVOICE NO”, “invoice no”, “Invoice No”).
- Labels and values can be split across adjacent lines; associate them by proximity, order, and plausibility (e.g., bank names are alphabetic words;

Average Metric: 2.12 / 3 (70.8%): 100%|██████████| 3/3 [00:36<00:00, 12.17s/it]

2025/09/17 10:07:12 INFO dspy.evaluate.evaluate: Average Metric: 2.125 / 3 (70.8%)





2025/09/17 10:12:33 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for self: Task
From free-form invoice text (plain text), extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.

Input
- A single block of plain text containing an invoice (often messy, with headers, labels, placeholders, totals, and bank/payment details).
- Many inputs are mockups/templates with placeholder text. Never guess; only extract values that are explicitly present and clearly tied to labels.

Output format
- Return exactly one JSON object (not wrapped, no comments, no rationale).
- Use double quotes for all keys and string values.
- Include exactly these 8 keys (all must be present, even if unknown), with string values:
  1) "company"
  2) "billed_to"
  3) "invoice_number"
  4) "date"
  5) "total_amount"
  6) "bank_name"
  7) "account_name"
  8) "account_number"
- If a field is not present in the text, set it to "" (empty string).
- Do not include any extr

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:47<00:00, 15.92s/it]

2025/09/17 10:13:59 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/17 10:15:23 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for self: Task
From free-form invoice text (plain text), extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.

Input
- A single string that contains the full invoice text. The text may be noisy, include template/branding phrases, and may present fields in any order.

Output format
- Return exactly one JSON object (not wrapped in an array; no comments, no rationale).
- Include exactly these 8 keys with string values (all keys must be present, even if the value is unknown):
  1) "company"
  2) "billed_to"
  3) "invoice_number"
  4) "invoice_date"
  5) "total_amount"
  6) "bank_name"
  7) "account_name"
  8) "account_number"
- If a field is not explicitly present in the text, set its value to "" (empty string).
- Do not include any extra keys or metadata. Do not rename any keys.

General extraction strategy
- Prioritize clearly labeled fields. Prefer values explicitl

Average Metric: 2.12 / 3 (70.8%): 100%|██████████| 3/3 [00:22<00:00,  7.66s/it]

2025/09/17 10:17:14 INFO dspy.evaluate.evaluate: Average Metric: 2.125 / 3 (70.8%)





2025/09/17 10:20:38 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for self: Task:
From free-form invoice text (often noisy/OCR’d), extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.

Output format (strict):
- Return a single JSON object only (no wrapper text, no comments, no rationale).
- Include exactly these 8 keys (all must be present, even if unknown), with string values:
  1) company
  2) billed_to
  3) invoice_number
  4) invoice_date
  5) total_amount
  6) bank_name
  7) account_name
  8) account_number
- If a field is not present in the text, set it to "" (empty string).
- Do not include any extra keys.

General extraction guidance:
- Prefer clearly labeled fields and the most explicit indicators.
- Labels and values may be split across adjacent lines or squeezed together (e.g., missing spaces). Associate by proximity and plausibility.
- Normalize extracted values by:
  - Trimming leading/trailing spaces.
  - Removing 

Average Metric: 2.38 / 3 (79.2%): 100%|██████████| 3/3 [00:27<00:00,  9.06s/it]

2025/09/17 10:22:24 INFO dspy.evaluate.evaluate: Average Metric: 2.375 / 3 (79.2%)





2025/09/17 10:24:21 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for self: Task:
Extract a small, fixed set of top-level invoice fields from free-form invoice text (often noisy/OCR’d) and return them as a flat JSON object.

Input:
- A single block of raw invoice text (may contain OCR errors, concatenated labels/values, missing spaces, and mixed order).

Output (strict):
- Return a single JSON object only (no wrapper text, comments, or rationale).
- Include exactly these 8 keys (all must be present), with string values (use "" if unknown):
  1) company
  2) billed_to
  3) invoice_number
  4) invoice_date
  5) total_amount
  6) bank_name
  7) account_name
  8) account_number
- Do not include any extra keys.

General extraction strategy:
- Prefer clearly labeled fields and the most explicit indicators.
- Associate labels and values across adjacent lines when separated by newlines or missing spaces (e.g., “INVOICEBorcelle” → label “INVOICE” + value “Borcelle”).
- When mul

Average Metric: 2.31 / 3 (77.1%): 100%|██████████| 3/3 [00:33<00:00, 11.12s/it]

2025/09/17 10:26:49 INFO dspy.evaluate.evaluate: Average Metric: 2.3125 / 3 (77.1%)





2025/09/17 10:29:17 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for self: Task
Extract a small, fixed set of top-level invoice fields from free-form invoice text (often noisy/OCR’d) and return them as a flat JSON object.

Input
- A single block of raw invoice text (may contain OCR errors, concatenated labels/values, missing spaces, and mixed order).

Output (strict)
- Return a single JSON object only (no wrapper text, comments, or rationale).
- Include exactly these 8 keys (all must be present), with string values (use "" if unknown):
  1) company
  2) billed_to
  3) invoice_number
  4) invoice_date
  5) total_amount
  6) bank_name
  7) account_name
  8) account_number
- Do not include any extra keys.

General approach and normalization
- Prefer clearly labeled fields and the most explicit indicators. Associate labels and values across adjacent lines and across OCR-concatenations (e.g., “INVOICEBorcelle” → label “INVOICE” + value “Borcelle”).
- If a field is truly no

Optimized field accuracy on valid: EvaluationResult(score=71.53, results=<list of 9 results>)


In [35]:
# Evaluate the optimized program on validation set.
# NOTE(review): this repeats the evaluation already done at the end of the GEPA
# cell and overwrites `opt_score` with an equivalent result; the same validation
# examples were also given to GEPA as its valset.
opt_score = evaluate(optimized_program)
print("Optimized field accuracy on validation:", opt_score)


2025/09/17 10:40:22 INFO dspy.evaluate.evaluate: Average Metric: 6.4375 / 9 (71.5%)


Optimized field accuracy on validation: EvaluationResult(score=71.53, results=<list of 9 results>)


In [36]:
# Persist the optimized program under ./invoice_program/ so the GEPA run does not
# have to be repeated.
# NOTE(review): presumably reloaded later with DSPy's program-loading API — confirm
# the matching load call and that the target directory is the intended location.
optimized_program.save("./invoice_program/", save_program=True)

In [37]:
# Full invoice text of the first validation example (the same one used as `sample`).
print(valid_examples[0].text)

Yellow and Black Professional Company Invoice

No Items Qty Price Total
1 Cement 50 $20.00 $1000.00
2 Pvc Pipe 10 $10.00 $100.00
3 Brick 10 $10.00 $100.00
4 Wood Board 10 $10.00 $100.00
Taylor Alonso
Ginyard International Bank
Bank Code (123-456-7890)
Total $1200.00
INVOICE
Ingoude
Company
Jonathan Patterson
+123-456-7890
Invoice to
#123456789
3rd January 2024
Payment Terms
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
THANK YOU


In [30]:
# Gold labels of the first validation example. Note the dataset's conventions
# visible in the output: ISO-formatted date, amount without currency symbol,
# and empty strings for fields absent from the invoice.
print(valid_examples[0].target)

{'invoice_number': '123456789', 'invoice_date': '2024-01-03', 'billed_to': 'Jonathan Patterson', 'company': 'Ingoude', 'total_amount': '1200', 'bank_name': 'Ginyard International Bank', 'account_name': '', 'account_number': ''}


In [38]:
# Quick demo on the first validation example (index 0, not a random pick):
# base extractor vs optimized program vs ground truth.
sample = valid_examples[0]
print("Sample comparison:")
print("\nINPUT TEXT:")
print("-" * 80)
# Show only the first 300 characters of the invoice text.
print(sample.text[:300], "...")
print("-" * 80)

print("\nBASE MODEL PREDICTION:")
base_pred = extractor(text=sample.text)
print(f"Rationale: {base_pred.rationale}")
print("\nExtracted fields:")
# `or {}` keeps the loop safe when `extracted` is None or empty.
for k,v in (base_pred.extracted or {}).items():
    print(f"{k:20s}: {v}")

print("\nOPTIMIZED MODEL PREDICTION:")
# `opt_pred` is reused by the field-comparison cell that follows.
opt_pred = optimized_program(text=sample.text)
print(f"Rationale: {opt_pred.rationale}")
print("\nExtracted fields:")
for k,v in (opt_pred.extracted or {}).items():
    print(f"{k:20s}: {v}")

print("\nGROUND TRUTH:")
print("\nExtracted fields:")
for k,v in sample.target.items():
    print(f"{k:20s}: {v}")


Sample comparison:

INPUT TEXT:
--------------------------------------------------------------------------------
Yellow and Black Professional Company Invoice

No Items Qty Price Total
1 Cement 50 $20.00 $1000.00
2 Pvc Pipe 10 $10.00 $100.00
3 Brick 10 $10.00 $100.00
4 Wood Board 10 $10.00 $100.00
Taylor Alonso
Ginyard International Bank
Bank Code (123-456-7890)
Total $1200.00
INVOICE
Ingoude
Company
Jonathan  ...
--------------------------------------------------------------------------------

BASE MODEL PREDICTION:
Rationale: Parsed the invoice text and extracted identifiable fields. Detected:
- Vendor: "Yellow and Black Professional Company"
- Invoice number: "#123456789"
- Invoice date: "3rd January 2024" (ISO: 2024-01-03)
- Line items (4): Cement, Pvc Pipe, Brick, Wood Board with qty, unit price, line totals
- Calculated subtotal from line items: $1300.00
- Total shown on invoice: $1200.00 (mismatch vs calculated subtotal)
- Bank: "Ginyard International Bank" and "Bank Code (123-4

In [39]:
# Compare the optimized prediction with ground truth, field by field.
# Relies on `sample` and `opt_pred` defined by the demo cell that precedes this one.

# `extracted` can be None or a non-dict when the LM output is malformed; treat
# that as "nothing extracted" instead of failing on `.get`.
predicted_fields = getattr(opt_pred, "extracted", None)
if not isinstance(predicted_fields, dict):
    predicted_fields = {}

print("\nField Comparison:")
print("-" * 80)
for field, gt_value in sample.target.items():
    # Track absence explicitly so a gold value that happens to be the string
    # "MISSING" cannot be reported as a match for an absent key.
    is_missing = field not in predicted_fields
    opt_value = "MISSING" if is_missing else predicted_fields[field]
    match = "✓" if (not is_missing and gt_value == opt_value) else "✗"
    print(f"{field:20s}: {match}")
    if match == "✗":
        print(f"  Ground truth: {gt_value}")
        print(f"  Prediction:   {opt_value}")
print("-" * 80)



Field Comparison:
--------------------------------------------------------------------------------
invoice_number      : ✗
  Ground truth: 123456789
  Prediction:   #123456789
invoice_date        : ✗
  Ground truth: 2024-01-03
  Prediction:   3rd January 2024
billed_to           : ✗
  Ground truth: Jonathan Patterson
  Prediction:   Taylor Alonso
company             : ✗
  Ground truth: Ingoude
  Prediction:   Ingoude Company
total_amount        : ✗
  Ground truth: 1200
  Prediction:   $1200.00
bank_name           : ✓
account_name        : ✓
account_number      : ✓
--------------------------------------------------------------------------------


In [40]:
# Show the last interaction for transparency.
print("\nInspecting last prompt:")
# Call the optimized program again (result discarded) — presumably so that its
# request is the most recent entry in the LM history.
optimized_program(text=sample.text)
# Print the single most recent LM interaction (n=1).
dspy.inspect_history(n=1)


Inspecting last prompt:




[34m[2025-09-17T10:40:39.735766][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str): Raw invoice text
Your output fields are:
1. `rationale` (str): Brief reasoning, list detected fields
2. `extracted` (dict): JSON dict with dataset keys and string values
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## rationale ## ]]
{rationale}

[[ ## extracted ## ]]
{extracted}        # note: the value you produce must adhere to the JSON schema: {"type": "object", "additionalProperties": true}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Task
        From free-form invoice text (plain text), extract a small, fixed set of top-level invoice fields and return them as a flat JSON object.
        
        Input
        - A single string that contains the full invoice text. The text may be noisy, include template/branding phrases, and may pre