In [11]:
# Install dependencies (run once per environment; restart the kernel afterwards
# if pip reports that packages were updated).
# NOTE(review): versions are unpinned, so a fresh environment may resolve a
# different dspy release than the one the saved outputs were produced with —
# consider pinning once the working versions are confirmed.
%pip install -q dspy pandas python-dotenv

# Dataset source (the loading cell expects the file to be named
# invoice_ner_dataset.csv):
# https://www.kaggle.com/datasets/nikitpatel/invoice-ner-dataset?resource=download


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Basic imports and environment setup
import os
import json
import dspy
import pandas as pd
from dotenv import load_dotenv

# Read environment variables from a local .env file; the LM below expects
# OPENAI_API_KEY to be defined there (or already present in the environment).
load_dotenv()

# Build the language model client and register it as DSPy's default LM.
# NOTE(review): temperature=1 / max_tokens=16000 are presumably the values DSPy
# requires for OpenAI reasoning-style models — confirm before changing them.
lm = dspy.LM(
    "openai/gpt-5-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=1,
    max_tokens=16000,
)
dspy.configure(lm=lm)

print("DSPy configured for invoice extraction.")


DSPy configured for invoice extraction.


In [20]:
# Load dataset and preview a few rows
# Locate invoice_ner_dataset.csv. Candidates are tried in order:
#   1. next to this file when `__file__` is defined (e.g. the notebook exported
#      and run as a script); otherwise the ./dspy folder under the working dir,
#   2. ./dspy/ under the working directory,
#   3. the working directory itself.
if "__file__" in globals():
    # Use the *variable* __file__; the string literal "__file__" would merely be
    # resolved as a relative path and always yield the working directory.
    nb_dir = os.path.dirname(os.path.abspath(__file__))
else:
    # Notebook kernels do not define __file__, so fall back to ./dspy.
    nb_dir = os.path.abspath("./dspy")
candidates = [
    os.path.join(nb_dir, "invoice_ner_dataset.csv"),
    os.path.abspath("./dspy/invoice_ner_dataset.csv"),
    os.path.abspath("invoice_ner_dataset.csv"),
]
for p in candidates:
    # isfile (not exists): a directory with this name must not be accepted,
    # because the path is handed to a CSV reader afterwards.
    if os.path.isfile(p):
        csv_path = p
        break
else:
    # for/else: reached only when no candidate matched (loop ended without break).
    raise FileNotFoundError("invoice_ner_dataset.csv not found in expected locations")

# Read the CSV located above and show its size plus the first two rows.
raw_df = pd.read_csv(csv_path)
print("Rows:", raw_df.shape[0])
print(raw_df.head(2))

# Final_Output holds JSON text; decode every cell into a Python dict.
raw_df["Final_Output"] = raw_df["Final_Output"].apply(json.loads)

# Deliberately tiny splits for fast iteration: the first 30 rows are used for
# training/optimization, the following 10 rows for validation.
train_df = raw_df.head(30).copy()
valid_df = raw_df.iloc[30:40].copy()

def to_examples(df):
    """Turn every row of ``df`` into a ``dspy.Example``.

    The ``Input`` column is stored as the example's ``text`` field (the only
    field marked as an input) and the ``Final_Output`` column as its ``target``.

    Returns:
        A list with one example per row, in row order.
    """
    return [
        dspy.Example(text=record["Input"], target=record["Final_Output"]).with_inputs("text")
        for _, record in df.iterrows()
    ]

# Convert both splits into lists of dspy.Example with the to_examples helper.
train_examples, valid_examples = map(to_examples, (train_df, valid_df))

print("Train examples: {}, Valid examples: {}".format(len(train_examples), len(valid_examples)))


Rows: 67
                                               Input  \
0  Cream and White Simple Minimalist Catering Ser...   
1  Beige Elegant Professional Business Invoice\n\...   

                                        Final_Output  
0  {"TOTAL_AMOUNT": "$1000", "DUE_AMOUNT": "$550"...  
1  {"INVOICE_NUMBER": "#01234", "BILLED_TO": "Est...  
Train examples: 30, Valid examples: 10


In [28]:
# Define a signature for extracting a JSON dict of invoice fields.
# NOTE: the class docstring is not only documentation — it appears verbatim as the
# task objective in the prompt sent to the LM, so edit it deliberately.
class InvoiceExtraction(dspy.Signature):
    """
    Extract key-value invoice fields as a JSON dict from free-form invoice text.
    """
    # Input: the invoice as plain text.
    text: str = dspy.InputField(description="Raw invoice text")
    # Outputs appear in the prompt in declaration order: the short rationale is
    # requested before the extracted dict.
    rationale: str = dspy.OutputField(description="Brief reasoning, list detected fields")
    # NOTE(review): the description asks for "dataset keys", but the signature never
    # lists which keys the dataset uses — the model has to guess them unless
    # few-shot demos or instructions supply them.
    extracted: dict = dspy.OutputField(description="JSON dict with only UPPERCASE dataset keys and string values")

# Base module (simple Predict): the un-optimized baseline built from the signature.
extractor = dspy.Predict(InvoiceExtraction)

print("Signature and extractor ready.")


Signature and extractor ready.


In [29]:
# Define a per-field accuracy metric: correct_fields / total_gold_fields
from dspy.evaluate import Evaluate


def normalize_dict(d: dict) -> dict:
    """Return a copy of ``d`` with keys and values normalized for comparison.

    Keys are stringified, stripped and upper-cased. Values are stringified, have
    commas removed, have all whitespace runs collapsed to a single space (which
    also strips both ends), and finally have the space after a dollar sign
    removed, so ``" $  1,000.00 "`` becomes ``"$1000.00"``.

    Args:
        d: Mapping of field name to value. ``None`` or any non-mapping object
            (for example a malformed model output such as a plain string) is
            treated as "no fields".

    Returns:
        A new ``dict[str, str]``; empty when ``d`` is ``None`` or not a mapping.
    """
    from collections.abc import Mapping  # local import keeps the cell self-contained

    if not isinstance(d, Mapping):
        # Covers None as well as non-dict model outputs, which would otherwise
        # raise AttributeError on ``.items()`` in the middle of an evaluation.
        return {}

    def norm_key(k: str) -> str:
        # Uppercase keys to align with dataset convention
        return str(k).strip().upper()

    def norm_val(v: str) -> str:
        s = str(v).replace(",", "")
        # Collapse whitespace *before* handling "$ " so that variants such as
        # "$  100" or "$\t100" also normalize to "$100".
        s = " ".join(s.split())
        s = s.replace("$ ", "$")
        return s

    return {norm_key(k): norm_val(v) for k, v in d.items()}


def field_accuracy_metric(example: dspy.Example, pred: dspy.Prediction, trace=None) -> float:
    """Fraction of gold fields that the prediction reproduces exactly.

    Both ``example.target`` and ``pred.extracted`` (an empty dict when the
    attribute is absent) are passed through ``normalize_dict`` first. A gold
    field counts as correct when the same normalized key is present in the
    prediction with an equal normalized value. Extra predicted fields are not
    penalized. ``trace`` is accepted but not used.

    Returns:
        ``correct / number_of_gold_fields``, or ``0.0`` when there are no gold
        fields.
    """
    expected = normalize_dict(example.target)
    predicted = normalize_dict(getattr(pred, "extracted", {}))
    if not expected:
        return 0.0
    hits = sum(
        1
        for key, value in expected.items()
        if key in predicted and predicted[key] == value
    )
    return hits / len(expected)


# Provide the devset at construction per latest API
# The evaluator is bound to the validation examples and the per-field metric and is
# reused by later cells to score other programs on the same data.
# NOTE(review): `ordered=True` — confirm the installed dspy version's Evaluate
# actually uses this keyword rather than silently ignoring it.
evaluate = Evaluate(devset=valid_examples, metric=field_accuracy_metric, ordered=True)

# Standard usage: pass the module directly
# Baseline: score of the un-optimized extractor. The call returns an
# EvaluationResult whose `score` is reported as a percentage.
initial_score = evaluate(extractor)
print("Initial field accuracy on valid:", initial_score)


2025/09/16 12:44:50 INFO dspy.evaluate.evaluate: Average Metric: 3.228846153846154 / 10 (32.3%)


Initial field accuracy on valid: EvaluationResult(score=32.29, results=<list of 10 results>)


In [30]:
# Compare the base (un-optimized) prediction with the ground truth on one sample.
# The optimized program is only created further down, so it is not shown here.
sample = valid_examples[0]  # fixed first validation example
print("Sample comparison:")
print("\nINPUT TEXT:")
print("-" * 80)
# Only the first 300 characters of the invoice are shown to keep the output short.
print(sample.text[:300], "...")
print("-" * 80)

print("\nBASE MODEL PREDICTION:")
base_pred = extractor(text=sample.text)
print(f"Rationale: {base_pred.rationale}")
print("\nExtracted fields:")
# `or {}` guards against the model returning no dict for `extracted`.
for k,v in (base_pred.extracted or {}).items():
    print(f"{k:20s}: {v}")

print("\nGROUND TRUTH:")
print("\nExtracted fields:")
for k,v in sample.target.items():
    print(f"{k:20s}: {v}")


Sample comparison:

INPUT TEXT:
--------------------------------------------------------------------------------
Green Blue Pink Vintage Retro Freelance Invoice

NO. DESCRIPTION RATE QTY TOTAL
01 Graphic design for website 45 20 900.00
02 Graphic design for social content 45 8 360.00
03 Account management fee 280 1 280.00
04 Two days onsite 640 2 1280.00
SUB TOTAL $ 2820.00
TAX $ 225.60
S&H nil
TOTAL $ 3045.60 ...
--------------------------------------------------------------------------------

BASE MODEL PREDICTION:
Rationale: Detected invoice fields parsed from the text:
- Invoice title
- Invoice number ("NO. 001")
- Invoice date ("02.05.2024")
- From (biller) name, company, email, address, code
- To (client) name, company, email, address, code
- Line items (4 entries) with rate, quantity, totals
- Subtotal, Tax, Shipping (S&H), Total
- Bank / account details (Account Name, Bank, Acc)
- Payment instructions / terms ("Please pay within 60 days", cheques payable)
- Notes text

Extracte

In [34]:
# Optimize with MiPROv2 (recommended in tutorial)
# WARNING: expensive cell — the saved log spans roughly an hour of LM calls
# (12:48 to 13:47). Avoid re-running it casually.
from dspy import MIPROv2

# auto="medium": per the saved log this resolves to 18 trials, 12 few-shot
# candidate sets and 6 instruction candidates.
mipro = MIPROv2(metric=field_accuracy_metric, auto="medium")


# No explicit validation set is passed; the log reports "valset size: 24", so the
# optimizer presumably carves its own validation subset out of the 30 training
# examples — TODO confirm against the MIPROv2 documentation.
optimized = mipro.compile(
    extractor,
    trainset=train_examples,
    max_bootstrapped_demos=4,
    minibatch=False
)

# Standard usage: evaluate the optimized module directly
# NOTE(review): in the saved run the optimized program scores 29.94 on valid versus
# 32.29 for the baseline, i.e. no improvement on held-out data. With only 10
# validation examples the difference may well be noise.
opt_score = evaluate(optimized)
print("Optimized field accuracy on valid:", opt_score)


2025/09/16 12:48:06 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: False
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 24

2025/09/16 12:48:06 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/09/16 12:48:06 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/09/16 12:48:06 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


 67%|██████▋   | 4/6 [02:00<01:00, 30.21s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/12


 67%|██████▋   | 4/6 [00:18<00:09,  4.68s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/12


 67%|██████▋   | 4/6 [00:00<00:00, 32.59it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 6/12


 50%|█████     | 3/6 [00:21<00:21,  7.26s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 7/12


 67%|██████▋   | 4/6 [00:00<00:00, 133.86it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 8/12


 17%|█▋        | 1/6 [00:00<00:00, 138.35it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 9/12


 33%|███▎      | 2/6 [00:00<00:00, 130.83it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 10/12


 17%|█▋        | 1/6 [00:00<00:00, 129.99it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 11/12


 67%|██████▋   | 4/6 [00:00<00:00, 168.73it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 12/12


 17%|█▋        | 1/6 [00:00<00:00, 127.72it/s]
2025/09/16 12:50:48 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/09/16 12:50:48 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/09/16 12:51:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...

2025/09/16 12:58:53 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/09/16 12:58:53 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Extract key-value invoice fields as a JSON dict from free-form invoice text.

2025/09/16 12:58:53 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are an expert invoice-data extraction assistant. Your task is to implement the Predict(text) -> (rationale, extracted) behavior: take a single block of OCR/plain invoice text (may contain one or more invoices) and return:
  1) rationale — a concise, human-readable explanation that maps each detected field to the exact source text snippets that produced it and describes any inference or validation performed; and
  2) extracted — a normalized dictionary (or list of dictionaries if multiple invoices are detected) of canonical invoice fields usable by downstream systems.

Follow these rules ex

Average Metric: 8.73 / 24 (36.4%): 100%|██████████| 24/24 [01:31<00:00,  3.81s/it]

2025/09/16 13:00:24 INFO dspy.evaluate.evaluate: Average Metric: 8.72515163285287 / 24 (36.4%)
2025/09/16 13:00:24 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 36.35

2025/09/16 13:00:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 18 =====



Average Metric: 6.60 / 24 (27.5%): 100%|██████████| 24/24 [03:29<00:00,  8.73s/it]

2025/09/16 13:03:54 INFO dspy.evaluate.evaluate: Average Metric: 6.600238813474108 / 24 (27.5%)
2025/09/16 13:03:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 27.5 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/09/16 13:03:54 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5]
2025/09/16 13:03:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 36.35


2025/09/16 13:03:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 18 =====



Average Metric: 5.07 / 24 (21.1%): 100%|██████████| 24/24 [02:13<00:00,  5.58s/it]

2025/09/16 13:06:08 INFO dspy.evaluate.evaluate: Average Metric: 5.06882803728779 / 24 (21.1%)
2025/09/16 13:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 21.12 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 2'].
2025/09/16 13:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12]
2025/09/16 13:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 36.35


2025/09/16 13:06:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 18 =====



Average Metric: 9.11 / 24 (38.0%): 100%|██████████| 24/24 [01:56<00:00,  4.84s/it]

2025/09/16 13:08:04 INFO dspy.evaluate.evaluate: Average Metric: 9.112777938435833 / 24 (38.0%)
2025/09/16 13:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 37.97
2025/09/16 13:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 37.97 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/09/16 13:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97]
2025/09/16 13:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 18 =====



Average Metric: 3.71 / 24 (15.5%): 100%|██████████| 24/24 [03:54<00:00,  9.76s/it]

2025/09/16 13:11:58 INFO dspy.evaluate.evaluate: Average Metric: 3.712158648884655 / 24 (15.5%)
2025/09/16 13:11:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 15.47 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/09/16 13:11:58 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47]
2025/09/16 13:11:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:11:58 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 18 =====



Average Metric: 4.80 / 24 (20.0%): 100%|██████████| 24/24 [03:37<00:00,  9.04s/it]

2025/09/16 13:15:36 INFO dspy.evaluate.evaluate: Average Metric: 4.801246848266972 / 24 (20.0%)
2025/09/16 13:15:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.01 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2025/09/16 13:15:36 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01]
2025/09/16 13:15:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:15:36 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 18 =====



Average Metric: 4.17 / 24 (17.4%): 100%|██████████| 24/24 [02:54<00:00,  7.25s/it]

2025/09/16 13:18:30 INFO dspy.evaluate.evaluate: Average Metric: 4.168006743858137 / 24 (17.4%)
2025/09/16 13:18:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.37 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 6'].
2025/09/16 13:18:30 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37]
2025/09/16 13:18:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:18:30 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 18 =====



Average Metric: 0.00 / 24 (0.0%): 100%|██████████| 24/24 [04:02<00:00, 10.12s/it]

2025/09/16 13:22:32 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 24 (0.0%)
2025/09/16 13:22:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1'].
2025/09/16 13:22:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0]
2025/09/16 13:22:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:22:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 18 =====



Average Metric: 4.66 / 24 (19.4%): 100%|██████████| 24/24 [03:24<00:00,  8.51s/it]

2025/09/16 13:25:57 INFO dspy.evaluate.evaluate: Average Metric: 4.66480279696224 / 24 (19.4%)
2025/09/16 13:25:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 19.44 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/09/16 13:25:57 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44]
2025/09/16 13:25:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:25:57 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 18 =====



Average Metric: 4.88 / 24 (20.3%): 100%|██████████| 24/24 [03:12<00:00,  8.02s/it]

2025/09/16 13:29:09 INFO dspy.evaluate.evaluate: Average Metric: 4.881148950808393 / 24 (20.3%)
2025/09/16 13:29:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.34 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 10'].
2025/09/16 13:29:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34]
2025/09/16 13:29:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:29:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 18 =====



Average Metric: 6.89 / 24 (28.7%): 100%|██████████| 24/24 [02:00<00:00,  5.04s/it]

2025/09/16 13:31:10 INFO dspy.evaluate.evaluate: Average Metric: 6.892343982127264 / 24 (28.7%)
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.72 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9'].
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72]
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 12 / 18 =====



Average Metric: 8.73 / 24 (36.4%): 100%|██████████| 24/24 [00:00<00:00, 4450.98it/s]

2025/09/16 13:31:10 INFO dspy.evaluate.evaluate: Average Metric: 8.72515163285287 / 24 (36.4%)
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0'].
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35]
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 18 =====



Average Metric: 9.11 / 24 (38.0%): 100%|██████████| 24/24 [00:00<00:00, 2106.01it/s]

2025/09/16 13:31:10 INFO dspy.evaluate.evaluate: Average Metric: 9.112777938435833 / 24 (38.0%)
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 37.97 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35, 37.97]
2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:31:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 14 / 18 =====



Average Metric: 3.78 / 24 (15.7%): 100%|██████████| 24/24 [04:05<00:00, 10.23s/it]

2025/09/16 13:35:16 INFO dspy.evaluate.evaluate: Average Metric: 3.7783351194728905 / 24 (15.7%)
2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 15.74 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 6'].
2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35, 37.97, 15.74]
2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 15 / 18 =====



Average Metric: 9.11 / 24 (38.0%): 100%|██████████| 24/24 [00:00<00:00, 3812.43it/s]

2025/09/16 13:35:16 INFO dspy.evaluate.evaluate: Average Metric: 9.112777938435833 / 24 (38.0%)
2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 37.97 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35, 37.97, 15.74, 37.97]
2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:35:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 16 / 18 =====



Average Metric: 0.00 / 24 (0.0%): 100%|██████████| 24/24 [04:42<00:00, 11.76s/it]

2025/09/16 13:39:58 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 24 (0.0%)
2025/09/16 13:39:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 8'].
2025/09/16 13:39:58 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35, 37.97, 15.74, 37.97, 0.0]
2025/09/16 13:39:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 37.97


2025/09/16 13:39:58 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 17 / 18 =====



Average Metric: 9.66 / 24 (40.2%): 100%|██████████| 24/24 [01:41<00:00,  4.24s/it]

2025/09/16 13:41:40 INFO dspy.evaluate.evaluate: Average Metric: 9.657095463016516 / 24 (40.2%)
2025/09/16 13:41:40 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 40.24
2025/09/16 13:41:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.24 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 11'].
2025/09/16 13:41:40 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35, 37.97, 15.74, 37.97, 0.0, 40.24]
2025/09/16 13:41:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 40.24


2025/09/16 13:41:40 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 18 / 18 =====



Average Metric: 6.60 / 24 (27.5%): 100%|██████████| 24/24 [01:38<00:00,  4.09s/it]

2025/09/16 13:43:18 INFO dspy.evaluate.evaluate: Average Metric: 6.599753059118385 / 24 (27.5%)
2025/09/16 13:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 27.5 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 7'].
2025/09/16 13:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35, 37.97, 15.74, 37.97, 0.0, 40.24, 27.5]
2025/09/16 13:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 40.24


2025/09/16 13:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 18 =====



Average Metric: 2.87 / 24 (12.0%): 100%|██████████| 24/24 [04:02<00:00, 10.09s/it]

2025/09/16 13:47:20 INFO dspy.evaluate.evaluate: Average Metric: 2.8722477857539777 / 24 (12.0%)
2025/09/16 13:47:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 11.97 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 11'].
2025/09/16 13:47:20 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [36.35, 27.5, 21.12, 37.97, 15.47, 20.01, 17.37, 0.0, 19.44, 20.34, 28.72, 36.35, 37.97, 15.74, 37.97, 0.0, 40.24, 27.5, 11.97]
2025/09/16 13:47:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 40.24


2025/09/16 13:47:20 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 40.24!





2025/09/16 13:48:08 INFO dspy.evaluate.evaluate: Average Metric: 2.9943223443223443 / 10 (29.9%)


Optimized field accuracy on valid: EvaluationResult(score=29.94, results=<list of 10 results>)


In [42]:
# Evaluate the optimized model on validation set
# NOTE(review): this repeats the evaluation already performed at the end of the
# optimization cell and overwrites `opt_score` with the new result; keep only one
# of the two to avoid paying for the LM calls twice.
opt_score = evaluate(optimized)
print("Optimized field accuracy on validation:", opt_score)


2025/09/16 13:54:44 INFO dspy.evaluate.evaluate: Average Metric: 2.9943223443223443 / 10 (29.9%)


Optimized field accuracy on validation: EvaluationResult(score=29.94, results=<list of 10 results>)


In [39]:
# Quick demo on the first validation example (fixed index 0, not random):
# base prediction vs optimized prediction vs ground truth.
# NOTE(review): the saved execution counts are out of order (this cell ran before
# the re-evaluation cell above it); verify the notebook with Restart & Run All.
sample = valid_examples[0]
print("Sample comparison:")
print("\nINPUT TEXT:")
print("-" * 80)
# Only the first 300 characters of the invoice are shown to keep the output short.
print(sample.text[:300], "...")
print("-" * 80)

print("\nBASE MODEL PREDICTION:")
base_pred = extractor(text=sample.text)
print(f"Rationale: {base_pred.rationale}")
print("\nExtracted fields:")
# `or {}` guards against the model returning no dict for `extracted`.
for k,v in (base_pred.extracted or {}).items():
    print(f"{k:20s}: {v}")

print("\nOPTIMIZED MODEL PREDICTION:")
# `opt_pred` is reused by the field-comparison cell that follows.
opt_pred = optimized(text=sample.text)
print(f"Rationale: {opt_pred.rationale}")
print("\nExtracted fields:")
for k,v in (opt_pred.extracted or {}).items():
    print(f"{k:20s}: {v}")

print("\nGROUND TRUTH:")
print("\nExtracted fields:")
for k,v in sample.target.items():
    print(f"{k:20s}: {v}")


Sample comparison:

INPUT TEXT:
--------------------------------------------------------------------------------
Green Blue Pink Vintage Retro Freelance Invoice

NO. DESCRIPTION RATE QTY TOTAL
01 Graphic design for website 45 20 900.00
02 Graphic design for social content 45 8 360.00
03 Account management fee 280 1 280.00
04 Two days onsite 640 2 1280.00
SUB TOTAL $ 2820.00
TAX $ 225.60
S&H nil
TOTAL $ 3045.60 ...
--------------------------------------------------------------------------------

BASE MODEL PREDICTION:
Rationale: Detected invoice fields parsed from the text:
- Invoice title
- Invoice number ("NO. 001")
- Invoice date ("02.05.2024")
- From (biller) name, company, email, address, code
- To (client) name, company, email, address, code
- Line items (4 entries) with rate, quantity, totals
- Subtotal, Tax, Shipping (S&H), Total
- Bank / account details (Account Name, Bank, Acc)
- Payment instructions / terms ("Please pay within 60 days", cheques payable)
- Notes text

Extracte

In [40]:
# Compare predictions with ground truth, field by field.
# Matching goes through normalize_dict — the same normalization that
# field_accuracy_metric applies — so the ✓/✗ marks agree with the reported score
# (key case, commas, "$ " spacing and extra whitespace are ignored).
# Requires `sample` and `opt_pred` from the demo cell and `normalize_dict` from
# the metric cell.
predicted_fields = opt_pred.extracted or {}  # guard: the model may return no dict
gold_norm = normalize_dict(sample.target)
pred_norm = normalize_dict(predicted_fields)
# Raw predicted values, keyed the same way normalize_dict keys them, so the
# printout shows what the model actually produced.
pred_raw_by_key = {str(k).strip().upper(): v for k, v in predicted_fields.items()}

print("\nField Comparison:")
print("-" * 80)
for field, gt_value in sample.target.items():
    key = str(field).strip().upper()
    opt_value = pred_raw_by_key.get(key, "MISSING")
    is_match = key in pred_norm and pred_norm[key] == gold_norm[key]
    match = "✓" if is_match else "✗"
    print(f"{field:20s}: {match}")
    if match == "✗":
        print(f"  Ground truth: {gt_value}")
        print(f"  Prediction:   {opt_value}")
print("-" * 80)



Field Comparison:
--------------------------------------------------------------------------------
INVOICE_NUMBER      : ✓
DATE                : ✗
  Ground truth: 02.05.2024
  Prediction:   MISSING
BILL_TO             : ✗
  Ground truth: Rosa Maria Aguado
  Prediction:   MISSING
COMPANY             : ✗
  Ground truth: Aldenaire & Partners
  Prediction:   MISSING
ADDRESS             : ✗
  Ground truth: 123 Anywhere St., Any City, ST 12345
  Prediction:   MISSING
EMAIL               : ✗
  Ground truth: hello@reallygreatsite.com
  Prediction:   MISSING
BANK_NAME           : ✓
BANK_ACCOUNT        : ✗
  Ground truth: 123-456-7890
  Prediction:   MISSING
ITEM_DESCRIPTION    : ✗
  Ground truth: Two days onsite
  Prediction:   MISSING
QTY                 : ✗
  Ground truth: 2
  Prediction:   MISSING
RATE                : ✗
  Ground truth: 640
  Prediction:   MISSING
AMOUNT              : ✗
  Ground truth: 1280.00
  Prediction:   MISSING
SUBTOTAL            : ✗
  Ground truth: $ 2820.00
  Pred

In [38]:
# Show the last interaction for transparency
# Prints the most recent LM call (n=1) recorded by DSPy. Which call counts as
# "last" depends on the order in which cells were executed, so the invoice shown
# is not necessarily the sample used in the demo cells.
print("\nInspecting last prompt:")
dspy.inspect_history(n=1)


Inspecting last prompt:




[34m[2025-09-16T13:50:03.809543][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str): Raw invoice text
Your output fields are:
1. `rationale` (str): Brief reasoning, list detected fields
2. `extracted` (dict): JSON dict with only UPPERCASE dataset keys and string values
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## rationale ## ]]
{rationale}

[[ ## extracted ## ]]
{extracted}        # note: the value you produce must adhere to the JSON schema: {"type": "object", "additionalProperties": true}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Extract key-value invoice fields as a JSON dict from free-form invoice text.


[31mUser message:[0m

[[ ## text ## ]]
Black and White Minimalist Business Invoice

Invoice16 June 2025

Invoice No. 12345

BILL TO:

Marceline Anderson

+123-456-7890

DESCRIPTION PRICE

TAX (20%)SUBTOTAL

S