In [None]:
!pip install -U dspy
!pip install -U datasets # HuggingFace Datasets for Fine-tuning
from google.colab import userdata

Collecting dspy
  Downloading dspy-3.0.3-py3-none-any.whl.metadata (7.2 kB)
Collecting backoff>=2.2 (from dspy)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting optuna>=3.4.0 (from dspy)
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting magicattr>=0.1.6 (from dspy)
  Downloading magicattr-0.1.6-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting litellm>=1.64.0 (from dspy)
  Downloading litellm-1.77.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.0 (from dspy)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting json-repair>=0.30.0 (from dspy)
  Downloading json_repair-0.51.0-py3-none-any.whl.metadata (11 kB)
Collecting asyncer==0.0.8 (from dspy)
  Downloading asyncer-0.0.8-py3-none-any.whl.metadata (6.7 kB)
Collecting gepa==0.0.7 (from gepa[dspy]==0.0.7->dspy)
  Downloading gepa-0.0

In [None]:
import dspy
# Credentials come from Colab's secret store (`userdata` is imported in the setup cell).
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
HF_TOKEN = userdata.get("HF_TOKEN")

# Build the gpt-4o-mini client and register it as the LM DSPy is configured with.
lm = dspy.LM("openai/gpt-4o-mini", api_key=OPENAI_API_KEY)
dspy.configure(lm=lm)

## Math

In [None]:
# Chain-of-thought predictor whose `answer` output field is typed as a float.
math = dspy.ChainOfThought(
    "question -> answer: float",
)
math(
    question="Two dice are tossed. What is the probability that the sum equals two?",
)

Prediction(
    reasoning='When two dice are tossed, each die has 6 faces, resulting in a total of 6 * 6 = 36 possible outcomes. The only way to achieve a sum of 2 is if both dice show a 1 (1, 1). There is only 1 favorable outcome for this event. Therefore, the probability of the sum equaling 2 is the number of favorable outcomes divided by the total number of outcomes, which is 1/36.',
    answer=0.027777777777777776
)

## RAG

In [None]:
def search_wikipedia(query: str) -> list[str]:
    """Best-effort lookup of the top-3 passages for ``query`` on the ColBERTv2 server.

    Every failure mode is reported with ``print`` and turned into an empty list,
    so callers always get a ``list[str]`` back and never an exception.

    Args:
        query: Free-text search query.

    Returns:
        The ``"text"`` of each returned entry, or ``[]`` when the lookup raised
        or the results were empty / not a list of entries containing ``"text"``.
    """
    try:
        retriever = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts")
        hits = retriever(query, k=3)

        # Only trust the payload when it is a non-empty list whose entries all carry 'text'.
        well_formed = bool(hits) and isinstance(hits, list) and all('text' in hit for hit in hits)
        if not well_formed:
            print("Warning: 'text' key not found in all results or results format unexpected.")
            return []

        return [hit["text"] for hit in hits]
    except KeyError as e:
        # Reported separately from other errors, with a hint about the 'topk' key.
        print(f"Error accessing expected key: {e}")
        print("The response from the ColBERTv2 server did not contain the expected 'topk' key.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []


# Chain-of-thought predictor that answers `question` from the supplied `context`.
rag = dspy.ChainOfThought(
    "context, question -> response",
)

# Alternative question to try:
# question = "What's the name of the castle that David Gregory inherited?"
question = "How long did Steve Jobs run Apple for?"

# The retrieved passages (possibly an empty list) are passed straight in as context.
rag(
    context=search_wikipedia(question),
    question=question,
)

Error accessing expected key: 'topk'
The response from the ColBERTv2 server did not contain the expected 'topk' key.


Prediction(
    reasoning='Steve Jobs co-founded Apple Inc. in 1976 and played a significant role in the company until he was ousted in 1985. He returned to Apple in 1997 and remained with the company until his death in 2011. Therefore, he ran Apple for approximately 9 years during his second tenure and about 9 years during his first tenure, totaling around 18 years.',
    response='Steve Jobs ran Apple for a total of approximately 18 years, with his first tenure from 1976 to 1985 and his second from 1997 until his death in 2011.'
)

## Classification

In [None]:
from typing import Literal

class Classify(dspy.Signature):
    """Classify sentiment of a given sentence."""

    # NOTE(review): DSPy is understood to use the class docstring above as the task
    # instruction sent to the LM, so treat it as prompt text — confirm against the
    # DSPy Signature documentation before rewording it.

    # Input field: the sentence to classify.
    sentence: str = dspy.InputField()
    # Output field: restricted by its type to one of the three listed labels.
    sentiment: Literal["positive", "negative", "neutral"] = dspy.OutputField()
    # Output field: typed as a float; no value range is declared here.
    confidence: float = dspy.OutputField()

# Plain predictor over the Classify signature; the call's result is the cell output.
classify = dspy.Predict(Classify)
classify(
    sentence="This book was super fun to read, though not the last chapter.",
)

Prediction(
    sentiment='positive',
    confidence=0.85
)

## Information Extraction

In [None]:
class ExtractInfo(dspy.Signature):
    """Extract structured information from text."""

    # NOTE(review): DSPy is understood to use the class docstring above as the task
    # instruction sent to the LM — confirm before rewording it.

    # Input field: the raw text to analyse.
    text: str = dspy.InputField()
    # Output field: a title for the text.
    title: str = dspy.OutputField()
    # Output field: a list of heading strings.
    headings: list[str] = dspy.OutputField()
    # Output field: one str->str dict per entity; the keys are not fixed by this signature.
    entities: list[dict[str, str]] = dspy.OutputField(desc="a list of entities and their metadata")

# Predictor built directly from the ExtractInfo signature (dspy.Predict, not ChainOfThought).
module = dspy.Predict(ExtractInfo)

# Adjacent string literals are joined verbatim, so the space separating the two
# sentences must be written explicitly at the end of the first literal.
text = (
    "Apple Inc. announced its latest iPhone 14 today. "
    "The CEO, Tim Cook, highlighted its new features in a press release."
)
# Run the extraction, then print each structured field on its own line.
response = module(text=text)

print(response.title, response.headings, response.entities, sep="\n")

Apple Inc. Announces iPhone 14
['Announcement', 'CEO Statement', 'New Features']
[{'name': 'Apple Inc.', 'type': 'Organization'}, {'name': 'iPhone 14', 'type': 'Product'}, {'name': 'Tim Cook', 'type': 'Person'}]


## Agents

In [None]:
def evaluate_math(expression: str):
    """Evaluate a math expression with DSPy's PythonInterpreter and return the result.

    Args:
        expression: Source text to execute, e.g. ``"9362158 / 1624"``.

    Returns:
        Whatever ``PythonInterpreter.execute`` returns for ``expression``.
    """
    # A fresh interpreter is created for every call. Using it as a context manager
    # shuts it down on exit (also when execute() raises), so repeated tool calls do
    # not accumulate live interpreter instances.
    with dspy.PythonInterpreter({}) as interpreter:
        return interpreter.execute(expression)

def search_wikipedia(query: str) -> list[str]:
    """Search the hosted Wikipedia-abstracts index and return the top-3 passage texts.

    Args:
        query: Free-text search query.

    Returns:
        The ``"text"`` of each returned entry, or an empty list when the results
        are empty or are not a list of entries that all contain ``"text"``.
    """
    results = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts")(query, k=3)
    # Validate the payload shape before indexing, matching the check used by the
    # search_wikipedia definition in the prompt-optimisation section of this notebook.
    if results and isinstance(results, list) and all("text" in x for x in results):
        return [x["text"] for x in results]
    print("Warning: 'text' key not found in all results or results format unexpected.")
    return []

# ReAct agent with a float-typed answer and two tools: a calculator and Wikipedia search.
react = dspy.ReAct(
    "question -> answer: float",
    tools=[evaluate_math, search_wikipedia],
)

pred = react(
    question="What is 9362158 divided by the year of birth of David Gregory of Kinnairdy castle?",
)
print(pred.answer)

5765.0


## Multi-Stage Pipeline

In [None]:

class Outline(dspy.Signature):
    """Outline a thorough overview of a topic."""

    # NOTE(review): DSPy is understood to use the class docstring above as the task
    # instruction sent to the LM — confirm before rewording it.

    # Input field: the topic to outline.
    topic: str = dspy.InputField()
    # Output field: a title for the article.
    title: str = dspy.OutputField()
    # Output field: list of section headings. DraftArticle.forward in this notebook
    # iterates `section_subheadings` and does not read this field.
    sections: list[str] = dspy.OutputField()
    # Output field: dict keyed by section heading, each value a list of subheadings.
    section_subheadings: dict[str, list[str]] = dspy.OutputField(desc="mapping from section headings to subheadings")

class DraftSection(dspy.Signature):
    """Draft a top-level section of an article."""

    # NOTE(review): DSPy is understood to use the class docstring above as the task
    # instruction sent to the LM — confirm before rewording it.

    # Input field: the article topic (DraftArticle passes the outline's title here).
    topic: str = dspy.InputField()
    # Input field: heading of the section to write.
    section_heading: str = dspy.InputField()
    # Input field: subheadings the section should cover.
    section_subheadings: list[str] = dspy.InputField()
    # Output field: the drafted section body.
    content: str = dspy.OutputField(desc="markdown-formatted section")

class DraftArticle(dspy.Module):
    """Two-stage article writer: outline a topic, then draft each outlined section.

    Stage 1 runs the ``Outline`` signature once. Stage 2 runs the ``DraftSection``
    signature once per entry of the outline's ``section_subheadings`` mapping.
    """

    def __init__(self):
        # Let dspy.Module set up its own state before sub-modules are attached.
        super().__init__()
        self.build_outline = dspy.ChainOfThought(Outline)
        self.draft_section = dspy.ChainOfThought(DraftSection)

    def forward(self, topic):
        """Generate an article for ``topic``.

        Args:
            topic: The subject to write about.

        Returns:
            ``dspy.Prediction`` with ``title`` (taken from the outline) and
            ``sections`` (one drafted content string per outlined section,
            in the mapping's iteration order).
        """
        outline = self.build_outline(topic=topic)
        sections = []
        for heading, subheadings in outline.section_subheadings.items():
            # Headings are passed to the drafter already carrying markdown markers.
            section_heading = f"## {heading}"
            section_subheadings = [f"### {subheading}" for subheading in subheadings]
            drafted = self.draft_section(
                topic=outline.title,
                section_heading=section_heading,
                section_subheadings=section_subheadings,
            )
            sections.append(drafted.content)
        return dspy.Prediction(title=outline.title, sections=sections)

# Instantiate the pipeline and draft one article; the topic string is passed to the
# LM as written, so it is spelled correctly ("Apocalyptic").
draft_article = DraftArticle()
article = draft_article(topic="Apocalyptic 2012 Theory")

In [None]:
# Print the whole `article` object; the recorded output is a Prediction with `title` and `sections`.
print(article)

Prediction(
    title='The Apocalyptic 2012 Theory: Origins, Impact, and Aftermath',
    sections=['## Origins of the 2012 Theory\n\n### Mayan Calendar and Cosmology\nThe Mayan civilization, known for its advanced understanding of astronomy and mathematics, developed a complex calendar system that included several cycles. One of the most significant cycles is the Long Count calendar, which spans approximately 5,125.36 years. This calendar is often cited as having a "completion" date of December 21, 2012. Many interpreted this date as a prophecy of an impending apocalypse, despite the fact that the Mayans themselves did not predict an end to the world but rather a transition to a new era.\n\n### Interpretations of End Dates\nThe interpretation of the Mayan calendar\'s end date has varied widely. Some scholars and enthusiasts suggested that it signified a cataclysmic event, while others argued it represented a time of renewal and transformation. This divergence in interpretation fueled s

## Optimising Prompts for a ReAct Agent

In [None]:
import dspy
from dspy.datasets import HotPotQA

# dspy.configure(lm=dspy.LM("openai/gpt-4o-mini", api_key=OPENAI_API_KEY))

def search_wikipedia(query: str) -> list[str]:
    # Retrieval tool for the ReAct agent: top-3 passages from the hosted ColBERTv2 index.
    # Documented with comments rather than a docstring so the function object handed
    # to dspy.ReAct carries exactly the same metadata as before.
    retriever = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts")
    hits = retriever(query, k=3)

    # Guard clause: give up unless we got a non-empty list whose entries all have 'text'.
    if not hits or not isinstance(hits, list) or not all('text' in hit for hit in hits):
        print("Warning: 'text' key not found in all results or results format unexpected.")
        return []

    return [hit["text"] for hit in hits]


# 100 HotPotQA training examples, each marked so that only `question` is an input.
trainset = [
    example.with_inputs('question')
    for example in HotPotQA(train_seed=2024, train_size=100).train
]

# String-signature ReAct agent whose single tool is the Wikipedia search above.
react = dspy.ReAct("question -> answer", tools=[search_wikipedia])

# Thread count: 24 is possible when using a local LLM; 4 is used for API calls.
# (The recorded run below still hit OpenAI rate limits at 4 threads.)
# tp = dspy.MIPROv2(metric=dspy.evaluate.answer_exact_match, auto="light", num_threads=24)
tp = dspy.MIPROv2(
    metric=dspy.evaluate.answer_exact_match,
    auto="light",
    num_threads=4,
)
optimized_react = tp.compile(react, trainset=trainset)

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'hotpot_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
ERROR:datasets.load:`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'hotpot_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


README.md: 0.00B [00:00, ?B/s]

fullwiki/train-00000-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

fullwiki/train-00001-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

fullwiki/validation-00000-of-00001.parqu(…):   0%|          | 0.00/28.0M [00:00<?, ?B/s]

fullwiki/test-00000-of-00001.parquet:   0%|          | 0.00/27.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'hotpot_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
ERROR:datasets.load:`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'hotpot_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
2025/10/03 15:44:04 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 20
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 80

2025/10/03 15:44:04 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/10/03 15:44:04 INFO dspy.teleprompt.mipro_optimiz

Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


 65%|██████▌   | 13/20 [02:19<01:14, 10.71s/it]


Bootstrapped 4 full traces after 13 examples for up to 1 rounds, amounting to 13 attempts.
Bootstrapping set 4/6


 30%|███       | 6/20 [00:40<01:35,  6.82s/it]


Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 5/6


 10%|█         | 2/20 [00:13<02:00,  6.69s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/6


 15%|█▌        | 3/20 [00:15<01:30,  5.31s/it]
2025/10/03 15:47:34 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/10/03 15:47:34 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.


2025/10/03 15:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/10/03 15:49:01 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/10/03 15:49:01 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `question`, produce the fields `answer`.

You are an Agent. In each episode, you will be given the fields `question` as input. And you can see your past trajectory so far.
Your goal is to use one or more of the supplied tools to collect any necessary information for producing `answer`.

To do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.
After each tool call, you receive a resulting observation, which gets appended to your trajectory.

When writing next_thought, you may reason about the current situation and plan for future steps.
When selecting the next_tool_name and its next_tool_args, the tool must be one of:

(1) search_wikipedia. It ta

Average Metric: 22.00 / 80 (27.5%): 100%|██████████| 80/80 [04:23<00:00,  3.29s/it]

2025/10/03 15:53:24 INFO dspy.evaluate.evaluate: Average Metric: 22 / 80 (27.5%)
2025/10/03 15:53:24 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 27.5

2025/10/03 15:53:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 25 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [01:52<00:00,  3.21s/it]

2025/10/03 15:55:16 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/10/03 15:55:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/10/03 15:55:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14]
2025/10/03 15:55:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5]
2025/10/03 15:55:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.5


2025/10/03 15:55:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 25 - Minibatch ==



Average Metric: 12.00 / 35 (34.3%): 100%|██████████| 35/35 [01:45<00:00,  3.02s/it]

2025/10/03 15:57:02 INFO dspy.evaluate.evaluate: Average Metric: 12 / 35 (34.3%)
2025/10/03 15:57:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/10/03 15:57:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29]
2025/10/03 15:57:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5]
2025/10/03 15:57:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.5


2025/10/03 15:57:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 25 - Minibatch ==



Average Metric: 8.00 / 35 (22.9%): 100%|██████████| 35/35 [01:44<00:00,  2.99s/it]

2025/10/03 15:58:46 INFO dspy.evaluate.evaluate: Average Metric: 8 / 35 (22.9%)
2025/10/03 15:58:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/10/03 15:58:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86]
2025/10/03 15:58:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5]
2025/10/03 15:58:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.5


2025/10/03 15:58:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 25 - Minibatch ==



Average Metric: 12.00 / 35 (34.3%): 100%|██████████| 35/35 [01:58<00:00,  3.38s/it]

2025/10/03 16:00:45 INFO dspy.evaluate.evaluate: Average Metric: 12 / 35 (34.3%)
2025/10/03 16:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 4'].
2025/10/03 16:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29]
2025/10/03 16:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5]
2025/10/03 16:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.5


2025/10/03 16:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 25 - Minibatch ==



Average Metric: 16.00 / 35 (45.7%): 100%|██████████| 35/35 [01:46<00:00,  3.04s/it]

2025/10/03 16:02:31 INFO dspy.evaluate.evaluate: Average Metric: 16 / 35 (45.7%)
2025/10/03 16:02:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/10/03 16:02:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71]
2025/10/03 16:02:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5]
2025/10/03 16:02:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 27.5


2025/10/03 16:02:31 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 25 - Full Evaluation =====
2025/10/03 16:02:31 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 45.71) from minibatch trials...



Average Metric: 32.00 / 80 (40.0%): 100%|██████████| 80/80 [02:48<00:00,  2.11s/it]

2025/10/03 16:05:20 INFO dspy.evaluate.evaluate: Average Metric: 32 / 80 (40.0%)
2025/10/03 16:05:20 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 40.0
2025/10/03 16:05:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0]
2025/10/03 16:05:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0
2025/10/03 16:05:20 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/10/03 16:05:20 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 25 - Minibatch ==



Average Metric: 12.00 / 35 (34.3%): 100%|██████████| 35/35 [01:34<00:00,  2.69s/it]

2025/10/03 16:06:54 INFO dspy.evaluate.evaluate: Average Metric: 12 / 35 (34.3%)
2025/10/03 16:06:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 0'].
2025/10/03 16:06:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29]
2025/10/03 16:06:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0]
2025/10/03 16:06:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:06:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 25 - Minibatch ==



Average Metric: 14.00 / 35 (40.0%): 100%|██████████| 35/35 [01:44<00:00,  2.99s/it]

2025/10/03 16:08:39 INFO dspy.evaluate.evaluate: Average Metric: 14 / 35 (40.0%)
2025/10/03 16:08:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 1'].
2025/10/03 16:08:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0]
2025/10/03 16:08:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0]
2025/10/03 16:08:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:08:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 25 - Minibatch ==



Average Metric: 10.00 / 35 (28.6%): 100%|██████████| 35/35 [01:49<00:00,  3.13s/it]

2025/10/03 16:10:28 INFO dspy.evaluate.evaluate: Average Metric: 10 / 35 (28.6%)
2025/10/03 16:10:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 0'].
2025/10/03 16:10:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57]
2025/10/03 16:10:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0]
2025/10/03 16:10:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:10:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 25 - Minibatch ==



Average Metric: 8.00 / 35 (22.9%): 100%|██████████| 35/35 [02:09<00:00,  3.71s/it]

2025/10/03 16:12:38 INFO dspy.evaluate.evaluate: Average Metric: 8 / 35 (22.9%)
2025/10/03 16:12:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 3'].
2025/10/03 16:12:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57, 22.86]
2025/10/03 16:12:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0]
2025/10/03 16:12:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:12:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 25 - Minibatch ==



Average Metric: 15.00 / 35 (42.9%): 100%|██████████| 35/35 [00:41<00:00,  1.19s/it]

2025/10/03 16:13:20 INFO dspy.evaluate.evaluate: Average Metric: 15 / 35 (42.9%)
2025/10/03 16:13:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/10/03 16:13:20 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57, 22.86, 42.86]
2025/10/03 16:13:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0]
2025/10/03 16:13:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:13:20 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 25 - Full Evaluation =====
2025/10/03 16:13:20 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 40.0) from minibatch trials...



Average Metric: 29.00 / 80 (36.2%): 100%|██████████| 80/80 [03:34<00:00,  2.68s/it]

2025/10/03 16:16:54 INFO dspy.evaluate.evaluate: Average Metric: 29 / 80 (36.2%)
2025/10/03 16:16:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0, 36.25]
2025/10/03 16:16:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0
2025/10/03 16:16:54 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/10/03 16:16:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 25 - Minibatch ==



Average Metric: 18.00 / 35 (51.4%): 100%|██████████| 35/35 [01:04<00:00,  1.85s/it]

2025/10/03 16:17:59 INFO dspy.evaluate.evaluate: Average Metric: 18 / 35 (51.4%)
2025/10/03 16:17:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 51.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 2'].
2025/10/03 16:17:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57, 22.86, 42.86, 51.43]
2025/10/03 16:17:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0, 36.25]
2025/10/03 16:17:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:17:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 25 - Minibatch ==



Average Metric: 8.00 / 25 (32.0%):  71%|███████▏  | 25/35 [02:07<01:18,  7.86s/it]



Average Metric: 9.00 / 26 (34.6%):  74%|███████▍  | 26/35 [02:16<01:12,  8.09s/it]

2025/10/03 16:20:24 ERROR dspy.utils.parallelizer: Error for Example({'question': 'So Long, Scarecrow is titled in reference to which 1939 musical fantasy film?', 'answer': 'The Wizard of Oz'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on tokens per min (TPM): Limit 200000, Used 196948, Requested 5750. Please try again in 809ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 9.00 / 26 (34.6%):  77%|███████▋  | 27/35 [02:24<01:05,  8.18s/it]

2025/10/03 16:20:26 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Alicia Gräfin is best known for her role in a war film directed by who?', 'answer': 'David Ayer'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on tokens per min (TPM): Limit 200000, Used 200000, Requested 2811. Please try again in 843ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 9.00 / 26 (34.6%):  80%|████████  | 28/35 [02:27<00:45,  6.50s/it]

2025/10/03 16:20:31 ERROR dspy.utils.parallelizer: Error for Example({'question': "How many times has the driver, who won the Nation's Cup with Petter Solberg, in the 2014 Race of Champions, won  the 24 Hours of Le Mans ?", 'answer': 'nine'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on tokens per min (TPM): Limit 200000, Used 197073, Requested 3281. Please try again in 106ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 9.00 / 26 (34.6%):  83%|████████▎ | 29/35 [02:31<00:36,  6.00s/it]

2025/10/03 16:20:53 ERROR dspy.utils.parallelizer: Error for Example({'question': "Are Finance and I'm the Boss! both based on cooperation?", 'answer': 'no'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 9.00 / 26 (34.6%):  86%|████████▌ | 30/35 [02:54<00:54, 10.86s/it]

2025/10/03 16:21:30 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What channel can you watch George Paul Blagden perform on television on in the U.S.?', 'answer': 'Ovation'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 10.00 / 27 (37.0%):  91%|█████████▏| 32/35 [03:41<00:48, 16.09s/it]

2025/10/03 16:21:58 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who directed the 1987 romantic drama dance film that is directly parodied by the television film "Totally Awesome"?', 'answer': 'Emile Ardolino'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 11.00 / 28 (39.3%):  97%|█████████▋| 34/35 [04:18<00:17, 17.35s/it]

2025/10/03 16:22:53 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What Nigerian president appointed the Managing Director of the NPA in 2016?', 'answer': 'Muhammadu Buhari'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 11.00 / 28 (39.3%): 100%|██████████| 35/35 [04:53<00:00,  8.39s/it]

2025/10/03 16:22:53 INFO dspy.evaluate.evaluate: Average Metric: 11.0 / 35 (31.4%)
2025/10/03 16:22:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 31.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 2'].
2025/10/03 16:22:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57, 22.86, 42.86, 51.43, 31.43]
2025/10/03 16:22:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0, 36.25]
2025/10/03 16:22:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:22:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 25 - Minibatch ==



  0%|          | 0/35 [00:00<?, ?it/s]

2025/10/03 16:23:52 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Konnichiwa was launched in a broadcast that was streamed by the platform that was founded in what city in 2010?', 'answer': 'London'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 1.00 / 2 (50.0%):   9%|▊         | 3/35 [01:13<10:08, 19.01s/it]

2025/10/03 16:24:23 ERROR dspy.utils.parallelizer: Error for Example({'question': 'The artist who composed K.365/316a, better known as Piano Concerto No. 10, was the son of whom?', 'answer': 'Leopold and Anna Maria Mozart'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 1.00 / 2 (50.0%):  11%|█▏        | 4/35 [01:30<09:27, 18.30s/it]

2025/10/03 16:24:51 ERROR dspy.utils.parallelizer: Error for Example({'question': 'which city and an important business and cultural centre in northern Italy was Gleb Wataghin born in ', 'answer': 'Turin'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 1.00 / 3 (33.3%):  17%|█▋        | 6/35 [02:31<12:22, 25.59s/it]

2025/10/03 16:25:27 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Which National Football League team was coached by the same coach from the winning team in The Kick?', 'answer': 'Dallas Cowboys'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 3.00 / 13 (23.1%):  49%|████▊     | 17/35 [07:10<07:38, 25.45s/it]

2025/10/03 16:30:31 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who has more letters in their middle name out of Margaret Laurence and Paul Heyse?', 'answer': 'Margaret Laurence'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 6.00 / 19 (31.6%):  69%|██████▊   | 24/35 [09:43<03:19, 18.17s/it]

2025/10/03 16:32:55 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the European Union's name for the artificial sweetener found in Nutrisoda?", 'answer': 'E955'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 7.00 / 20 (35.0%):  74%|███████▍  | 26/35 [10:18<02:38, 17.60s/it]

2025/10/03 16:33:35 ERROR dspy.utils.parallelizer: Error for Example({'question': "What main thoroughfare in Warsaw's borough of Ursynów connects to a 6,500 km international road which coincides with the Trans-Siberian Highway?", 'answer': 'Jana Rosoła street'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 7.00 / 21 (33.3%):  80%|████████  | 28/35 [10:49<01:49, 15.68s/it]

2025/10/03 16:34:10 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What actor born on January 6, 1982 starred in a movie with Romola Garai and Bill Nighy in 2009?', 'answer': 'Edward John David Redmayne'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 7.00 / 21 (33.3%):  83%|████████▎ | 29/35 [11:17<01:56, 19.45s/it]

2025/10/03 16:34:15 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who played the part of the Marquis de Sade in the film for which Martin Childs was nominated for an award at the 74th Academy Awards?', 'answer': 'Geoffrey Rush'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 11.00 / 26 (42.3%): 100%|██████████| 35/35 [13:22<00:00, 22.93s/it]

2025/10/03 16:36:15 INFO dspy.evaluate.evaluate: Average Metric: 11.0 / 35 (31.4%)
2025/10/03 16:36:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 31.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 5'].
2025/10/03 16:36:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57, 22.86, 42.86, 51.43, 31.43, 31.43]
2025/10/03 16:36:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0, 36.25]
2025/10/03 16:36:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:36:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 25 - Minibatch ==



Average Metric: 1.00 / 2 (50.0%):   6%|▌         | 2/35 [00:39<10:05, 18.33s/it]

2025/10/03 16:37:14 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What Australian sportsman died in a suburb of Adelaide in the City of Burnside and the City of Campbelltown?', 'answer': 'Ernest Jones'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 1.00 / 2 (50.0%):   9%|▊         | 3/35 [00:59<09:57, 18.68s/it]

2025/10/03 16:37:15 ERROR dspy.utils.parallelizer: Error for Example({'question': "What opera company has its permanent home as he Staatsoper Unter den Linden and had Franz Betz as an opera singer in the 1800's?", 'answer': 'Berlin State Opera'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 2.00 / 5 (40.0%):  20%|██        | 7/35 [01:30<04:44, 10.16s/it]



Average Metric: 3.00 / 7 (42.9%):  26%|██▌       | 9/35 [01:56<04:43, 10.92s/it]

2025/10/03 16:38:24 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who composed the BQF and afrofuturist movement blueprint with Camae Ayewa?', 'answer': 'Rasheedah Phillips'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 4.00 / 8 (50.0%):  31%|███▏      | 11/35 [02:24<05:05, 12.74s/it]



Average Metric: 5.00 / 10 (50.0%):  37%|███▋      | 13/35 [02:40<03:41, 10.06s/it]



Average Metric: 10.00 / 21 (47.6%):  69%|██████▊   | 24/35 [05:07<02:31, 13.75s/it]

2025/10/03 16:41:33 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Seiko Holdings Corporation was formerly known as which watch and jewlery shop?', 'answer': 'K. Hattori'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 10.00 / 21 (47.6%):  71%|███████▏  | 25/35 [05:17<02:06, 12.69s/it]



Average Metric: 11.00 / 22 (50.0%):  74%|███████▍  | 26/35 [05:35<02:08, 14.23s/it]

2025/10/03 16:42:21 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who is the wife of Charlemagne who is a step mother to Pepin the Hunchback?', 'answer': 'Hildegard'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 12.00 / 24 (50.0%):  83%|████████▎ | 29/35 [06:06<00:57,  9.61s/it]



Average Metric: 15.00 / 28 (53.6%):  94%|█████████▍| 33/35 [07:05<00:27, 13.99s/it]



Average Metric: 16.00 / 29 (55.2%):  97%|█████████▋| 34/35 [07:23<00:15, 15.36s/it]

2025/10/03 16:44:02 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What U.S. court presided over both the Slaughter-House Cases and Reed v. Reed?', 'answer': 'the Supreme Court'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 16.00 / 29 (55.2%): 100%|██████████| 35/35 [07:47<00:00, 13.34s/it]

2025/10/03 16:44:02 INFO dspy.evaluate.evaluate: Average Metric: 16.0 / 35 (45.7%)
2025/10/03 16:44:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 3'].
2025/10/03 16:44:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57, 22.86, 42.86, 51.43, 31.43, 31.43, 45.71]
2025/10/03 16:44:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0, 36.25]
2025/10/03 16:44:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:44:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 25 - Minibatch ==



Average Metric: 17.00 / 32 (53.1%):  91%|█████████▏| 32/35 [04:38<00:29,  9.79s/it]

2025/10/03 16:49:02 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What Australian sportsman died in a suburb of Adelaide in the City of Burnside and the City of Campbelltown?', 'answer': 'Ernest Jones'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 17.00 / 32 (53.1%):  94%|█████████▍| 33/35 [04:59<00:24, 12.38s/it]

2025/10/03 16:49:41 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What year was the prequel to a film made whose sequel was directed by Bradley Raymond in which almost the entire key cast returned for the sequel?', 'answer': '1996'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 17.00 / 34 (50.0%): : 37it [07:10, 11.63s/it]

2025/10/03 16:51:13 INFO dspy.evaluate.evaluate: Average Metric: 17.0 / 35 (48.6%)
2025/10/03 16:51:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 2'].
2025/10/03 16:51:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [17.14, 34.29, 22.86, 34.29, 45.71, 34.29, 40.0, 28.57, 22.86, 42.86, 51.43, 31.43, 31.43, 45.71, 48.57]
2025/10/03 16:51:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0, 36.25]
2025/10/03 16:51:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/10/03 16:51:13 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 25 - Full Evaluation =====
2025/10/03 16:51:13 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 50.0) from minibatch trials...



Average Metric: 8.00 / 13 (61.5%):  16%|█▋        | 13/80 [00:41<04:22,  3.91s/it]

2025/10/03 16:52:12 ERROR dspy.utils.parallelizer: Error for Example({'question': 'This group encompassed a federation of Alpine tribes, including the Calucones, for about 6 centuries starting around 500 BC.  ', 'answer': 'The Raeti'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 10.00 / 18 (55.6%):  24%|██▍       | 19/80 [01:21<03:52,  3.81s/it]



Average Metric: 13.00 / 22 (59.1%):  28%|██▊       | 22/80 [01:38<07:38,  7.90s/it]

2025/10/03 16:53:12 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Apan Amar Apan was a film that had music composed by which Indian composer?', 'answer': 'Rahul Dev Burman'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 13.00 / 24 (54.2%):  31%|███▏      | 25/80 [01:59<05:58,  6.53s/it]

2025/10/03 16:53:14 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Which person is older, Ivan Bella, or Frank De Winne? ', 'answer': 'Frank, Viscount De Winne'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 15.00 / 30 (50.0%):  41%|████▏     | 33/80 [02:39<04:23,  5.62s/it]

2025/10/03 16:54:13 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the same of the city that  is the second largest city municipality in Italy and appeared in The 2010 Giro d'Italia?", 'answer': 'Verona'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 16.00 / 33 (48.5%):  46%|████▋     | 37/80 [03:10<04:31,  6.32s/it]

2025/10/03 16:54:41 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Are the bands Flow and Against the Current from the same country?', 'answer': 'no'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 17.00 / 35 (48.6%):  49%|████▉     | 39/80 [03:27<05:55,  8.67s/it]

2025/10/03 16:54:51 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Seiko Holdings Corporation was formerly known as which watch and jewlery shop?', 'answer': 'K. Hattori'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 24.00 / 44 (54.5%):  61%|██████▏   | 49/80 [04:26<04:42,  9.11s/it]

2025/10/03 16:55:45 ERROR dspy.utils.parallelizer: Error for Example({'question': "How many times has the driver, who won the Nation's Cup with Petter Solberg, in the 2014 Race of Champions, won  the 24 Hours of Le Mans ?", 'answer': 'nine'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 32.00 / 54 (59.3%):  76%|███████▋  | 61/80 [05:47<02:31,  7.96s/it]

2025/10/03 16:57:14 ERROR dspy.utils.parallelizer: Error for Example({'question': "Rectrix Aviation's jet charter business involves what service?", 'answer': 'renting an entire aircraft'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 33.00 / 58 (56.9%):  82%|████████▎ | 66/80 [06:22<01:38,  7.02s/it]



Average Metric: 34.00 / 63 (54.0%):  88%|████████▊ | 70/80 [06:47<00:56,  5.66s/it]



Average Metric: 37.00 / 71 (52.1%):  99%|█████████▉| 79/80 [07:07<00:03,  3.21s/it]



Average Metric: 38.00 / 72 (52.8%): 100%|██████████| 80/80 [08:51<00:00,  6.64s/it]

2025/10/03 17:00:04 INFO dspy.evaluate.evaluate: Average Metric: 38.0 / 80 (47.5%)
2025/10/03 17:00:04 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 47.5
2025/10/03 17:00:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [27.5, 40.0, 36.25, 47.5]
2025/10/03 17:00:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 47.5
2025/10/03 17:00:04 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/10/03 17:00:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 25 - Minibatch ==



Average Metric: 3.00 / 5 (60.0%):  14%|█▍        | 5/35 [00:43<02:09,  4.31s/it]



Average Metric: 3.00 / 8 (37.5%):  23%|██▎       | 8/35 [02:17<08:43, 19.37s/it]

2025/10/03 17:02:45 ERROR dspy.utils.parallelizer: Error for Example({'question': "Hae Min Lee's body was found in a park that covers how many acres ?", 'answer': '1216 acre'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 3.00 / 8 (37.5%):  26%|██▌       | 9/35 [02:40<08:55, 20.59s/it]

2025/10/03 17:03:21 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What was the gang merged into by the gang leader born on November 30, 1950?', 'answer': 'Black Gangster Disciple Nation (BGDN)'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 3.00 / 8 (37.5%):  29%|██▊       | 10/35 [03:16<10:33, 25.32s/it]

2025/10/03 17:03:38 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the name of the song, originally performed by Taylor Swift, that came before Kendrick Lamar\'s second number-one single, "Humble"?', 'answer': 'Bad Blood'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 3.00 / 8 (37.5%):  31%|███▏      | 11/35 [03:33<09:07, 22.80s/it]

2025/10/03 17:03:55 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the same of the city that  is the second largest city municipality in Italy and appeared in The 2010 Giro d'Italia?", 'answer': 'Verona'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 3.00 / 8 (37.5%):  34%|███▍      | 12/35 [03:51<08:09, 21.27s/it]

2025/10/03 17:04:20 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who has more letters in their middle name out of Margaret Laurence and Paul Heyse?', 'answer': 'Margaret Laurence'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 5.00 / 12 (41.7%):  49%|████▊     | 17/35 [04:48<04:29, 14.98s/it]

2025/10/03 17:06:10 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the European Union's name for the artificial sweetener found in Nutrisoda?", 'answer': 'E955'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 5.00 / 12 (41.7%):  51%|█████▏    | 18/35 [06:05<09:31, 33.61s/it]

2025/10/03 17:06:34 ERROR dspy.utils.parallelizer: Error for Example({'question': "Where does the the mountain pygmy possum found which is also the mainland Australia's highest peak?", 'answer': 'Mount Kosciuszko'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 6.00 / 15 (40.0%):  63%|██████▎   | 22/35 [07:24<04:59, 23.04s/it]

2025/10/03 17:07:52 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Which French car manufacturer supplied the technology for the Burton?', 'answer': 'Citroën'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 6.00 / 15 (40.0%):  66%|██████▌   | 23/35 [07:48<04:38, 23.18s/it]

2025/10/03 17:08:07 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Caesars Palace is located between The Mirage and a resort casino owned by what company?', 'answer': 'MGM Resorts International'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 7.00 / 16 (43.8%):  71%|███████▏  | 25/35 [08:34<03:57, 23.78s/it]

2025/10/03 17:09:09 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What type of person does Prime Minister of Hungary and Viktor Orbán have in common?', 'answer': 'leader'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 7.00 / 16 (43.8%):  91%|█████████▏| 32/35 [09:04<00:51, 17.02s/it]

2025/10/03 17:09:09 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/dspy/teleprompt/utils.py", line 56, in eval_candidate_program
    return evaluate(
           ^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/dspy/utils/callback.py", line 326, in sync_wrapper
    return fn(instance, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/dspy/evaluate/evaluate.py", line 162, in __call__
    results = executor.execute(process_item, devset)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/dspy/utils/parallelizer.py", line 48, in execute
    return self._execute_parallel(wrapped, data)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/dspy/utils/parallelizer.py", line 203, in _execute_parallel
    raise Exception("Execution cance


Average Metric: 9.00 / 15 (60.0%):  43%|████▎     | 15/35 [00:11<00:00, 27.19it/s]

2025/10/03 17:09:33 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Clinton B. Seely has translated the works of a Shakta poet and saint of eighteenth century Bengal, whose poems are usually addressed to what Hindu goddess?', 'answer': 'Kali'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.
2025/10/03 17:10:38 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What type of person does Prime Minister of Hungary and Viktor Orbán have in common?', 'answer': 'leader'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests pe

Average Metric: 10.00 / 16 (62.5%):  46%|████▌     | 16/35 [01:29<03:50, 12.15s/it]



Average Metric: 11.00 / 20 (55.0%):  60%|██████    | 21/35 [02:21<02:50, 12.20s/it]

2025/10/03 17:11:41 ERROR dspy.utils.parallelizer: Error for Example({'question': "What role does Ghanashyam Nayak play in India's longest running sitcom serial?", 'answer': 'Natwarlal Prabhashankar Undhaiwala'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 14.00 / 25 (56.0%):  71%|███████▏  | 25/35 [02:49<01:17,  7.79s/it]



Average Metric: 17.00 / 31 (54.8%):  91%|█████████▏| 32/35 [03:11<00:11,  3.67s/it]

2025/10/03 17:12:37 ERROR dspy.utils.parallelizer: Error for Example({'question': "How many times has the driver, who won the Nation's Cup with Petter Solberg, in the 2014 Race of Champions, won  the 24 Hours of Le Mans ?", 'answer': 'nine'}) (input_keys={'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o-mini in organization org-8PmqQww7DKKRrI6VcljnZDsz on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 17.00 / 33 (51.5%): 100%|██████████| 35/35 [03:59<00:00,  8.91s/it]

An informal run like this raises ReAct's score from 24% to 51%, by teaching gpt-4o-mini more about the specifics of the task.

## Optimising Prompts for a RAG Agent

In [None]:
# Optimising Prompts for a RAG Agent

class RAG(dspy.Module):
    """Retrieval-augmented question answering: fetch passages, then answer over them.

    Args:
        num_docs: Number of passages requested from ``search`` for each question.
    """

    def __init__(self, num_docs=5):
        # Initialise dspy.Module's own state explicitly before attaching
        # sub-modules, rather than relying on the framework to do it implicitly.
        super().__init__()
        self.num_docs = num_docs
        self.respond = dspy.ChainOfThought("context, question -> response")

    def forward(self, question):
        """Retrieve context for ``question`` and return the responder's prediction.

        Args:
            question: The question passed both to ``search`` and to the responder.

        Returns:
            Whatever ``self.respond`` returns for the retrieved context and question.
        """
        # NOTE(review): `search` is not defined in this cell; the original note says it
        # is defined in a linked tutorial, so it must already exist in the notebook
        # namespace before this module is called.
        context = search(question, k=self.num_docs)
        return self.respond(context=context, question=question)

# Optimise the RAG program's prompts with MIPROv2, scoring with SemanticF1.
# An earlier variant of this call used num_threads=24; this one runs with 4.
tp = dspy.MIPROv2(
    metric=dspy.evaluate.SemanticF1(decompositional=True),
    auto="medium",
    num_threads=4,
)
optimized_rag = tp.compile(
    RAG(),
    trainset=trainset,
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
)

## Optimising Weights for Classification

In [None]:
import random
from typing import Literal

from datasets import load_dataset

import dspy
from dspy.datasets import DataLoader

# Banking77 class names, read from the "label" feature of the train split.
CLASSES = load_dataset(
    "PolyAI/banking77",
    split="train",
    trust_remote_code=True,
).features["label"].names

# Keyword arguments forwarded to DataLoader.from_huggingface below.
kwargs = dict(
    fields=("text", "label"),
    input_keys=("text",),
    split="train",
    trust_remote_code=True,
)

# Keep the first 2000 loaded examples. Each *training* example gets its class
# name as both `label` and `hint`, with `text` and `hint` marked as inputs.
trainset = [
    dspy.Example(
        row,
        hint=CLASSES[row.label],
        label=CLASSES[row.label],
    ).with_inputs("text", "hint")
    for row in DataLoader().from_huggingface(dataset_name="PolyAI/banking77", **kwargs)[:2000]
]
# Shuffle in place with a fixed seed so the order is the same on every run.
random.Random(0).shuffle(trainset)

In [None]:
import dspy
# lm=dspy.LM('openai/gpt-4o-mini-2024-07-18')

# Classification module: chain-of-thought over "text, hint -> label", with
# `label` constrained to the Banking77 class names. It will use the hint at
# training time, if available.
signature = dspy.Signature("text, hint -> label").with_updated_fields(
    "label",
    type_=Literal[tuple(CLASSES)],
)
classify = dspy.ChainOfThought(signature)
classify.set_lm(lm)

# Optimise via BootstrapFinetune; the metric is an exact match on `label`.
optimizer = dspy.BootstrapFinetune(
    metric=lambda example, pred, trace=None: example.label == pred.label,
    num_threads=24,
)
optimized = optimizer.compile(classify, trainset=trainset)

# Try the optimised classifier on a single query (no hint supplied).
optimized(text="What does a pending cash withdrawal mean?")

# For a complete fine-tuning tutorial, see: https://dspy.ai/tutorials/classification_finetuning/

An informal run similar to this on DSPy 2.5.29 raises GPT-4o-mini's score from 66% to 87%.