In [1]:
import dspy
lm = dspy.LM('ollama_chat/llama3.1:8b', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
lm("Say this is a test!", temperature=0.7)  # => ['This is a test!']
lm(messages=[{"role": "user", "content": "Say this is a test!"}])  # => ['This is a test!']

['It looks like you\'re referencing the classic phrase from the TV show "The Twilight Zone"!\n\n"You\'re traveling through another dimension, a dimension not only of sight and sound but of mind. A journey into a wondrous land whose boundaries are that of imagination. Your next stop: The Twilight Zone."\n\nBut I\'ll play along! Say what?']

In [3]:
# Define a module (ChainOfThought) and assign it a signature (return an answer, given a question).
qa = dspy.ChainOfThought('question -> answer')

# Run with the default LM configured with `dspy.configure` above.
response = qa(question="How many floors are in the castle David Gregory inherited?")
print(response.answer)

Unfortunately, I couldn't find any reliable information about the number of floors in David Gregory's castle.


In [4]:
print(len(lm.history))  # e.g., 3 calls to the LM

lm.history[-1].keys()  # access the last call to the LM, with all metadata

3


dict_keys(['prompt', 'messages', 'kwargs', 'response', 'outputs', 'usage', 'cost', 'timestamp', 'uuid', 'model', 'model_type'])

In [5]:
sentence = "it's a charming and often affecting journey."  # example from the SST-2 dataset.
sentence2 = "I hate going to school and I don't like my teacher."  # example from the SST-2 dataset.

classify = dspy.Predict('sentence -> sentiment: bool')  # we'll see an example with Literal[] later
classify(sentence=sentence2).sentiment

True

In [6]:
# Example from the XSum dataset.
document = """The 21-year-old made seven appearances for the Hammers and netted his only goal for them in a Europa League qualification round match against Andorran side FC Lustrains last season. Lee had two loan spells in League One last term, with Blackpool and then Colchester United. He scored twice for the U's but was unable to save them from relegation. The length of Lee's contract with the promoted Tykes has not been revealed. Find all the latest football transfers on our dedicated page."""

summarize = dspy.ChainOfThought('document -> summary')
response = summarize(document=document)

print(response.summary)

Lee joins Barnsley from West Ham after making 7 appearances and scoring 1 goal in the Europa League qualification round. He had loan spells at Blackpool and Colchester United last season.


In [7]:
print("Reasoning:", response.reasoning)

Reasoning: The document appears to be a news article about a football player named Lee who has joined a new team called Barnsley. The article mentions his previous appearances and goals for West Ham, as well as his loan spells at Blackpool and Colchester United.


In [8]:
# NOTE(review): this short text does not look like an XSum article; the dataset
# attribution that used to sit here appears to have been copied from the previous cell.
# The cell feeds a trivial, implausible document to the same summarizer signature.
document = """This is a car. the car is flying"""

# Same 'document -> summary' ChainOfThought declaration as in the previous summarization cell.
summarize = dspy.ChainOfThought('document -> summary')
response = summarize(document=document)

print(response.summary)

The document describes a car that defies physical laws by flying, which is an unrealistic scenario.


In [9]:
from typing import Literal

# Class-based signature for emotion classification. The docstring below is left
# untouched because it is part of the signature definition, not just documentation.
class Emotion(dspy.Signature):
    """Classify emotion."""

    # Input: the sentence to label.
    sentence: str = dspy.InputField()
    # Output: annotated as exactly one of the six emotion labels listed in the Literal.
    sentiment: Literal['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'] = dspy.OutputField()

sentence = "i started feeling a little vulnerable when the giant spotlight started blinding me"  # from dair-ai/emotion

classify = dspy.Predict(Emotion)
classify(sentence=sentence)

Prediction(
    sentiment='fear'
)

In [10]:
# Class-based signature for checking a statement against a reference context.
# The docstring below is part of the signature definition and is left untouched.
class CheckCitationFaithfulness(dspy.Signature):
    """Verify that the text is based on the provided context."""

    # Inputs: the reference context (described to the LM as assumed-true facts)
    # and the text whose support is being checked.
    context: str = dspy.InputField(desc="facts here are assumed to be true")
    text: str = dspy.InputField()
    # Outputs: a boolean verdict, plus a mapping from string keys to lists of
    # supporting strings.
    faithfulness: bool = dspy.OutputField()
    evidence: dict[str, list[str]] = dspy.OutputField(desc="Supporting evidence for claims")

context = "The 21-year-old made seven appearances for the Hammers and netted his only goal for them in a Europa League qualification round match against Andorran side FC Lustrains last season. Lee had two loan spells in League One last term, with Blackpool and then Colchester United. He scored twice for the U's but was unable to save them from relegation. The length of Lee's contract with the promoted Tykes has not been revealed. Find all the latest football transfers on our dedicated page."

text = "Lee scored 3 goals for Colchester United."

faithfulness = dspy.ChainOfThought(CheckCitationFaithfulness)
faithfulness(context=context, text=text)

Prediction(
    reasoning="The statement that Lee scored 3 goals for Colchester United is not supported by the provided context. The text only mentions that he scored twice for the U's.",
    faithfulness=False,
    evidence={'text': ['Lee had two loan spells in League One last term, with Blackpool and then Colchester United.', "He scored twice for the U's but was unable to save them from relegation."]}
)

In [11]:
sentence = "it's a boring and often revolting journey."  # example from the SST-2 dataset.

# 1) Declare with a signature.
classify = dspy.Predict('sentence -> sentiment: bool')

# 2) Call with input argument(s). 
response = classify(sentence=sentence)

# 3) Access the output.
print(response.sentiment)

True


In [12]:
question = "What's something not great about the ColBERT retrieval model?"

# NOTE: the multi-completion variant (passing `n = 5`, commented out below) is marked as
# currently NOT WORKING in this setup, so the module is declared without `n` instead.


# 1) Declare with a signature, and pass some config.
# classify = dspy.ChainOfThought('question -> answer', n = 5)
classify = dspy.ChainOfThought('question -> answer')

# 2) Call with input argument.
response = classify(question=question)

# 3) Access the outputs.
response.completions.answer

['One potential drawback of ColBERT is its high computational cost and reliance on pre-trained language models.']

In [13]:
print(f"Reasoning: {response.reasoning}")
print(f"Answer: {response.answer}")

Reasoning: ColBERT is a contextualized retrieval model that has shown state-of-the-art performance in various question-answering tasks. However, one potential drawback of ColBERT is its high computational cost due to the need to compute dense vector representations for each query and document pair. This can make it challenging to deploy in resource-constrained environments or when dealing with large-scale datasets.

Additionally, ColBERT relies on a pre-trained language model (e.g., BERT) as its encoder, which may not always be available or suitable for specific tasks or domains. This could limit the model's applicability and flexibility.
Answer: One potential drawback of ColBERT is its high computational cost and reliance on pre-trained language models.


In [15]:
# response.completions[3].reasoning == response.completions.reasoning[3]

In [14]:
math = dspy.ChainOfThought("question -> answer: float")
math(question="Two dice are tossed. What is the probability that the sum equals two?")

Prediction(
    reasoning='To find the probability that the sum equals two when two dice are tossed, we need to count the number of favorable outcomes (i.e., the sum is 2) and divide it by the total number of possible outcomes. The only way for the sum to be 2 is if both dice show a 1. There are 6 possible outcomes for each die (1, 2, 3, 4, 5, 6), so there are 6 * 6 = 36 total possible outcomes when two dice are tossed. Since only one of these outcomes results in a sum of 2 (i.e., both dice showing a 1), the probability is 1/36.',
    answer=0.027777777777777776
)

In [15]:
def search(query: str) -> list[str]:
    """Retrieves abstracts from Wikipedia.

    Queries the ColBERTv2 endpoint for the top-3 hits and returns the `text`
    field of each hit, in the order the endpoint returned them.
    """
    retriever = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
    hits = retriever(query, k=3)

    passages = []
    for hit in hits:
        passages.append(hit['text'])
    return passages

rag = dspy.ChainOfThought('context, question -> response')

question = "What's the name of the castle that David Gregory inherited?"
rag(context=search(question), question=question)

Prediction(
    reasoning='The question asks about the castle inherited by David Gregory. The provided context mentions that David Gregory "inherited Kinnairdy Castle in 1664." Therefore, the answer to the question is the name of the castle mentioned.',
    response='Kinnairdy Castle'
)

In [16]:
from typing import Literal

# Class-based signature for three-way sentiment classification with a confidence value.
# The docstring below is part of the signature definition and is left untouched.
class Classify(dspy.Signature):
    """Classify sentiment of a given sentence."""

    # Input: the sentence to classify.
    sentence: str = dspy.InputField()
    # Outputs: one of the three labels in the Literal, and a float confidence.
    # NOTE(review): no range is declared for `confidence` - presumably 0.0-1.0; confirm.
    sentiment: Literal['positive', 'negative', 'neutral'] = dspy.OutputField()
    confidence: float = dspy.OutputField()

classify = dspy.Predict(Classify)
classify(sentence="This book was super fun to read, though not the last chapter.")

Prediction(
    sentiment='neutral',
    confidence=0.5
)

In [17]:
text = "Apple Inc. announced its latest iPhone 14 today. The CEO, Tim Cook, highlighted its new features in a press release."

module = dspy.Predict("text -> title, headings: list[str], entities_and_metadata: list[dict[str, str]]")
response = module(text=text)

print(response.title)
print(response.headings)
print(response.entities_and_metadata)

Apple Inc. iPhone 14 Announcement
['Announcement', 'Press Release']
[{'entity': 'Company', 'name': 'Apple Inc.'}, {'entity': 'Person', 'name': 'Tim Cook'}, {'entity': 'Product', 'name': 'iPhone 14'}]


In [18]:
def evaluate_math(expression: str) -> float:
    """Execute `expression` with a fresh `dspy.PythonInterpreter` and return its result."""
    interpreter = dspy.PythonInterpreter({})
    return interpreter.execute(expression)

def search_wikipedia(query: str) -> list[str]:
    """Search Wikipedia abstracts and return the text of the top-3 results for `query`.

    The return annotation is `list[str]` (one string per retrieved passage); the
    previous `str` annotation did not match the list that is actually returned.
    """
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=3)
    return [x['text'] for x in results]

react = dspy.ReAct("question -> answer: float", tools=[evaluate_math, search_wikipedia])

pred = react(question="What is 9362158 divided by the year of birth of David Gregory of Kinnairdy castle?")
print(pred.answer)

5761.328


In [19]:
qa_pair = dspy.Example(question="This is a question?", answer="This is an answer.")

print(qa_pair)
print(qa_pair.question)
print(qa_pair.answer)

Example({'question': 'This is a question?', 'answer': 'This is an answer.'}) (input_keys=None)
This is a question?
This is an answer.


In [20]:
# Single Input.
print(qa_pair.with_inputs("question"))

# Multiple Inputs; be careful about marking your labels as inputs unless you mean it.
print(qa_pair.with_inputs("question", "answer"))

Example({'question': 'This is a question?', 'answer': 'This is an answer.'}) (input_keys={'question'})
Example({'question': 'This is a question?', 'answer': 'This is an answer.'}) (input_keys={'answer', 'question'})


In [21]:
trainset = [dspy.Example(report="LONG REPORT 1", summary="short summary 1")]

In [22]:
article_summary = dspy.Example(article= "This is an article.", summary= "This is a summary.").with_inputs("article")

input_key_only = article_summary.inputs()
non_input_key_only = article_summary.labels()

print("Example object with Input fields only:", input_key_only)
print("Example object with Non-Input fields only:", non_input_key_only)

Example object with Input fields only: Example({'article': 'This is an article.'}) (input_keys={'article'})
Example object with Non-Input fields only: Example({'summary': 'This is a summary.'}) (input_keys=None)


THE CELLS BELOW COVER METRICS

In [23]:
def validate_answer(example, pred, trace=None):
    """Return True when the gold and predicted answers match, ignoring case.

    `trace` is accepted for metric-signature compatibility and is not used.
    """
    gold = example.answer.lower()
    guess = pred.answer.lower()
    return gold == guess

In [24]:
def validate_context_and_answer(example, pred, trace=None):
    """Score a prediction on answer correctness and on grounding in its context.

    Args:
        example: gold example; its `answer` string is read.
        pred: prediction; its `answer` string and its `context` (an iterable of
            strings) are read.
        trace: None during evaluation/optimization; any other value signals
            bootstrapping.

    Returns:
        With `trace` None, the mean of the two checks as a float (0.0, 0.5 or 1.0).
        Otherwise a bool that is True only when both checks pass.
    """
    predicted = pred.answer.lower()

    # check the gold label and the predicted answer are the same
    answer_match = example.answer.lower() == predicted

    # check the predicted answer comes from one of the retrieved contexts;
    # the context is lower-cased too, otherwise a lower-cased answer can never be
    # found inside a context that contains capital letters
    context_match = any(predicted in c.lower() for c in pred.context)

    if trace is None:  # if we're doing evaluation or optimization
        return (answer_match + context_match) / 2.0
    # if we're doing bootstrapping, i.e. self-generating good demonstrations of each step
    return answer_match and context_match

In [None]:
# Manual evaluation loop: run the program on every dev example and collect one score each.
# NOTE(review): `devset`, `program` and `metric` must already be defined when this runs;
# this cell has no execution count, so it appears not to have been run in this session.
scores = []
for x in devset:
    # Pass only the fields marked as inputs to the program.
    pred = program(**x.inputs())
    score = metric(x, pred)
    scores.append(score)

In [None]:
from dspy.evaluate import Evaluate

# Set up the evaluator, which can be re-used in your code.
evaluator = Evaluate(devset=YOUR_DEVSET, num_threads=1, display_progress=True, display_table=5)

# Launch evaluation.
evaluator(YOUR_PROGRAM, metric=YOUR_METRIC)

In [25]:
# Define the signature for automatic assessments.
class Assess(dspy.Signature):
    """Assess the quality of a tweet along the specified dimension."""

    # Inputs (declared without type annotations): the text being judged and the
    # yes/no question to judge it by.
    assessed_text = dspy.InputField()
    assessment_question = dspy.InputField()
    # Output: the verdict, annotated as a bool.
    assessment_answer: bool = dspy.OutputField()

In [26]:
def metric(gold, pred, trace=None):
    """LM-judged metric for a generated tweet.

    Two `Assess` predictions are made on `pred.output`: whether it contains the
    gold answer and whether it is engaging. The raw score is the sum of the two
    verdicts, but only when the tweet is judged correct and is at most 280
    characters long; otherwise it is 0.

    Returns:
        `score / 2.0` when `trace` is None, else the bool `score >= 2`.
    """
    question = gold.question
    answer = gold.answer
    tweet = pred.output

    engaging_question = "Does the assessed text make for a self-contained, engaging tweet?"
    correct_question = f"The text should answer `{question}` with `{answer}`. Does the assessed text contain this answer?"

    # Correctness is judged first, then engagement (same call order as before).
    correct_pred = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=correct_question)
    engaging_pred = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=engaging_question)

    is_correct = correct_pred.assessment_answer
    is_engaging = engaging_pred.assessment_answer

    if is_correct and (len(tweet) <= 280):
        score = is_correct + is_engaging
    else:
        score = 0

    if trace is None:
        return score / 2.0
    return score >= 2

In [27]:
def validate_hops(example, pred, trace=None):
    """Check the search queries recorded in a trace for length and repetition.

    Args:
        example: its `question` string is used as the first "hop".
        pred: unused.
        trace: iterable of tuples whose last element is an outputs object; the
            `query` of every outputs object containing 'query' becomes a hop.
            None (the default) is treated as an empty trace instead of raising
            TypeError.

    Returns:
        False if any hop is longer than 100 characters, or if
        `dspy.evaluate.answer_exact_match_str` (frac=0.8) matches a hop from
        index 2 onwards against the hops before it; True otherwise.
    """
    # `trace` is None outside bootstrapping; fall back to "no recorded steps".
    steps = trace if trace is not None else []
    hops = [example.question] + [outputs.query for *_, outputs in steps if 'query' in outputs]

    if max(len(h) for h in hops) > 100:
        return False

    # Starts at index 2, so the first query is never compared against the question alone.
    if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))):
        return False

    return True

WHAT REMAINS IS OPTIMISERS: PICK ONE AND IMPROVE ON THE DATASET

IT IS POSSIBLE TO CONNECT TO ChromaDB AS FOLLOWS

In [None]:
from dspy.retrieve.chromadb_rm import ChromadbRM
import os
import openai
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

embedding_function = OpenAIEmbeddingFunction(
    api_key=os.environ.get('OPENAI_API_KEY'),
    model_name="text-embedding-ada-002"
)

retriever_model = ChromadbRM(
    'your_collection_name',
    '/path/to/your/db',
    embedding_function=embedding_function,
    k=5
)

results = retriever_model("Explore the significance of quantum computing", k=5)

for result in results:
    print("Document:", result.long_text, "\n")

In [28]:
! ollama stop llama3.1:8b

[?25l[?25l[?25h[2K[1G[?25h

### RAG

In [29]:
import dspy
lm = dspy.LM('ollama_chat/llama3.1:8b', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [30]:
qa = dspy.Predict('question: str -> response: str')
response = qa(question="what are high memory and low memory on linux?")

print(response.response)

On Linux, "high memory" and "low memory" refer to the kernel's perception of available memory. 

High memory: This is the amount of RAM that is free and can be used by applications without any issues. It's also known as "free memory." When an application requests memory from the kernel, it checks if there's enough high memory available. If there is, the kernel allocates the requested memory to the application.

Low memory: This refers to the amount of RAM that's free but not immediately available for use by applications due to various reasons such as:

- The kernel has reserved some memory for its own use (e.g., for caching).
- Memory is being used by the disk cache, which can be reclaimed if needed.
- Some memory might be allocated to devices or other system components.

When an application requests a large amount of memory and there's not enough high memory available, the kernel may start using low memory. This can lead to performance issues as the kernel has to swap data between RAM

In [31]:
dspy.inspect_history(n=1)





[34m[2024-12-23T21:39:16.121334][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str)

Your output fields are:
1. `response` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## question ## ]]
what are high memory and low memory on linux?

Respond with the corresponding output fields, starting with the field `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## response ## ]]
On Linux, "high memory" and "low memory" refer to the kernel's perception of available memory. 

High memory: This is the amount of RAM that is free and can be used by applications without any issues. It's also known as "free memory." When

In [32]:
cot = dspy.ChainOfThought('question -> response')
cot(question="should curly braces appear on their own line?")

Prediction(
    reasoning='The input format should have the curly braces surrounding the value on the same line.',
    response='I will follow this format from now on. Please provide the question as a string in the field labeled "question".'
)

In [33]:
import ujson
from dspy.utils import download

# Download question--answer pairs from the RAG-QA Arena "Tech" dataset.
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl")

with open("ragqa_arena_tech_examples.jsonl") as f:
    data = [ujson.loads(line) for line in f]

Downloading 'ragqa_arena_tech_examples.jsonl'...


In [34]:
# Inspect one datapoint.
data[0]

{'question': 'why igp is used in mpls?',
 'response': "An IGP exchanges routing prefixes between gateways/routers.  \nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.",
 'gold_doc_ids': [2822, 2823]}

In [35]:
data = [dspy.Example(**d).with_inputs('question') for d in data]

# Let's pick an `example` here from the data.
example = data[2]
example

Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})

In [36]:
import random

random.Random(0).shuffle(data)
trainset, devset, testset = data[:200], data[200:500], data[500:1000]

len(trainset), len(devset), len(testset)

(200, 300, 500)

In [37]:
from dspy.evaluate import SemanticF1

# Instantiate the metric.
metric = SemanticF1(decompositional=True)

# Produce a prediction from our `cot` module, using the `example` above as input.
pred = cot(**example.inputs())

# Compute the metric score for the prediction.
score = metric(example, pred)

print(f"Question: \t {example.question}\n")
print(f"Gold Response: \t {example.response}\n")
print(f"Predicted Response: \t {pred.response}\n")
print(f"Semantic F1 Score: {score:.2f}")

Question: 	 why are my text messages coming up as maybe?

Gold Response: 	 This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". 

However, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.

Predicted Response: 	 To resolve this issue, try checking with the recipient directly to see if they received your message. If you're still having trouble, consider using a different messaging app that offers more reliable delivery confirmation features.

Semantic F1 Score: 0.00


In [38]:
dspy.inspect_history(n=1)





[34m[2024-12-23T21:43:03.019907][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str)
2. `ground_truth` (str)
3. `system_response` (str)

Your output fields are:
1. `reasoning` (str)
2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth
3. `system_response_key_ideas` (str): enumeration of key ideas in the system response
4. `discussion` (str): discussion of the overlap between ground truth and system response
5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response
6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## ground_truth ## ]]
{ground_truth}

[[ ## system_response ## ]]
{system_response}

[[ ## reasoning ## ]]
{reasoning}

[[ ## ground_truth_key_ideas ## ]]
{ground_truth_key_ideas}

[[ ## system_response_key_ideas ## ]

In [39]:
# Define an evaluator that we can re-use.
evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=24,
                         display_progress=True, display_table=2)

# Evaluate the Chain-of-Thought program.
evaluate(cot)

Average Metric: 182.50 / 300 (60.8%): 100%|██████████| 300/300 [17:16<00:00,  3.45s/it]

2024/12/23 22:00:58 INFO dspy.evaluate.evaluate: Average Metric: 182.50242816809165 / 300 (60.8%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,"when to use c over c++, and c++ over c?","If you are equally familiar with both C++ and C, it's advisable to...",[733],"When deciding between C and C++, consider the following factors: *...","The decision between using C over C++, and C++ over C, depends on ...",✔️ [0.667]
1,should images be stored in a git repository?,"One viewpoint expresses that there is no significant downside, esp...","[6253, 6254, 6275, 6278, 8215]",Images should not be stored in a Git repository. This is because i...,"No, images should not be stored in a Git repository. Consider usin...",✔️ [0.667]


60.83

In [40]:
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl")

Downloading 'ragqa_arena_tech_corpus.jsonl'...


In [41]:
max_characters = 6000  # for truncating >99th percentile of documents
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

# Read the corpus file line by line (one JSON object per line), keeping only the
# `text` field of each record, truncated to `max_characters`.
with open("ragqa_arena_tech_corpus.jsonl") as f:
    corpus = [ujson.loads(line)['text'][:max_characters] for line in f]
    print(f"Loaded {len(corpus)} documents. Will encode them below.")

# NOTE: unlike the local Ollama LM configured for this section, the embedder targets an
# OpenAI model. The recorded run of this cell failed with an AuthenticationError because
# OPENAI_API_KEY was not set, so that variable must be exported before running it.
# NOTE(review): this rebinds the name `search` (a plain function earlier in the notebook)
# to a retriever object; the RAG module below relies on `search(question).passages`.
embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)

Loaded 28436 documents. Will encode them below.


AuthenticationError: litellm.AuthenticationError: AuthenticationError: OpenAIException - The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
class RAG(dspy.Module):
    """Retrieval-augmented responder: retrieve passages, then answer with ChainOfThought.

    Relies on a module-level `search` callable whose result exposes `.passages`.
    """

    def __init__(self):
        # Initialise the dspy.Module base class before attaching sub-modules, so
        # base-class state does not depend on the library doing it implicitly.
        super().__init__()
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        """Retrieve context for `question` and return the ChainOfThought prediction."""
        context = search(question).passages
        return self.respond(context=context, question=question)

In [None]:
rag = RAG()
rag(question="what are high memory and low memory on linux?")

In [None]:
dspy.inspect_history()

In [None]:
evaluate(RAG())

In [None]:
tp = dspy.MIPROv2(metric=metric, auto="medium", num_threads=24)  # use fewer threads if your rate limit is small

optimized_rag = tp.compile(RAG(), trainset=trainset,
                           max_bootstrapped_demos=2, max_labeled_demos=2,
                           requires_permission_to_run=False)

In [None]:
baseline = rag(question="cmd+tab does not work on hidden or minimized windows")
print(baseline.response)

In [None]:
pred = optimized_rag(question="cmd+tab does not work on hidden or minimized windows")
print(pred.response)

In [None]:
evaluate(optimized_rag)

In [None]:
# Total cost in USD, as calculated by LiteLLM for certain providers;
# history entries whose cost is None are skipped.
cost = sum(
    entry['cost']
    for entry in lm.history
    if entry['cost'] is not None
)

In [None]:
optimized_rag.save("optimized_rag.json")

loaded_rag = RAG()
loaded_rag.load("optimized_rag.json")

loaded_rag(question="cmd+tab does not work on hidden or minimized windows")

### AGENT

In [1]:
import dspy
lm = dspy.LM('ollama_chat/llama3.1:8b', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import random
from dspy.datasets import DataLoader

# Fields to load from the HoVer dataset; only `claim` is marked as an input key.
kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(dataset_name="hover-nlp/hover", split="train", trust_remote_code=True, **kwargs)

# Keep only 3-hop claims, and only the first claim seen for each `hpqa_id`.
# `set.add` returns None, so `not hpqa_ids.add(...)` is always True; it is there purely
# to record the id as a side effect once the two preceding conditions have passed.
# Titles are de-duplicated through a set, so their order in the list is not guaranteed.
hpqa_ids = set()
hover = [
    dspy.Example(claim=x.claim, titles=list(set([y["key"] for y in x.supporting_facts]))).with_inputs("claim")
    for x in hover
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids and not hpqa_ids.add(x["hpqa_id"])
]

# Fixed-seed shuffle (in place), then slice into splits.
# Note that items at indices 200-649 are not assigned to any split.
random.Random(0).shuffle(hover)
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

In [3]:
example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Academy Award for Best Director', 'Chris Noonan', 'Miss Potter']


In [4]:
DOCS = {}

def search(query: str, k: int) -> list[str]:
    """Fetch the top-`k` passages for `query` and cache each one in `DOCS`.

    Every passage is expected to look like "title | text"; the part before the
    first " | " becomes the `DOCS` key and the remainder its value. The full
    passages (title included) are returned.
    """
    retriever = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
    passages = [hit['text'] for hit in retriever(query, k=k)]

    for passage in passages:
        title, text = passage.split(" | ", 1)
        DOCS[title] = text

    return passages

In [5]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""

    retrieved = search(query, 30)
    head, tail = retrieved[:5], retrieved[5:30]

    # Lower-ranked hits are reduced to their back-quoted title (text before the first " | ").
    tail_titles = []
    for passage in tail:
        tail_titles.append(f"`{passage.split(' | ')[0]}`")

    return head + [f"Other retrieved pages have titles: {', '.join(tail_titles)}."]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""

    # Fast path: the page was already cached by an earlier `search` call.
    if title in DOCS:
        return DOCS[title]

    # Otherwise search for the title and keep only passages that belong to exactly
    # this page, i.e. those starting with "<title> | ".
    prefix = title + " | "
    results = [x for x in search(title, 10) if x.startswith(prefix)]
    if not results:
        return f"No Wikipedia page found for title: {title}"

    # Drop the "<title> | " prefix so this branch returns the page text only,
    # consistent with the cached branch above (previously it returned "title | text").
    return results[0][len(prefix):]

In [6]:
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

In [7]:
react(claim="David Gregory was born in 1625.").titles[:3]

['David Gregory (physician)', 'Early Life']

In [8]:
def top5_recall(example, pred, trace=None):
    """Fraction of the gold titles that appear among the first five predicted titles.

    Returns the recall as a float when `trace` is None (inference), and the bool
    `recall >= 1.0` otherwise (bootstrapping for optimization).
    """
    expected = example.titles

    hits = 0
    for title in expected:
        if title in pred.titles[:5]:
            hits += 1
    recall = hits / len(expected)

    # If we're just doing inference, just measure the recall.
    if trace is None:
        return recall

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    return recall >= 1.0

evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=True, display_table=5)

In [None]:
def safe_react(claim: str):
    """Run `react` on `claim`, falling back to an empty prediction on any error.

    The fallback keeps a full evaluation run going when a single example fails,
    but the failure is logged so a low score can be told apart from a crash.
    """
    import logging

    try:
        return react(claim=claim)
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "react failed for claim %r (%s: %s); returning empty titles",
            claim, type(exc).__name__, exc,
        )
        return dspy.Prediction(titles=[])

evaluate(safe_react)

  0%|          | 0/100 [00:00<?, ?it/s]



In [None]:
# USE A LARGER MODEL HERE TO IMPROVE PERFORMANCE AND IMPROVE PROMPTS

kwargs = dict(teacher_settings=dict(lm=lm), prompt_model=lm, max_errors=999)

tp = dspy.MIPROv2(metric=top5_recall, auto="medium", num_threads=16, **kwargs)
optimized_react = tp.compile(react, trainset=trainset, max_bootstrapped_demos=3, max_labeled_demos=0)

In [None]:
evaluate(optimized_react)

In [None]:
optimized_react(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles

In [None]:
dspy.inspect_history(n=2)

In [None]:
optimized_react.save("optimized_react.json")

loaded_react = dspy.ReAct("claim -> titles: list[str]", tools=[search_wikipedia, lookup_wikipedia], max_iters=20)
loaded_react.load("optimized_react.json")

loaded_react(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles

### REASONING

In [5]:
import dspy
lm = dspy.LM('ollama_chat/llama3.1:8b', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [6]:
%pip install git+https://github.com/hendrycks/math.git

Collecting git+https://github.com/hendrycks/math.git
  Cloning https://github.com/hendrycks/math.git to /tmp/pip-req-build-2abq8rjd
  Running command git clone --filter=blob:none --quiet https://github.com/hendrycks/math.git /tmp/pip-req-build-2abq8rjd


  Resolved https://github.com/hendrycks/math.git to commit 357963a7f5501a6c1708cf3f3fb0cdf525642761
  Preparing metadata (setup.py) ... [?25ldone
[?25hNote: you may need to restart the kernel to use updated packages.


In [7]:
from dspy.datasets import MATH

dataset = MATH(subset='algebra')
print(len(dataset.train), len(dataset.dev))

350 350


In [8]:
example = dataset.train[0]
print("Question:", example.question)
print("Answer:", example.answer)

Question: The doctor has told Cal O'Ree that during his ten weeks of working out at the gym, he can expect each week's weight loss to be $1\%$ of his weight at the end of the previous week. His weight at the beginning of the workouts is $244$ pounds. How many pounds does he expect to weigh at the end of the ten weeks? Express your answer to the nearest whole number.
Answer: 221


In [9]:
module = dspy.ChainOfThought("question -> answer")
module(question=example.question)

Prediction(
    reasoning="To find Cal O'Ree's weight at the end of ten weeks, we need to calculate his weight after each week. We can do this by multiplying his weight at the beginning of each week by $1\\%$, which is equivalent to multiplying it by $\\frac{1}{100}$ or $0.01$. \n\nLet's start with his initial weight: $244$ pounds.\n\nAfter one week, he will weigh $244 \\cdot 0.99 = 242.76$ pounds.\n\nAfter two weeks, he will weigh $(242.76) \\cdot 0.99 = 240.55$ pounds.\n\nWe can continue this process for ten weeks to find his final weight.\n\nHowever, we can also use a formula to simplify the calculation: $W_n = W_0 \\cdot (1 - 0.01)^n$, where $W_n$ is Cal O'Ree's weight at the end of week $n$, and $W_0$ is his initial weight.\n\nPlugging in the values, we get:\n\n$W_{10} = 244 \\cdot (1 - 0.01)^{10}$\n\nUsing a calculator to evaluate this expression, we find that Cal O'Ree expects to weigh approximately $217.4$ pounds at the end of ten weeks.\n\nRounding to the nearest whole number,

In [10]:
THREADS = 24
kwargs = dict(num_threads=THREADS, display_progress=True, display_table=5)
evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)

evaluate(module)

Average Metric: 18.00 / 89 (20.2%):  25%|██▌       | 89/350 [03:45<10:02,  2.31s/it]

2024/12/24 09:39:14 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'If $a$, $b$, and $c$ are integers satisfying $a + \\frac 1b = \\frac{22}{7}$, $b + \\frac 1c = 8$, and $abc = 21$, then find $c + \\frac 1a$. Express your answer as a common fraction.', 'reasoning': 'Let $x = c + \\frac 1a$. Multiplying to take advantage of the symmetry, \\begin{align*}\\frac {22}7 \\cdot 8 \\cdot x &= \\left(a + \\frac 1b\\right)\\left(b + \\frac 1c\\right)\\left(c + \\frac 1a\\right) \\\\\n&= abc + a + b + c + \\frac 1a + \\frac 1b + \\frac 1c + \\frac{1}{abc} \\\\\n&= 21 + \\left(a + \\frac 1b\\right) + \\left(b + \\frac 1c \\right) + \\left(c + \\frac 1a\\right) + \\frac{1}{21} \\\\\n&= 21 + \\frac{22}{7} + 8 + x + \\frac 1{21} \\\\\n&= \\frac{29 \\cdot 21 + 22 \\cdot 3 + 1}{21} + x\n\\end{align*} Thus, $\\frac{22 \\cdot 8 \\cdot 3}{21} x = \\frac{29 \\cdot 21 + 22 \\cdot 3 + 1}{21} + x \\Longrightarrow x = \\frac{29 \\cdot 21 + 22 \\cdot 3 + 1}{22 \\cdot 8 \\cdot 3 - 21}

Average Metric: 75.00 / 349 (21.5%): 100%|██████████| 350/350 [13:43<00:00,  2.35s/it]

2024/12/24 09:49:06 INFO dspy.evaluate.evaluate: Average Metric: 75.0 / 350 (21.4%)





Unnamed: 0,question,example_reasoning,example_answer,pred_reasoning,pred_answer,method,reasoning,answer
0,What is the smallest integer value of $c$ such that the function $...,The given function has a domain of all real numbers if and only if...,1,To find the smallest integer value of $c$ such that the function h...,The final answer is: $\boxed{1}$,,,
1,What is the least value of $x$ that is a solution of $|{-x+3}|=7$?,"In order to have $|{-x+3}| = 7$, we must have $-x + 3 = 7$ or $-x ...",-4,To find the least value of $x$ that is a solution to the equation ...,The final answer is: $\boxed{-4}$,,,
2,Evaluate $\left\lceil -\frac{7}{4}\right\rceil$.,"$-\frac{7}{4}$ is between $-1$ and $-2$, so $\left\lceil -\frac{7}...",-1,"To evaluate $\left\lceil -\frac{7}{4}\right\rceil$, we need to fir...",The final answer is: $\boxed{-1}$,,,
3,"A triangle has vertices at coordinates $(11,1)$, $(2,3)$ and $(3,7...",We must find the distance between each pair of points by using the...,10,"To find the length of each side of the triangle, we can use the di...",The final answer is: $\boxed{10}$,,,
4,Let $f(x) = x + 2$ and $g(x) = 1/f(x)$. What is $g(f(-3))$?,"First, we find that $f(-3) = (-3) + 2 = -1$. Then, $$g(f(-3)) = g(...",1,"To find $g(f(-3))$, we first need to calculate $f(-3)$. We have th...",1,✔️ [True],,


21.43