In [1]:
import os
import sys

sys.path.append(os.path.expanduser("~/dspy/"))
os.environ["DSP_CACHEBOOL"] = "False"  # disable cache

In [2]:
from openinference.instrumentation.dspy import DSPyInstrumentor
from openinference.instrumentation.openai import OpenAIInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

endpoint = "http://localhost:6006/v1/traces"
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))

trace_api.set_tracer_provider(tracer_provider=tracer_provider)

OpenAIInstrumentor().instrument()
DSPyInstrumentor().instrument()

In [23]:
from phoenix.trace import using_project
from phoenix.evals import download_benchmark_dataset

In [51]:
import dspy

lm = dspy.OpenAI(model="gpt-4-turbo", max_tokens=2000)
dspy.settings.configure(lm=lm)

In [18]:
lm.kwargs

{'temperature': 0.0,
 'max_tokens': 500,
 'top_p': 1,
 'frequency_penalty': 0,
 'presence_penalty': 0,
 'n': 1,
 'model': 'gpt-4-turbo'}

In [4]:
from dspy.datasets import HotPotQA

dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)

trainset, devset = dataset.train, dataset.dev

In [21]:
for example in devset:
    if "election law journal" in example["question"].lower():
        print(example["question"])
        print(example["answer"])

In what city was the Election Law Journal founded?
Portland


In [14]:
for example in devset:
    print(example["answer"])

no
National Hockey League
Steve Yzerman
the River Tyne
King Alfred the Great
Port Authority of New York and New Jersey
Bundesliga
no
Waldo County, Maine
The Afghan Whigs
79 AD
the oldest
not
Del Lord
Jonathan William Patrick Aitken
Marche
7,402 at the 2010 census
design their own interdisciplinary program
English
Robert F. Chew
Manchester
Deepa Mehta
the good market
Christine Comer
William Street Hutchings
Battle of the Ch'ongch'on River
Ian Botham
defensive assistant at Florida Atlantic
Ewan McGregor
space
Maria Yermolova
no
the voice of basketball
yes
Hamas
1989
Apera
no
Exon
yes
Bill Melendez
Clarence River
Pixar
Renault
Cadwalader Heights
"Forza Italia" party.
the Wehrmacht
cricketer
Danny Wallace
Portland


In [15]:
class CoTSignature(dspy.Signature):
    """Carefully analyze the question and any other supplied information. First, write out in a step by step manner
    an EXPLANATION to show how to arrive at the correct answer. Avoid simply stating the correct answer
    at the outset."""

    question = dspy.InputField(desc="question about something")
    answer = dspy.OutputField(
        desc="as concisely as possible, state the answer to the question"
    )


class CoTPipeline(dspy.Module):
    def __init__(self):
        super().__init__()
        self.signature = CoTSignature
        self.predictor = dspy.ChainOfThought(self.signature)

    def forward(self, question):
        result = self.predictor(question=question)
        return dspy.Prediction(
            answer=result.answer,
            reasoning=result.rationale,
        )

In [6]:
from dspy.evaluate import Evaluate


def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    return answer_EM


NUM_THREADS = 5
evaluate = Evaluate(
    devset=devset,
    metric=validate_context_and_answer,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=False,
)

In [10]:
cot_baseline = CoTPipeline()

devset_with_input = [
    dspy.Example({"question": r["question"], "answer": r["answer"]}).with_inputs(
        "question"
    )
    for r in devset
]
trainset_with_input = [
    dspy.Example({"question": r["question"], "answer": r["answer"]}).with_inputs(
        "question"
    )
    for r in trainset
]
with using_project("baseline"):
    evaluate(cot_baseline, devset=devset_with_input)

Average Metric: 21 / 50  (42.0): 100%|██████████| 50/50 [00:54<00:00,  1.09s/it]


In [19]:
from dspy.teleprompt import COPRO

teleprompter = COPRO(metric=validate_context_and_answer, verbose=True)
cot = CoTPipeline()
with using_project("compile-3"):
    compiled_prompt_opt = teleprompter.compile(
        cot,
        trainset=devset_with_input,
        eval_kwargs={"num_threads": 64, "display_progress": True, "display_table": 0},
    )

Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:21<00:00,  2.37it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:16<00:00,  3.01it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:16<00:00,  3.04it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:12<00:00,  4.00it/s]
Average Metric: 2 / 50  (4.0): 100%|██████████| 50/50 [00:15<00:00,  3.26it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:21<00:00,  2.35it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:15<00:00,  3.25it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:09<00:00,  5.31it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:19<00:00,  2.59it/s]
Average Metric: 11 / 50  (22.0): 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:30<00:00,  1.63it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50

In [None]:
evaluate(compiled_prompt_opt, devset=devset_with_input)

In [None]:
compiled_prompt_opt

In [22]:
import pandas as pd

df = (
    pd.read_csv(
        "https://storage.googleapis.com/arize-phoenix-assets/evals/ref-link-classification/ref_link_golden_test_data.csv",
    )
    .sample(n=100)
    .rename(columns={"conversation": "input", "document_text": "reference"})
)
df.head(3)

Unnamed: 0.1,Unnamed: 0,input,url,reference,is_correct_ref_link
94,119,Can you give me an example schema I could use ...,https://docs.arize.com/arize/sending-data-guid...,\n\n\n\n\n\nWhat Is A Model Schema - Arize Doc...,True
42,51,How can I run Arize on my own hardware?,https://docs.arize.com/arize/on-premise-deploy...,\n\n\n\n\n\nOverview - Arize Docs\n\n\n\n\n\n\...,True
157,197,Can I log batches of data?,https://docs.arize.com/arize/api-reference/jav...,\n\n\n\n\n\nbulkLog - Arize Docs\n\n\n\n\n\n\n...,True


In [56]:
wikiqa_train_df = (
    download_benchmark_dataset(task="binary-relevance-classification", dataset_name="wiki_qa-train")
    .sample(n=50)
    .rename(
        columns={
            "query_text": "query",
            "document_text": "reference",
            "relevant": "relevance",
        }
    )
)
wikiqa_train_df.head()

Unnamed: 0,query_id,query,document_title,reference,document_text_with_emphasis,relevance
1796,Q585,what countries have won the world cup,FIFA World Cup,"The FIFA World Cup, often simply the World Cup...","The FIFA World Cup, often simply the World Cup...",False
1123,Q2446,who is the actor who played the gay old guy in...,List of Family Guy cast members,Family Guy creator Seth MacFarlane also provid...,Family Guy creator Seth MacFarlane also provid...,False
2073,Q946,how was marijuana discovered,Cannabis (drug),Cannabis flower with visible trichomes Cannabi...,Cannabis flower with visible trichomes Cannabi...,False
476,Q1601,when do solar eclipses happen?,Solar eclipse,Photo of 1999 total eclipse As seen from the E...,Photo of 1999 total eclipse AS SEEN FROM THE E...,True
853,Q2097,what was the first honda car,Honda S600,The Honda S600 is an automobile manufactured b...,THE HONDA S600 IS AN AUTOMOBILE MANUFACTURED B...,True


In [57]:
wikiqa_trainset = []
for _, row in wikiqa_train_df.iterrows():
    query = row["query"]
    reference = row["reference"]
    answer = "relevant" if row["relevance"] else "irrelevant"
    wikiqa_trainset.append(
        dspy.Example({"query": query, "reference": reference, "answer": answer}).with_inputs(
            "query", "reference"
        )
    )
wikiqa_trainset

[Example({'query': 'what countries have won the world cup', 'reference': "The FIFA World Cup, often simply the World Cup, is an international association football competition contested by the senior men's national teams of the members of Fédération Internationale de Football Association ( FIFA ), the sport's global governing body. The championship has been awarded every four years since the inaugural tournament in 1930 , except in 1942 and 1946 when it was not held because of the Second World War . The current champions are Spain , who won the 2010 tournament . The current format of the tournament involves 32 teams competing for the title at venues within the host nation(s) over a period of about a month; this phase is often called the World Cup Finals. A qualification phase , which currently takes place over the preceding three years, is used to determine which teams qualify for the tournament together with the host nation(s). The 19 World Cup tournaments have been won by eight differ

In [61]:
class RelevanceCoTSignature(dspy.Signature):
    """
    Compare the query above to the reference text. You must determine whether the reference text
    contains information that can help answer the query. First, write out in a step by step manner
    an EXPLANATION that reasons about how to arrive at the correct answer. Avoid simply stating the correct answer
    at the outset.
    """

    query = dspy.InputField(desc="a query from the user")
    reference = dspy.InputField(desc="a reference document")
    answer = dspy.OutputField(
        desc="a one-word answer, either 'relevant' or 'irrelevant'"
    )


class RelevanceCoTPipeline(dspy.Module):
    def __init__(self):
        super().__init__()
        self.signature = RelevanceCoTSignature
        self.predictor = dspy.ChainOfThought(self.signature)

    def forward(self, query, reference):
        result = self.predictor(query=query, reference=reference)
        return dspy.Prediction(
            answer=result.answer,
            reasoning=result.rationale,
        )

In [62]:
teleprompter = COPRO(metric=validate_context_and_answer, verbose=True)
relevance_cot = RelevanceCoTPipeline()
with using_project("compile-relevance-2"):
    compiled_prompt_opt = teleprompter.compile(
        relevance_cot,
        trainset=wikiqa_trainset,
        eval_kwargs={"num_threads": 64, "display_progress": True, "display_table": 0},
    )

Average Metric: 6 / 50  (12.0): 100%|██████████| 50/50 [00:41<00:00,  1.21it/s]
Average Metric: 37 / 50  (74.0): 100%|██████████| 50/50 [00:15<00:00,  3.22it/s]
Average Metric: 37 / 50  (74.0): 100%|██████████| 50/50 [00:21<00:00,  2.30it/s]
Average Metric: 13 / 50  (26.0): 100%|██████████| 50/50 [00:23<00:00,  2.15it/s]
Average Metric: 0 / 50  (0.0): 100%|██████████| 50/50 [01:01<00:00,  1.24s/it]
Average Metric: 31 / 50  (62.0): 100%|██████████| 50/50 [00:15<00:00,  3.23it/s]
Average Metric: 24 / 50  (48.0): 100%|██████████| 50/50 [00:20<00:00,  2.42it/s]
Average Metric: 40 / 50  (80.0): 100%|██████████| 50/50 [00:25<00:00,  1.97it/s]
Average Metric: 16 / 50  (32.0): 100%|██████████| 50/50 [00:39<00:00,  1.26it/s]
Average Metric: 38 / 50  (76.0): 100%|██████████| 50/50 [00:17<00:00,  2.78it/s]
Average Metric: 38 / 50  (76.0): 100%|██████████| 50/50 [00:17<00:00,  2.89it/s]
Average Metric: 36 / 50  (72.0): 100%|██████████| 50/50 [00:25<00:00,  1.96it/s] 
Average Metric: 35 / 50  (70.0