## Welcome to the Notebook! 🥳

In this notebook, you will learn how to optimize your DSPy program using BootstrapFewShot, BootstrapFewShotWithRandomSearch, BootstrapFewShotWithOptuna, COPRO, and MIPRO!

We will use Cohere's `Command-R+` and `Command-R` LLMs, as well as OpenAI's `GPT-4`. We will log LLM calls and pipeline traces with `Arize Phoenix`, and show how you can use `Weights & Biases` to monitor `BootstrapFewShot` runs, our first step in this integration!

We will also of course use the `Weaviate` database, storing and indexing the Weaviate blog posts.


A few requirements:
1. You'll need a running Weaviate instance
    1. You can create a 14-day free cluster on [WCS](https://console.weaviate.cloud/)
    2. Or run Weaviate locally (use the `yaml` file in this folder with `docker-compose up -d`)
2. Generate Cohere and/or OpenAI API keys
3. Installations
    1. weaviate-client
    2. dspy-ai
4. Load your Weaviate cluster with data
    1. If you want to use the Weaviate blogs as the dataset, refer to the `Weaviate-Import.ipynb` file in this folder.

# Connect DSPy to our LLMs and Weaviate

In [None]:
#!pip install dspy-ai[cohere]

#pip uninstall dspy-ai << Y

!pip install cohere


In [7]:
import dspy
import os
from dspy.retrieve.weaviate_rm import WeaviateRM
import weaviate

from dotenv import load_dotenv

# Load environment variables from a local .env file (if present) before reading any keys.
load_dotenv()
cohere_api_key = os.environ.get('COHERE_API_KEY')
# Never print the key itself: cell outputs are saved inside the notebook and shared with it.
if not cohere_api_key:
    print("COHERE_API_KEY is not set; requests to the Cohere models will likely fail.")

# Language models used throughout the notebook.
command_r = dspy.Cohere(model="command-r", max_tokens=4000, api_key=cohere_api_key)
command_r_plus = dspy.Cohere(model="command-r-plus", max_tokens=4000, api_key=cohere_api_key)
gpt4 = dspy.OpenAI(model="gpt-4", max_tokens=4000)

# The host defaults to the address used originally; set WEAVIATE_HOST to point elsewhere.
weaviate_client = weaviate.connect_to_local(host=os.environ.get('WEAVIATE_HOST', '172.22.0.2'))
retriever_model = WeaviateRM("WeaviateBlogChunk", weaviate_client=weaviate_client)
# command-r becomes the default LM and the Weaviate collection the default retrieval model.
dspy.settings.configure(lm=command_r, rm=retriever_model)

[output redacted: this cell printed the Cohere API key; the exposed key should be rotated]


# Test Connection

In [9]:
# Smoke-test two of the configured models with a trivial prompt.
# Each call returns a list of completions, which is what gets printed.
print(command_r_plus("say hello"))
print(gpt4("say hello"))

['Hello! How can I help you today?']
['Hello! How can I assist you today?']


# Connect to Arize Phoenix Observability

In [None]:
!pip install arize-phoenix

In [None]:
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
# Export every DSPy span over OTLP/HTTP to the collector listening at `endpoint`.
endpoint = "http://127.0.0.1:6006/v1/traces"
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)

# Tracer provider with an empty resource; spans are handed to the exporter by a SimpleSpanProcessor.
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_processor = SimpleSpanProcessor(span_exporter=span_otlp_exporter)
tracer_provider.add_span_processor(span_processor)

# Register the provider globally, then instrument DSPy so its calls emit spans.
trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

# Load Dataset

In [None]:
import json

# Read the JSON dataset; each row is expected to carry "gold_answer" and "query" keys.
file_path = './WeaviateBlogRAG-0-0-0.json'
with open(file_path, 'r') as file:
    dataset = json.load(file)

gold_answers = [row["gold_answer"] for row in dataset]
queries = [row["query"] for row in dataset]

# One dspy.Example per row; only `question` is marked as an input field.
data = [
    dspy.Example(gold_answer=answer, question=query).with_inputs("question")
    for answer, query in zip(gold_answers, queries)
]

# Fixed positional split: first 25 rows train, next 10 dev, the remainder test.
trainset, devset, testset = data[:25], data[25:35], data[35:]

# Typed LLM Metrics

In [None]:
# LLM-as-judge signature used by MetricWrapper.
# NOTE(review): the docstring and the `desc` strings below are presumably rendered into the
# prompt by DSPy, so treat them as behavior rather than documentation — confirm before editing.
class TypedEvaluator(dspy.Signature):
    """Evaluate the quality of a system's answer to a question according to a given criterion."""
    
    # Inputs: the criterion to judge by, the question, the reference answer, and the answer under test.
    criterion: str = dspy.InputField(desc="The evaluation criterion.")
    question: str = dspy.InputField(desc="The question asked to the system.")
    ground_truth_answer: str = dspy.InputField(desc="An expert written Ground Truth Answer to the question.")
    predicted_answer: str = dspy.InputField(desc="The system's answer to the question.")
    # Output: annotated as float; the 1-5 range is only stated in the description text.
    rating: float = dspy.OutputField(desc="A float rating between 1 and 5")


def MetricWrapper(gold, pred, trace=None):
    """Score a prediction against its gold example with the TypedEvaluator judge.

    Args:
        gold: Example exposing `question` and `gold_answer`.
        pred: Prediction exposing `answer`.
        trace: Accepted for compatibility with the metric call signature; not used.

    Returns:
        The `rating` field of the judge's output.
    """
    # NOTE(review): the rating is returned as-is even when `trace` is set. If the optimizer
    # treats any truthy metric value as success, every bootstrapped demo would pass — confirm
    # whether a threshold is wanted here.
    # NOTE(review): the criterion text says "ground_truth" while the field is named
    # `ground_truth_answer`; left untouched because it is part of the prompt.
    judge = dspy.TypedPredictor(TypedEvaluator)
    verdict = judge(
        criterion="How aligned is the predicted_answer with the ground_truth?",
        question=gold.question,
        ground_truth_answer=gold.gold_answer,
        predicted_answer=pred.answer,
    )
    return verdict.rating

# RAG

In [None]:
# Signature for the answer-generation step of the RAG program.
# The docstring is the task instruction, so it is kept short; the duplicated word
# ("the the") in the original instruction has been removed.
class GenerateAnswer(dspy.Signature):
    """Assess the context and answer the question."""

    # Retrieved passages handed to the model as grounding.
    context = dspy.InputField(desc="Helpful information for answering the question.")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="A detailed answer that is supported by the context.")
    
class RAG(dspy.Module):
    """Retrieve-then-generate program: fetch passages for a question, then answer from them.

    Args:
        k: Number of passages requested from the configured retrieval model.
    """

    def __init__(self, k=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=k)
        self.generate_answer = dspy.Predict(GenerateAnswer)

    def forward(self, question):
        """Answer `question` and return a Prediction with `context`, `answer`, and `question`."""
        context = self.retrieve(question).passages
        # Generation runs under whichever LM is active in dspy.settings. The previous
        # hard-coded `with dspy.context(lm=command_r)` overrode any outer context, so a
        # teacher LM passed through `teacher_settings` was never actually used. The notebook
        # configures command_r as the default LM, so plain runs behave as before.
        answer = self.generate_answer(context=context, question=question).answer
        return dspy.Prediction(context=context, answer=answer, question=question)

In [None]:
# Toy question/answer pair for a quick end-to-end sanity check of the pipeline and the metric.
lgtm_query = "What do cross encoders do?"
toy_ground_truth_answer = """
Cross encoders score the relevance of a document to a query. They are commonly used to rerank documents.
"""
lgtm_example = dspy.Example(question=lgtm_query, gold_answer=toy_ground_truth_answer)
test_example = dspy.Example(question=lgtm_query, gold_answer=toy_ground_truth_answer)

# Run the uncompiled program once and show its answer.
uncompiled_Prediction = RAG()(lgtm_query)
test_pred = uncompiled_Prediction
print(f"LGTM test query: {lgtm_query} \n \n ")
print(f"Uncompiled Answer: {uncompiled_Prediction.answer} \n \n")

# Rate that answer against the toy ground truth with the LLM metric.
llm_metric_rating = MetricWrapper(test_example, test_pred)
print(f"LLM Metric Rating: {llm_metric_rating}")

In [None]:
# Show the two most recent command-r calls to check the prompts that were actually sent.
command_r.inspect_history(n=2)

In [None]:
from dspy.evaluate.evaluate import Evaluate

# Reusable evaluator over the dev split; every later cell scores compiled programs with it.
evaluate = Evaluate(devset=devset, num_threads=4, display_progress=False)

# Baseline: score of the uncompiled RAG program under the LLM metric.
uncompiled_score = evaluate(RAG(), metric=MetricWrapper)

# BootstrapFewShot

In [None]:
from dspy.teleprompt import BootstrapFewShot

# Replace with Teacher Model
teacher_settings = {"lm": command_r_plus}

# Sweep the bootstrapped-demo budget over 1, 2, 3; compile and score a fresh RAG program each time.
for demo_budget in range(1, 4):
    teleprompter = BootstrapFewShot(
        metric=MetricWrapper,
        teacher_settings=teacher_settings,
        max_bootstrapped_demos=demo_budget,
        max_rounds=1,
    )
    compiled_RAG = teleprompter.compile(RAG(), trainset=trainset)
    compiled_RAG_score = evaluate(compiled_RAG, metric=MetricWrapper)
    print(f"\n\033[91mCompiled RAG Score at Demos = {demo_budget}: {compiled_RAG_score}\n")

# Weights & Biases

Learn more about how you can use Weights & Biases logging for `BootstrapFewShot` runs [here!](https://github.com/weaviate/recipes/blob/main/integrations/weights_and_biases/wandb_logging_RAG_dspy_cohere.ipynb)

In PR #849 to DSPy, we introduced wandb logging so that you can see the `metric_val` returned for each bootstrapped example. To motivate the use case: you may rate answers on a scale of 1 to 5 and only want to use examples that achieve a 5 in your prompt. This is the first of many collaborations between Weaviate and Weights & Biases!

In [None]:
from IPython.display import Image, display

# Render the saved Weights & Biases dashboard screenshot (path is relative to this notebook).
display(Image(filename='./weights_and_biases/RAG-optimization-dashboard.png'))

# BootstrapFewShotWithRandomSearch

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Random search with up to 2 bootstrapped demos and 2 candidate programs.
teacher_settings = {"lm": command_r}
search_config = {"max_bootstrapped_demos": 2, "num_candidate_programs": 2}

teleprompter = BootstrapFewShotWithRandomSearch(
    teacher_settings=teacher_settings,
    metric=MetricWrapper,
    **search_config,
)

# Compile on the train split, then score the result on the dev split.
compiled_RAG = teleprompter.compile(RAG(), trainset=trainset)
compiled_RAG_score = evaluate(compiled_RAG, metric=MetricWrapper)
print(f"\n\033[91mCompiled RAG Score: {compiled_RAG_score}\n")

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Same search as the previous cell, but with 1 bootstrapped demo and 5 candidate programs.
teacher_settings = {"lm": command_r}
search_config = {"max_bootstrapped_demos": 1, "num_candidate_programs": 5}

teleprompter = BootstrapFewShotWithRandomSearch(
    teacher_settings=teacher_settings,
    metric=MetricWrapper,
    **search_config,
)

# Compile on the train split, then score the result on the dev split.
compiled_RAG = teleprompter.compile(RAG(), trainset=trainset)
compiled_RAG_score = evaluate(compiled_RAG, metric=MetricWrapper)
print(f"\n\033[91mCompiled RAG Score: {compiled_RAG_score}\n")

# BootstrapFewShotWithOptuna

In [None]:
from dspy.teleprompt import BootstrapFewShotWithOptuna

# Optuna-driven search over bootstrapped demos: 2 demos, 2 candidate programs.
teacher_settings = {"lm": command_r}
optuna_config = {"max_bootstrapped_demos": 2, "num_candidate_programs": 2}

teleprompter = BootstrapFewShotWithOptuna(
    teacher_settings=teacher_settings,
    metric=MetricWrapper,
    **optuna_config,
)

# `max_demos` is passed at compile time for this optimizer.
compiled_RAG = teleprompter.compile(RAG(), trainset=trainset, max_demos=2)
compiled_RAG_score = evaluate(compiled_RAG, metric=MetricWrapper)
print(f"\n\033[91mCompiled RAG Score: {compiled_RAG_score}\n")

# COPRO

In [None]:
from dspy.teleprompt import COPRO

# NOTE: this rebinds `gpt4` to a different OpenAI model; every later cell that uses `gpt4`
# picks up this client once the cell has run.
gpt4 = dspy.OpenAI(model="gpt-4-1106-preview", max_tokens=4000, model_type="chat")

# gpt4 acts as the prompt model; candidates are scored with the LLM metric.
COPRO_teleprompter = COPRO(
    prompt_model=gpt4,
    metric=MetricWrapper,
    breadth=5,
    depth=3,
    init_temperature=0.7,
    verbose=False,
    track_stats=True,
)
kwargs = {"num_threads": 1, "display_progress": True, "display_table": 5}

# Only the first three training examples are used for this compile.
COPRO_compiled_RAG = COPRO_teleprompter.compile(
    RAG(),
    trainset=trainset[:3],
    eval_kwargs=kwargs,
)
eval_score = evaluate(COPRO_compiled_RAG, metric=MetricWrapper)
print(eval_score)

In [None]:
# Spot-check the COPRO-compiled program on a single question.
print(COPRO_compiled_RAG(question="What is ref2vec?").answer)

In [None]:
# `inspect_history` prints the prompt/completion itself, so wrapping it in print() only adds
# a duplicate (or a stray "None"). The trailing `;` suppresses any returned value in the cell output.
command_r.inspect_history(n=1);

# Typed COPRO

[Work in Progress]

```python
from dspy.evaluate.evaluate import Evaluate

evaluator_for_TypedCOPRO = Evaluate(metric=MetricWrapper, devset=devset, num_threads=4, display_progress=False)

from dspy.teleprompt import optimize_signature

TypedCOPRO_compiled_RAG = optimize_signature(
    student=RAG(),
    evaluator=evaluator_for_TypedCOPRO,
    n_iterations=10,
    sorted_order="increasing",
    strategy="best",
    max_examples=20,
    prompt_model=command_r,
    initial_prompts=2,
    verbose=False,
)

eval_score = evaluate(TypedCOPRO_compiled_RAG, metric=MetricWrapper)
print(eval_score)
```

# MIPRO

In [None]:
# Signature that condenses accumulated dataset observations into a short summary.
# NOTE(review): the docstring and `desc` strings are presumably used as prompt text by DSPy —
# confirm before rewording.
class ObservationSummarizer(dspy.Signature):
    """Given a series of observations I have made about my dataset, please summarize them into a brief 2-3 sentence summary which highlights only the most important details."""

    # Input: free-text observations; output: the condensed summary.
    observations = dspy.InputField(desc="Observations I have made about my dataset")
    summary = dspy.OutputField(
        desc="Two to Three sentence summary of only the most significant highlights of my observations",
    )


# Signature that asks the model for observations about a batch of dataset examples.
# The parenthesised, implicitly concatenated strings form the class docstring (the task
# instruction). Typos in the instruction and output description have been corrected
# ("conciceness" -> "conciseness", "Somethings that holds" -> "Something that holds").
class DatasetDescriptor(dspy.Signature):
    (
        """Given several examples from a dataset please write observations about trends that hold for most or all of the samples. """
        """Some areas you may consider in your observations: topics, content, syntax, conciseness, etc. """
        """It will be useful to make an educated guess as to the nature of the task this dataset will enable. Don't be afraid to be creative"""
    )

    examples = dspy.InputField(desc="Sample data points from the dataset")
    observations = dspy.OutputField(desc="Something that holds true for most or all of the data you observed")


# Variant of the dataset-description signature that also receives earlier observations, so
# the model can extend them or answer COMPLETE when it has nothing to add.
# The parenthesised strings form the class docstring (the task instruction). Fixed in the
# prompt text: "conciceness" -> "conciseness", "Somethings that holds" -> "Something that
# holds", and the missing period after 'COMPLETE' that ran two sentences together.
class DatasetDescriptorWithPriorObservations(dspy.Signature):
    (
        """Given several examples from a dataset please write observations about trends that hold for most or all of the samples. """
        """I will also provide you with a few observations I have already made.  Please add your own observations or if you feel the observations are comprehensive say 'COMPLETE'. """
        """Some areas you may consider in your observations: topics, content, syntax, conciseness, etc. """
        """It will be useful to make an educated guess as to the nature of the task this dataset will enable. Don't be afraid to be creative"""
    )

    examples = dspy.InputField(desc="Sample data points from the dataset")
    prior_observations = dspy.InputField(desc="Some prior observations I made about the data")
    observations = dspy.OutputField(
        desc="Something that holds true for most or all of the data you observed or COMPLETE if you have nothing to add",
    )

In [None]:
# Make gpt4 the default LM for the dataset-description step.
# NOTE: `gpt4` is whichever client was bound most recently (the COPRO cell rebinds it).
dspy.settings.configure(lm=gpt4)

# One predictor per signature: first-batch description, follow-up description, and summary.
dataset_descriptor = dspy.Predict(DatasetDescriptor)
dataset_descriptor_with_prior = dspy.Predict(DatasetDescriptorWithPriorObservations)
observation_summarizer = dspy.Predict(ObservationSummarizer)

def examples_to_strings(trainset):
    """Render each example as a two-line "Question: ...\\nAnswer: ..." string.

    Args:
        trainset: Iterable of objects exposing `question` and `gold_answer` attributes.

    Returns:
        A list with one formatted string per example, in input order.
    """
    return [
        f"Question: {item.question}\nAnswer: {item.gold_answer}"
        for item in trainset
    ]

# Walk the training set in batches, describing each batch and carrying a running summary forward.
batch_size = 5
dataset_description = ""
for start_index in range(0, len(trainset), batch_size):
    batch_strings = examples_to_strings(trainset[start_index:start_index + batch_size])
    # Separate examples with a blank line. Joining on "" glued each answer directly onto the
    # next example's "Question:" label, blurring example boundaries in the prompt.
    examples = "\n\n".join(batch_strings)
    if start_index == 0:
        # First batch: nothing has been observed yet.
        dataset_description = dataset_descriptor(examples=examples).observations
    else:
        # Later batches: pass the running summary in as prior observations.
        dataset_description = dataset_descriptor_with_prior(
            examples=examples,
            prior_observations=dataset_description,
        ).observations
    summary = observation_summarizer(observations=dataset_description).summary
    print(f"\033[32m\nStart index: {start_index}.")
    print(f"\033[0m\nDatasetDescriptor output: {dataset_description}")
    print(f"\033[31m\nSummarizer output: {summary}\n")
    # The summary, not the raw observations, is what the next batch sees.
    dataset_description = summary
    

In [None]:
from dspy.teleprompt import MIPRO

# Make command-r the default LM again before optimizing.
dspy.settings.configure(lm=command_r)

# gpt4 is the prompt model, command-r the task model.
teleprompter = MIPRO(
    prompt_model=gpt4,
    task_model=command_r,
    metric=MetricWrapper,
    num_candidates=10,
    init_temperature=0.5,
)
kwargs = {"num_threads": 1, "display_progress": True, "display_table": 0}

MIPRO_compiled_RAG = teleprompter.compile(
    RAG(),
    trainset=trainset,
    num_trials=3,
    max_bootstrapped_demos=1,
    max_labeled_demos=0,
    eval_kwargs=kwargs,
)
eval_score = evaluate(MIPRO_compiled_RAG, metric=MetricWrapper)
print(eval_score)

In [None]:
# Spot-check the MIPRO-compiled program on a single question.
print(MIPRO_compiled_RAG(question="What is ref2vec?").answer)

In [None]:
# `inspect_history` prints the prompt/completion itself, so wrapping it in print() only adds
# a duplicate (or a stray "None"). The trailing `;` suppresses any returned value in the cell output.
command_r.inspect_history(n=1);