# RAG Evaluation

In [40]:
# Path to the processed QA dataset (JSON). It is relative, so it resolves
# against the kernel's current working directory at the time it is opened.
DATASET_FPATH = './data/processed/dataset.json'
import json
import os
from src import chromadb_rm
import dspy
from dspy.evaluate.evaluate import Evaluate


In [None]:
# Move one directory up so that relative paths such as DATASET_FPATH resolve
# (presumably the parent is the project root — TODO confirm).
# Guarded on the dataset path so the cell is idempotent: re-running it, or
# starting the kernel from a directory where the dataset already resolves,
# no longer walks up an extra level and breaks every relative path below.
if not os.path.exists(DATASET_FPATH):
    os.chdir('../')

In [41]:
# SECURITY: never hardcode an API key in a notebook — it is stored in the file,
# in version control history, and is visible to every reader. Any key that was
# previously committed here must be treated as compromised and revoked.
# Provide the key through the environment that launches the kernel, e.g.
#   export OPENAI_API_KEY=...
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that launches "
        "the notebook kernel instead of hardcoding it in the notebook."
    )

# Language model used for answer generation; the client is expected to pick the
# key up from the OPENAI_API_KEY environment variable, as it did before.
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

In [42]:
# Read the raw dataset from disk.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's locale default (which is not UTF-8 on every system and would
# corrupt or reject non-ASCII text in the questions/answers).
with open(DATASET_FPATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

In [43]:
from src.custom_dataset import CustomQADataSet
    
# Wrap the loaded queries/answers in the project's dataset class, requesting an
# 80% / 10% / 10% train / dev / test split.
# NOTE(review): this rebinds `dataset` from the loaded JSON dict to a
# CustomQADataSet, so re-running this cell without re-running the JSON-loading
# cell first will likely fail on `dataset['queries']` — confirm and consider
# using a separate name for the raw dict.
dataset = CustomQADataSet(
    dataset['queries'],
    dataset['answers'],
    train_size=0.8,
    dev_size=0.1,
    test_size=0.1,
)

In [44]:
# Mark 'question' as the only input field of every example; all remaining
# fields are then treated by DSPy as labels and/or metadata.
trainset = [example.with_inputs('question') for example in dataset.train]
testset = [example.with_inputs('question') for example in dataset.test]
devset = [example.with_inputs('question') for example in dataset.dev]

In [45]:
# Retrieval model over the "test" collection persisted under ./chroma.db,
# configured with a local sentence-transformers embedding model. The OpenAI key
# is read from the environment rather than written here.
chroma_rm = chromadb_rm.ChromadbRM(
    collection_name="test",
    persist_directory="chroma.db",
    local_embed_model="sentence-transformers/paraphrase-MiniLM-L6-v2",
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

Collection Count: 3795


In [46]:
# Register the language model (`turbo`) and the retrieval model (`chroma_rm`)
# in DSPy's global settings; modules below are constructed without passing
# either explicitly, so they presumably pick these up as defaults.
dspy.settings.configure(lm=turbo, rm=chroma_rm)

In [47]:
# DSPy signature for the answer-generation step: takes retrieved context plus a
# question and produces a short answer.
# NOTE(review): DSPy is understood to use the class docstring and the `desc`
# strings below as prompt text, so treat them as runtime strings — rewording
# them may change model output and the scores recorded in this notebook.
# Confirm against the DSPy version in use before editing them.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    # Input: passages supplied by the caller (RAG.forward passes the retriever's output).
    context = dspy.InputField(desc="may contain relevant facts")
    # Input: the question to answer.
    question = dspy.InputField()
    # Output: the generated answer; the desc asks for a response of 1 to 5 words.
    answer = dspy.OutputField(desc="Explain with words between 1 and 5 words")

In [48]:
class RAG(dspy.Module):
    """Retrieve-then-answer pipeline: fetch passages for a question, then generate an answer from them."""

    def __init__(self, num_passages=3):
        """Build the two steps of the pipeline.

        Args:
            num_passages: number of passages requested from the retriever (passed as ``k``).
        """
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` using retrieved passages.

        Returns:
            A ``dspy.Prediction`` with ``context`` (the retrieved passages) and
            ``answer`` (the generated answer).
        """
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)

In [49]:
# Baseline (uncompiled) RAG program retrieving 3 passages per question.
rag = RAG(num_passages=3)

In [50]:
# Set up the `evaluate_on_custom` evaluator over `testset` (single-threaded, with a
# progress bar and a 5-row results table). The same setup is repeated further below.
evaluate_on_custom = Evaluate(devset=testset, num_threads=1, display_progress=True, display_table=5)

# Evaluate the uncompiled `rag` program (the baseline) with the `answer_exact_match` metric.
metric = dspy.evaluate.answer_exact_match
score = evaluate_on_custom(rag, metric=metric)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Average Metric: 14 / 44  (31.8): 100%|██████████| 44/44 [00:54<00:00,  1.24s/it]

Average Metric: 14 / 44  (31.8%)



  df = df.applymap(truncate_cell)


Unnamed: 0,question,example_answer,context,pred_answer,answer_exact_match
0,What are the two types of vertebrae recognized in temnospondyls?,- Stereospondylous and rhachitomous.,"[""temnospondyls'vertebrae are divided into several segments. in living tetrapods, the main body of the vertebra is a single piece of bone called the centrum, but...",Stereospondylous and rhachitomous.,✔️ [True]
1,What instruments did Thomas Newman mainly use to create the score for American Beauty?,Percussion instruments.,"['thomas newman\'s score was recorded in santa monica, california. he mainly used percussion instruments to create the mood and rhythm, the inspiration for which was...",Percussion instruments.,✔️ [True]
2,When was the name 'Tetrarch' given to the Mk VII tank?,22 September 1941,"['. the tetrarch was chosen because it was an obsolete design, and was therefore available to be used by the airborne forces.', ""the cumulative effect...",22 September 1941,✔️ [True]
3,Where is San Lorenzo Colossal Head 2 currently located?,Mexico City.,"['. the monument was found lying on its back, facing the sky, and was excavated in 1946 by stirling and philip drucker. in 1962 the...",Museo Nacional de Antropologia,False
4,When was the first British airborne operation conducted?,Operation Colossus.,"['because of a lack of equipment training facilities in mid @ - @ 1940, when the british airborne establishment was formed, the war office was...",After mid-1940.,False


----

Using Teleprompter
---

In [53]:
# Keep only the first 10 training examples; this subset is what gets passed to
# the teleprompter as its training set when compiling the RAG program.
few_shot_set = trainset[:10]

In [58]:
from dspy.teleprompt import BootstrapFewShot

def validate_context_and_answer(example, pred, trace=None):
    """Metric used while bootstrapping demonstrations.

    A prediction is accepted only when the predicted answer matches the
    reference answer exactly AND the retrieved context contains that answer.

    Args:
        example: the labelled example being checked.
        pred: the program's prediction for that example.
        trace: accepted so the function fits the metric call signature; unused here.

    Returns:
        The result of ``exact_match and passage_match`` (truthy only if both checks pass).
    """
    # Both checks are evaluated up front, exactly as before; only the final
    # combination short-circuits.
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    return exact_match and passage_match

# Teleprompter that bootstraps few-shot demonstrations, keeping only traces
# that pass `validate_context_and_answer`.
teleprompter = BootstrapFewShot(
    metric=validate_context_and_answer,
)

# Compile a fresh 3-passage RAG program against the small few-shot training subset.
compiled_rag = teleprompter.compile(
    RAG(3),
    trainset=few_shot_set,
)


[A
100%|██████████| 10/10 [00:00<00:00, 91.41it/s]

Bootstrapped 3 full traces after 10 examples in round 0.





In [56]:
# Set up the `evaluate_on_custom` evaluator over `testset` (single-threaded, with a
# progress bar and a 5-row results table), matching the baseline evaluation setup.
evaluate_on_custom = Evaluate(devset=testset, num_threads=1, display_progress=True, display_table=5)

# Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
metric = dspy.evaluate.answer_exact_match
score = evaluate_on_custom(compiled_rag, metric=metric)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Average Metric: 15 / 44  (34.1): 100%|██████████| 44/44 [00:56<00:00,  1.29s/it]

Average Metric: 15 / 44  (34.1%)



  df = df.applymap(truncate_cell)


Unnamed: 0,question,example_answer,context,pred_answer,answer_exact_match
0,What are the two types of vertebrae recognized in temnospondyls?,- Stereospondylous and rhachitomous.,"[""temnospondyls'vertebrae are divided into several segments. in living tetrapods, the main body of the vertebra is a single piece of bone called the centrum, but...",Stereospondylous and rhachitomous vertebrae.,False
1,What instruments did Thomas Newman mainly use to create the score for American Beauty?,Percussion instruments.,"['thomas newman\'s score was recorded in santa monica, california. he mainly used percussion instruments to create the mood and rhythm, the inspiration for which was...",Percussion instruments.,✔️ [True]
2,When was the name 'Tetrarch' given to the Mk VII tank?,22 September 1941,"['. the tetrarch was chosen because it was an obsolete design, and was therefore available to be used by the airborne forces.', ""the cumulative effect...","September 22, 1941.",False
3,Where is San Lorenzo Colossal Head 2 currently located?,Mexico City.,"['. the monument was found lying on its back, facing the sky, and was excavated in 1946 by stirling and philip drucker. in 1962 the...",Museo Nacional de Antropologia in Mexico City.,False
4,When was the first British airborne operation conducted?,Operation Colossus.,"['because of a lack of equipment training facilities in mid @ - @ 1940, when the british airborne establishment was formed, the war office was...",Mid-1940.,False
