# RAG

In [73]:
from src.utils import *
import dspy
import os

from langchain.text_splitter import SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction, SentenceTransformerEmbeddingFunction

In [74]:
os.chdir('../')

In [None]:

# Load the model
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

In [None]:
# Read the text
with open('./data/raw/test.txt', 'r') as f:
    text = f.read().strip()

dspy.settings.configure(lm=turbo)

In [75]:
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""

    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [76]:
# Define the predictor.
generate_answer = dspy.Predict(BasicQA)

# Call the predictor on a particular input.
pred = generate_answer(question="What is the capital of France?")

# Print the output.
pred.answer

'Paris'

----

## ChromaDB

In [77]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=512,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text(text)

print(f"\nTotal chunks: {len(character_split_texts)}\n")


Total chunks: 3789



In [80]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

print(f"\nTotal chunks: {len(token_split_texts)}")


Total chunks: 3789


In [81]:
embedding_function = SentenceTransformerEmbeddingFunction()


print("Length of embedding:")
print(len(embedding_function([token_split_texts[0]])[0]))


Length of embedding:
384


In [19]:
chroma_client = chromadb.PersistentClient("chroma.db")

In [82]:
# Create a new collection
chroma_collection = chroma_client.get_or_create_collection("test", embedding_function=embedding_function)

ids = [str(i) for i in range(len(token_split_texts))]

In [83]:
chroma_collection.add(ids=ids, documents=token_split_texts)

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embe

In [84]:
chroma_client.list_collections()

[Collection(name=test)]

In [85]:
chroma_collection.peek(1)

{'ids': ['0'],
 'embeddings': [[-0.044287558645009995,
   -0.038164783269166946,
   -0.07295431941747665,
   0.027609067037701607,
   -0.005572253372520208,
   -0.06242752447724342,
   0.0648723691701889,
   0.009721271693706512,
   0.0771678239107132,
   -0.03820229694247246,
   -0.022042306140065193,
   0.01082813460379839,
   0.05214730277657509,
   -0.0428561195731163,
   -0.05621713027358055,
   0.04174640402197838,
   -0.05734525993466377,
   0.01149690430611372,
   0.01060971524566412,
   -0.0101225720718503,
   -0.08386775851249695,
   0.054102588444948196,
   -0.008147317916154861,
   0.11363304406404495,
   -0.02989031746983528,
   -0.003911081235855818,
   0.05566437169909477,
   -0.010348524898290634,
   -0.03902805224061012,
   0.02653108723461628,
   0.02769559994339943,
   -0.0019249316537752748,
   -0.0035193057265132666,
   -0.03921325132250786,
   0.010268784128129482,
   0.0024343577679246664,
   0.00842150580137968,
   0.12012021243572235,
   0.024217667058110237,
 

----

In [86]:
query = "Who was Robert Boulter?"

results = chroma_collection.query(query_texts=[query], n_results=2)
retrieved_documents = results['documents'][0]

print(f"Query: {query}")

print(f"\nRetrieved {len(retrieved_documents)} documents\n")

for docs in retrieved_documents:
    print(word_wrap(docs))


Query: Who was Robert Boulter?

Retrieved 2 documents

= robert boulter =
robert boulter is an english film, television and theatre actor. he had
a guest @ - @ starring role on the television series the bill in 2000.
this was followed by a starring role in the play herons written by
simon stephens, which was performed in 2001 at the royal court theatre.
he had a guest role in the television series judge john deed in 2002


In [87]:
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

dspy.settings.configure(lm=turbo)

In [88]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Explain with words between 1 and 5 words")

In [89]:
# Modifying the default RAG module because it doesn't work with the SentenceTransformerEmbeddingFunction
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        self.chroma_collection = chroma_client.get_collection("test")
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.num_passages = num_passages
    
    def forward(self, question):
        context = self.chroma_collection.query(query_texts=[question], n_results=self.num_passages)
        context = context['documents']
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [90]:
rag = RAG(num_passages=3)

In [91]:
question = "Who was Robert Boulter?"
rag(question)

Prediction(
    context=[['= robert boulter =', 'robert boulter is an english film, television and theatre actor. he had a guest @ - @ starring role on the television series the bill in 2000. this was followed by a starring role in the play herons written by simon stephens, which was performed in 2001 at the royal court theatre. he had a guest role in the television series judge john deed in 2002', 'in 2000 boulter had a guest @ - @ starring role on the television series the bill ; he portrayed " scott parry " in the episode, " in safe hands ". boulter starred as " scott " in the play herons written by simon stephens, which was performed in 2001 at the royal court theatre. a review of boulter\'s performance in the independent on sunday described him as " horribly menacing " in the role, and he received critical reviews in the herald, and evening standard']],
    answer='English actor.'
)

In [94]:
turbo.inspect_history(n=1)





Answer questions with short factoid answers.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: Explain with words between 1 and 5 words

---

Context: «['= robert boulter =', 'robert boulter is an english film, television and theatre actor. he had a guest @ - @ starring role on the television series the bill in 2000. this was followed by a starring role in the play herons written by simon stephens, which was performed in 2001 at the royal court theatre. he had a guest role in the television series judge john deed in 2002', 'in 2000 boulter had a guest @ - @ starring role on the television series the bill ; he portrayed " scott parry " in the episode, " in safe hands ". boulter starred as " scott " in the play herons written by simon stephens, which was performed in 2001 at the royal court theatre. a review of boulter\'s performance in the independent 

----

### Using the modified ChromaDBRM

In [96]:
from src import chromadb_rm

In [97]:
chroma_rm = chromadb_rm.ChromadbRM(collection_name="test", persist_directory="chroma.db", local_embed_model="sentence-transformers/paraphrase-MiniLM-L6-v2",
                                   openai_api_key=os.environ["OPENAI_API_KEY"])

Collection Count: 3795


In [98]:
dspy.settings.configure(lm=turbo, rm=chroma_rm)

In [99]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [100]:
rag = RAG(num_passages=3)
question = "Who was Robert Boulter?"
rag(question)

Prediction(
    context=['= robert boulter =', 'robert boulter is an english film, television and theatre actor. he had a guest @ - @ starring role on the television series the bill in 2000. this was followed by a starring role in the play herons written by simon stephens, which was performed in 2001 at the royal court theatre. he had a guest role in the television series judge john deed in 2002', 'in 2006 boulter starred in the play citizenship written by mark ravenhill. the play was part of a series which featured different playwrights, titled burn / chatroom / citizenship. in a 2006 interview, fellow actor ben whishaw identified boulter as one of his favorite co @ - @ stars : " i loved working with a guy called robert boulter, who was in the triple bill of burn, chatroom and citizenship at the national. he played my brother in mercury fur'],
    answer='English actor'
)

In [62]:
from dspy.datasets import HotPotQA

# Load the dataset.
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

len(trainset), len(devset)

  table = cls._concat_blocks(blocks, axis=0)


(20, 50)

In [63]:
from dspy.teleprompt import BootstrapFewShot

# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

100%|██████████| 20/20 [00:07<00:00,  2.85it/s]

Bootstrapped 0 full traces after 20 examples in round 0.





In [64]:
# Ask any question you like to this simple RAG program.
my_question = "Who was Robert Boulter?"

# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(my_question)

# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: Who was Robert Boulter?
Predicted Answer: English actor
Retrieved Contexts (truncated): ['= robert boulter =...', 'robert boulter is an english film, television and theatre actor. he had a guest @ - @ starring role on the television series the bill in 2000. this was followed by a starring role in the play herons w...', 'in 2006 boulter starred in the play citizenship written by mark ravenhill. the play was part of a series which featured different playwrights, titled burn / chatroom / citizenship. in a 2006 interview...']


In [65]:
turbo.inspect_history(n=1)





Answer questions with short factoid answers.

---

Question: Which magazine has published articles by Scott Shaw, Tae Kwon Do Times or Southwest Art?
Answer: Tae Kwon Do Times

Question: This American guitarist best known for her work with the Iron Maidens is an ancestor of a composer who was known as what?
Answer: The Waltz King

Question: On the coast of what ocean is the birthplace of Diogal Sakho?
Answer: Atlantic

Question: The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?
Answer: 1950

Question: The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas founded in what year?
Answer: 2010

Question: Which company distributed this 1977 American animated film produced by Walt Disney Productions for which Sherman Brothers wrote songs?
Answer: Buena Vista Distribution

Question: Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field

In [66]:
for name, parameter in compiled_rag.named_predictors():
    print(name)
    print(parameter.demos[0])
    print()

generate_answer
Example({'question': 'Which magazine has published articles by Scott Shaw, Tae Kwon Do Times or Southwest Art?', 'answer': 'Tae Kwon Do Times'}) (input_keys={'question'})



----