# Learning Objectives
- Build LLM applications for retrieval-augmented generation tasks.
- Evaluate RAG applications for groundedness and revelance

# Setup

In [None]:
%pip install -q openai tiktoken pypdf langchain langchain-community chromadb sentence-transformers

In [1]:
import json, tiktoken, pandas as pd, os, dotenv

from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

dotenv.load_dotenv()

True

In [2]:
groq_api_key = os.environ['GROQ_API_KEY']
client = Groq(api_key=groq_api_key)

model_name = 'openai/gpt-oss-20b' # deployment name
embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

  embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
  from .autonotebook import tqdm as notebook_tqdm


# Load the Vector Database

Since we persisted the database to a Google Drive location, we can download the database to the instance using its unique id like so:

In [3]:
#get tesla_db.zip
#!gdown 1hWbAWhJr5xsl0sAvvEq9Wpo8ItCdZpdq

Now that the database is downloaded onto the Colab instance, we can unzip it and attach a retriever.

In [4]:
# !unzip ./db_vector_data.zip

In practise, the database is maintained as a separate entity and CRUD operations are managed just as one would for normal databases (e.g., relational databases).

In [5]:
data_collection = 'data_collection'
vectorstore_persisted = Chroma(
    collection_name=data_collection, 
    persist_directory='/tesla_db', 
    embedding_function=embedding_model
)
retriever = vectorstore_persisted.as_retriever(search_type='similarity', search_kwargs={'k': 5})

  vectorstore_persisted = Chroma(


InternalError: Permission denied (os error 13)

## RAG Q&A

### Prompt Design

In [None]:
qna_system_message = """
You are an assistant to a financial services firm who answers user queries on annual reports.
User input will have the context required by you to answer user questions.
This context will begin with the token: ###Context.
The context contains references to specific portions of a document relevant to the user query.

User questions will begin with the token: ###Question.

Please answer user questions only using the context provided in the input.
Do not mention anything about the context in your final answer. Your response should only contain the answer to the question.

If the answer is not found in the context, respond "I don't know".
"""

qna_user_message_template = """
###Context
Here are some documents that are relevant to the question mentioned below.
{context}

###Question
{question}
"""

### Retrieving relevant documents

In [None]:
user_input = "What was the annual revenue of the company in 2022?"

relevant_document_chunks = retriever.get_relevant_documents(user_input)
len(relevant_document_chunks)

for idx, document in enumerate(relevant_document_chunks):
    print(document.page_content.replace("\t", " "))
    if idx >= 5:
        break


### Composing the response

In [None]:
user_input = "What was the annual revenue of the company in 2022?"


relevant_document_chunks = retriever.get_relevant_documents(user_input)
context_list = [d.page_content for d in relevant_document_chunks]
context_for_query = ". ".join(context_list)

prompt = [
    {'role':'system', 'content': qna_system_message}, 
    {'role': 'user', 'content': qna_user_message_template.format(context=context_for_query, question=user_input)}
]

try:
    response = client.chat.completions.create(model=model_name, messages=prompt, temperature=0)
    prediction = response.choices[0].message.content.strip()
except Exception as e:
    prediction = f'Sorry, I encountered the following error: \n {e}'

print(prediction)

## Evaluation

Let us now use the LLM-as-a-judge method to check the quality of the RAG system on two parameters - retrieval and generation. We illustrate this evaluation based on the answeres generated to the question from the previous section.

To save cost, we are using GPT 3.5 itself as the judge, the ideal choice would have been GPT 4 (note that this will impact the quality of the evaluation).

In [None]:
rater_model = 'gpt-35-turbo' # 'gpt-4'

groundedness_rater_system_message = """
You are tasked with rating AI generated answers to questions posed by users.
You will be presented a question, context used by the AI system to generate the answer and an AI generated answer to the question.
In the input, the question will begin with ###Question, the context will begin with ###Context while the AI generated answer will begin with ###Answer.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the answer.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
The answer should be derived only from the information presented in the context

Instructions:
1. First write down the steps that are needed to evaluate the answer as per the metric.
2. Give a step-by-step explanation if the answer adheres to the metric considering the question and context as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the answer using the evaluaton criteria and assign a score.
"""

relevance_rater_system_message = """
You are tasked with rating AI generated answers to questions posed by users.
You will be presented a question, context used by the AI system to generate the answer and an AI generated answer to the question.
In the input, the question will begin with ###Question, the context will begin with ###Context while the AI generated answer will begin with ###Answer.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the answer.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
Relevance measures how well the answer addresses the main aspects of the question, based on the context.
Consider whether all and only the important aspects are contained in the answer when evaluating relevance.

Instructions:
1. First write down the steps that are needed to evaluate the context as per the metric.
2. Give a step-by-step explanation if the context adheres to the metric considering the question as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the context using the evaluaton criteria and assign a score.
"""

user_message_template = """
###Question
{question}

###Context
{context}

###Answer
{answer}
"""

In [None]:
user_input = "What was the annual revenue of the company in 2022?"


relevant_document_chunks = retriever.get_relevant_documents(user_input)
context_list = [d.page_content for d in relevant_document_chunks]
context_for_query = ". ".join(context_list)

In [None]:
prompt = [
    {'role':'system', 'content': qna_system_message},
    {'role': 'user', 'content': qna_user_message_template.format(context=context_for_query, question=user_input)}
]

response = client.chat.completions.create(model=model_name, messages=prompt, temperature=0)
answer = response.choices[0].message.content.strip()
print(answer)

In [None]:
groundedness_prompt = [
    {'role':'system', 'content': groundedness_rater_system_message},
    {'role': 'user', 'content': user_message_template.format(question=user_input, context=context_for_query, answer=answer)}
]

response = client.chat.completions.create(model=rater_model, messages=groundedness_prompt, temperature=0)
print(response.choices[0].message.content)

In [None]:
relevance_prompt = [
    {'role':'system', 'content': relevance_rater_system_message},
    {'role': 'user', 'content': user_message_template.format(question=user_input, context=context_for_query, answer=answer)}
]

response = client.chat.completions.create(model=rater_model, messages=relevance_prompt, temperature=0)
print(response.choices[0].message.content)