# Implementação do RAGAS

- Implementar o RAGAS com o LLaMA-3 70B para avaliar a qualidade das 50 anotações do IIRC usadas no exercício passado.
- O RAGAS considera as chaves context, question e answer, que estão disponíveis no conjunto de teste do IIRC.
- Opcional:
    - Avaliar as respostas do exercício da aula 9_10
    - Usar multi agents

In [None]:
%pip install --quiet groq nltk

In [None]:
import os
from groq import Groq
import json
from sentence_transformers import SentenceTransformer, util
import tqdm

# The API key is read from the environment so that no secret is committed with
# the notebook; export GROQ_API_KEY before running. The value is None when the
# variable is unset.
# NOTE(review): a literal key used to be stored on this line — it should be
# revoked, since it has been exposed in the file history.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
MODEL_NAME = "llama3-70b-8192"
EMBEDDINGS_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# Number of dataset items previewed/evaluated by the cells of this notebook.
NUM_QUESTIONS = 50


def groq_completion(client,
                    messages,
                    model=MODEL_NAME,
                    max_tokens=1024):
    """Request a chat completion and return the text of its first choice.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the notebook
            passes a ``Groq`` client).
        messages: Chat messages, forwarded unchanged to the API call.
        model: Model identifier, forwarded unchanged; defaults to MODEL_NAME.
        max_tokens: Forwarded unchanged as the ``max_tokens`` argument.

    Returns:
        ``choices[0].message.content`` of the completion object.
    """
    completion = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        messages=messages)
    return completion.choices[0].message.content

# Groq API client, built with the key held in GROQ_API_KEY.
client = Groq(api_key=GROQ_API_KEY)
# Sentence-embedding model; the answer-relevance cells encode questions with it
# and compare the embeddings by cosine similarity.
embeddings_model = SentenceTransformer(EMBEDDINGS_MODEL_NAME)

## Dataset

### Loading and testing

In [None]:
# Load the annotated test questions. The file is decoded explicitly as UTF-8 so
# the result does not depend on the platform's default encoding.
with open('test_questions.json', 'r', encoding='utf-8') as file:
    test_questions = json.load(file)

# Preview the first NUM_QUESTIONS items: question, gold answer and context.
for i, item in enumerate(test_questions[:NUM_QUESTIONS]):
    print("Question:", item['question'])

    # The answer is stored under a different key depending on its type.
    if item['answer']['type'] == 'span':
        print("Answer:", item['answer']['answer_spans'][0]['text'])
    elif item['answer']['type'] in ['value', 'binary']:
        print("Answer:", item['answer']['answer_value'])

    # Join the context passages into one text, each followed by a period.
    context = '. '.join(context_item['text'] for context_item in item['context']) + '.'
    print(f"Context {i}: {context}")
    print("\n")

### Dataset

In [None]:
# Implementation based on the one by student Otávio Cury Pontes

from nltk.tokenize import sent_tokenize

# Parallel lists: index i of each list refers to the same dataset item.
all_questions = []
all_answers = []
all_contexts = []

for question in test_questions:
    answer_info = question['answer']
    answer_type = answer_info['type']

    # Render the gold answer as text according to its type.
    if answer_type == 'span':
        answer = answer_info['answer_spans'][0]['text']
    elif answer_type == 'value':
        answer = answer_info['answer_value'] + " " + answer_info['answer_unit'] + '.'
    elif answer_type == 'binary':
        answer = answer_info['answer_value'] + '.'
    else:
        # Any other answer type has no answer text. Skip the whole item so the
        # three lists stay index-aligned (appending only the question and the
        # context would shift every later answer by one position).
        continue

    all_questions.append(question['question'])
    all_answers.append(answer)
    # Concatenate the context passages, each followed by a single space.
    all_contexts.append(''.join(item['text'] + " " for item in question['context']))

# sent_tokenize needs NLTK tokenizer data that is not part of the pip package.
# Probe once and download the data only when it is missing.
# NOTE(review): which resource is required ('punkt' or 'punkt_tab') depends on
# the installed NLTK version, so both are requested — confirm for your version.
try:
    sent_tokenize("Probe sentence.")
except LookupError:
    import nltk
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource, quiet=True)

# Look only at the items that are evaluated, and never index past the data.
num_evaluated = min(NUM_QUESTIONS, len(all_answers), len(all_contexts))

# Largest sentence count among the evaluated answers / contexts (-1 when there
# is nothing to evaluate). Each maximum is tracked against its own values; the
# answer maximum was previously compared against the context maximum.
max_sent_answer = max(
    (len(sent_tokenize(answer_text)) for answer_text in all_answers[:num_evaluated]),
    default=-1)
max_sent_context = max(
    (len(sent_tokenize(context_text)) for context_text in all_contexts[:num_evaluated]),
    default=-1)

# Sanity check: the three lists must have the same length.
print(f"{len(all_questions)}, {len(all_answers)}, {len(all_contexts)}")

## RAGAS evaluation

### Faithfulness



In [None]:
# System prompt for the faithfulness check: the model explains, per statement,
# whether the context supports it and answers with a JSON list of verdicts.
faithfulness_system_prompt = """
Consider the given context and following statements, then determine whether they are supported by the information present in the context.

Provide a brief explanation for each statement before arriving at the verdict (Yes/No). The veredict should be a 'string' Yes or No

Provide a final veredict for each statement in order at the end in the JSON format.
Do not deviate from the specified format!

[
    {
    "statement": ,
    "explanation": ,
    "veredict":
    },
    {
    "statement": ,
    "explanation": ,
    "veredict":
    },
]

The final answer should only be the required JSON and nothing else.
"""

# User prompt template; fill it with .format(context=..., statement=...).
faithfulness_user_prompt = "\n".join([
    "",
    "Context: {context}",
    "{statement}",
    "",
])


In [None]:
# Single-example run of the faithfulness prompt on the first dataset item.
# The statement is checked against the item's *context* (the question text was
# passed here before). The filled prompt is stored under its own name so that
# the reusable template `faithfulness_user_prompt` keeps its placeholders.
faithfulness_example_prompt = faithfulness_user_prompt.format(
    context=all_contexts[0],
    statement=all_answers[0])

FAITHFULNESS_MESSAGE = [
    {"role": "system", "content": faithfulness_system_prompt},
    {"role": "user", "content": faithfulness_example_prompt},
]

response = groq_completion(client,
                           FAITHFULNESS_MESSAGE,
                           model=MODEL_NAME,
                           max_tokens=1024)
print(response)

[
    {
    "statement": "Zeus is known for being a sky and thunder god in Greek mythology.",
    "explanation": "The context directly states that Zeus is known as the sky and thunder god, making this statement true.",
    "veredict": "Yes"
    }
]


### Answer Relevance

In [None]:
# User prompt template; fill it with .format(answer=...).
answer_relevance_user_prompt = "\n".join([
    "",
    "Answer: {answer}",
    "generated question:",
    "",
])

# System prompt for answer relevance: the model writes a question for the
# given answer and replies with a JSON object.
answer_relevance_system_prompt = """
Generate a question for the given answer.
Provide your response in a JSON format with the following schema:

{
"answer": [answer],
"generated question": [question]
}

The final answer should only be the required JSON and nothing else.
"""

In [None]:
# Single-example run of answer relevance on the first dataset item.
# The filled prompt gets its own name: assigning it back to
# `answer_relevance_user_prompt` would erase the {answer} placeholder, so any
# later .format() call on the template would silently reuse this first answer.
answer_relevance_example_prompt = answer_relevance_user_prompt.format(answer=all_answers[0])

ANSWER_RELEVANCE_MESSAGE = [
    {"role": "system", "content": answer_relevance_system_prompt},
    {"role": "user", "content": answer_relevance_example_prompt},
]

response = groq_completion(client,
                           ANSWER_RELEVANCE_MESSAGE,
                           model=MODEL_NAME,
                           max_tokens=1024)
print(response)

# The model is instructed to answer with JSON only; json.loads raises if it
# does not.
json_response = json.loads(response)

# Compare the generated question with the annotated one via the cosine
# similarity of their sentence embeddings.
generated_question = embeddings_model.encode(json_response['generated question'], convert_to_tensor=True)
question = embeddings_model.encode(all_questions[0], convert_to_tensor=True)

similarity = util.pytorch_cos_sim(generated_question, question)

print(f"""
Calculated similarity score: {similarity.item()}
""")

In [None]:
import time

# The file does `import tqdm`, which binds the module (not callable); import
# the progress-bar callable itself.
from tqdm import tqdm

# Attempts per question before it is skipped.
MAX_RETRIES = 10

AR = 0.0
# Evaluate at most NUM_QUESTIONS items and never index past the data.
N = min(NUM_QUESTIONS, len(all_questions), len(all_answers))
# Questions that actually produced a similarity score.
scored = 0

for i in tqdm(range(N)):
    # Build the user prompt for THIS answer from scratch. Re-formatting the
    # shared template in place loses the {answer} placeholder after its first
    # use, which made every request ask about the same answer.
    user_prompt = f"\nAnswer: {all_answers[i]}\ngenerated question:\n"

    ANSWER_RELEVANCE_MESSAGE = [
        {"role": "system", "content": answer_relevance_system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    for _ in range(MAX_RETRIES):
        try:
            response = json.loads(groq_completion(client,
                                                  ANSWER_RELEVANCE_MESSAGE,
                                                  model=MODEL_NAME,
                                                  max_tokens=1024))
            generated_question = embeddings_model.encode(response['generated question'], convert_to_tensor=True)
            question = embeddings_model.encode(all_questions[i], convert_to_tensor=True)
            similarity = util.pytorch_cos_sim(generated_question, question)
            score = similarity.item()
        except Exception as e:
            # Retry boundary: API errors and malformed/unexpected JSON replies
            # are both reported and retried.
            print(e)
            continue
        finally:
            # Pause after every API call, failed ones included, so retries do
            # not hit the service back to back.
            time.sleep(2)

        AR += score
        scored += 1
        break
    else:
        print(f"Question {i}: no usable response after {MAX_RETRIES} attempts; skipped.")

# Average over the questions that were actually scored (dividing by the size of
# the whole dataset deflated the score); 0.0 when nothing could be scored.
AR = (AR / scored) * 100 if scored else 0.0

print(f"\nAnswer Relevance score = {AR:.2f}%")

100%|██████████| 50/50 [02:01<00:00,  2.43s/it]


Answer Relevance score = 11.28%





### Context Relevance

In [None]:
# System prompt for context relevance: the model copies the context sentences
# that help answer the question and reports how many sentences the context has.
# The schema examples are kept comma-complete, and the "nothing relevant" case
# uses an empty list, so the reply the model imitates can be parsed as JSON.
context_relevance_system_prompt = """
Please extract relevant sentences from the provided context that can potentially help answer the following question.
Provide your response in a valid JSON format with the following schema:

{
"question": [question],
"context_size": [value],
"sentences": [
        sentence1,
        sentence2,
        ...
    ]
}

The final answer should only be the required JSON and nothing else.

Constraints:

1. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the same JSON schema with an empty "sentences" list, such as below:

{
"question": [question],
"context_size": [value],
"sentences": []
}

2. While extracting DO NOT, IN ANY CIRCUMSTANCE, make any changes to sentences from given context.

3. The "context_size" property should contain the number of sentences of the given context.
"""

# User prompt template; fill it with .format(question=..., context=...).
context_relevance_user_prompt = """
Question: {question}
Context:  {context}
"""

In [None]:
# Single-example run of context relevance on the first dataset item.
# The filled prompt gets its own name so the reusable template
# `context_relevance_user_prompt` keeps its {question}/{context} placeholders.
context_relevance_example_prompt = context_relevance_user_prompt.format(
    question=all_questions[0],
    context=all_contexts[0])

CONTEXT_RELEVANCE_MESSAGE = [
    {"role": "system", "content": context_relevance_system_prompt},
    {"role": "user", "content": context_relevance_example_prompt},
]

response = groq_completion(client,
                           CONTEXT_RELEVANCE_MESSAGE,
                           model=MODEL_NAME,
                           max_tokens=1024)
print(response)

# The model is instructed to answer with JSON only; json.loads raises if it
# does not.
response = json.loads(response)

# Sentence count of the context as reported by the model, and the number of
# sentences it extracted (a missing or null "sentences" counts as none).
context_sentences = response['context_size']
extracted_sentences = len(response.get('sentences') or [])

# Share of the context that was extracted; 0.0 when the reported context size
# is zero or missing, instead of raising on the division.
Cr = extracted_sentences / context_sentences if context_sentences else 0.0

print(f"""
Calculated relevance: {Cr}
""")

In [None]:
import time

# The file does `import tqdm`, which binds the module (not callable); import
# the progress-bar callable itself.
from tqdm import tqdm

# Attempts per question before it is skipped.
MAX_RETRIES = 10

extracted_sentences = 0.0
context_sentences = 0.0
# Sentence total from the local tokenizer; accumulated as a cross-check of the
# model-reported total, not used in the score.
test = 0.0

# Evaluate at most NUM_QUESTIONS items and never index past the data.
num_evaluated = min(NUM_QUESTIONS, len(all_questions), len(all_contexts))

for i in tqdm(range(num_evaluated)):
    # Build the user prompt for THIS item from scratch. Re-formatting the
    # shared template in place loses its placeholders after the first use,
    # which made every request send the same question and context.
    user_prompt = f"\nQuestion: {all_questions[i]}\nContext:  {all_contexts[i]}\n"

    CONTEXT_RELEVANCE_MESSAGE = [
        {"role": "system", "content": context_relevance_system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    for _ in range(MAX_RETRIES):
        try:
            response = json.loads(groq_completion(client,
                                                  CONTEXT_RELEVANCE_MESSAGE,
                                                  model=MODEL_NAME,
                                                  max_tokens=1024))
            # Validate both values before touching the running totals, so a
            # reply that fails half-way is not partially counted and then
            # counted again on the retry.
            reported_size = int(response['context_size'])
            num_extracted = len(response.get('sentences') or [])
        except Exception as e:
            # Retry boundary: API errors and malformed/unexpected JSON replies
            # are both reported and retried.
            print(e)
            continue
        finally:
            # Pause after every API call, failed ones included, so retries do
            # not hit the service back to back.
            time.sleep(2)

        context_sentences += reported_size
        extracted_sentences += num_extracted
        test += len(sent_tokenize(all_contexts[i]))
        break
    else:
        print(f"Question {i}: no usable response after {MAX_RETRIES} attempts; skipped.")

# Share of context sentences that were extracted; 0.0 when no context size was
# collected, instead of raising on the division.
CR = (extracted_sentences / context_sentences) * 100 if context_sentences else 0.0

print(f"\nContext Relevance score = {CR:.2f}%")

100%|██████████| 50/50 [02:08<00:00,  2.56s/it]


Context Relevance score = 98.04%



