# **Evaluate our RAG Model (Llama3.2)**

### Retrieve data from our vector database

In [None]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import chromadb


# Relative path of the on-disk Chroma database that holds the indexed documents.
db_path="../Vektordatenbanken/vektor_DB"
client = chromadb.PersistentClient(path=db_path)

# Sentence-embedding model used to encode each query before searching the collection.
# NOTE(review): presumably the same model that was used to build the collection — confirm.
embedding_model = SentenceTransformer("sentence-transformers/gtr-t5-large")

# Look up the "meinungen" (German: "opinions") collection in that database.
collection = client.get_collection("meinungen")

def query_collection(query_text, n_results=4):
    """Embed ``query_text`` and search the collection for its closest entries.

    Args:
        query_text: The text to search for.
        n_results: How many matches to request (defaults to 4).

    Returns:
        Whatever ``collection.query`` returns for the request; documents,
        metadatas and distances are asked for explicitly.
    """
    search_kwargs = {
        # The query API takes a batch of embeddings, hence the one-element list.
        "query_embeddings": [embedding_model.encode(query_text)],
        "n_results": n_results,
        "include": ["documents", "metadatas", "distances"],
    }
    return collection.query(**search_kwargs)

### Initialize the Llama-3B pipeline

In [None]:
from transformers import pipeline

# Text-generation pipeline that turns the constructed prompt into the final answer.
rag_pipeline = pipeline(
    "text-generation",
    model="NousResearch/Hermes-3-Llama-3.2-3B",
    device_map="auto",  # Automatic device placement: use the GPU when one is available, otherwise fall back to the CPU
    torch_dtype="auto" # Let the library pick the tensor data type instead of hard-coding one
)


### Refine the prompt and check whether the question mentions one of the listed parties

In [None]:
def imp_clean_prompt(query_text, results):
    """Build the LLM prompt from the user's question and the retrieved documents.

    Retrieved documents are grouped by the ``party`` value in their metadata.
    If the question names one or more of the known parties, only documents of
    those parties are placed in the prompt; otherwise all documents are used,
    grouped by party in sorted order.

    Args:
        query_text: The user's question.
        results: Query result with parallel ``results["documents"][0]`` and
            ``results["metadatas"][0]`` lists; every metadata entry must have
            a ``"party"`` key.

    Returns:
        The prompt string, or ``None`` when there is nothing to put into the
        context: either no documents were retrieved, or a party was named in
        the question but none of the retrieved documents belong to it. Callers
        treat ``None`` as "no answer available".

    Raises:
        KeyError: If a metadata entry has no ``"party"`` key.
    """
    if not results["documents"] or not results["documents"][0]:
        return None

    # List of parties to check
    parties = ["AFD", "Die Linke", "FreieWahler", "SPD", "FDP", "Grünen", "CDU CSU"]

    query_lower = query_text.lower()
    mentioned_parties = [party for party in parties if party.lower() in query_lower]

    # Group the documents by party. "Document N" keeps the retrieval rank, so
    # numbering stays stable even when some documents are filtered out below.
    party_docs = {}
    for i, doc in enumerate(results["documents"][0]):
        party = results["metadatas"][0][i]["party"]
        party_docs.setdefault(party, []).append(f"Document {i + 1}:\n{doc}")

    context_parts = []
    if mentioned_parties:
        # Parties are detected in the question case-insensitively, so match
        # them against the stored metadata the same way; an exact-case lookup
        # would silently drop every document if the database spells a party
        # differently (e.g. "AfD" vs "AFD").
        for party in mentioned_parties:
            wanted = party.lower()
            for stored_name, docs in party_docs.items():
                if str(stored_name).lower() == wanted:
                    context_parts.append(f"Party: {stored_name}")
                    context_parts.extend(docs)
    else:
        # No party named in the question: include everything
        for party, docs in sorted(party_docs.items()):
            context_parts.append(f"Party: {party}")
            context_parts.extend(docs)

    # A party was asked about but nothing retrieved belongs to it. Sending an
    # empty "Retrieved Data" section would invite a made-up answer, so report
    # "no context" exactly like the no-documents case above.
    if not context_parts:
        return None

    context = "\n\n".join(context_parts)

    # The prompt text is flush-left on purpose: it is sent to the model verbatim.
    prompt = f"""
<|system|>

You are an expert political assistant tasked with answering questions about politics, political parties, and their opinions. Your answers should be accurate and concise, based solely on the information provided.

Guidelines:
1. If you do not know the answer or if the information is irrelevant to the question, respond with "I don't know." Do not fabricate or infer answers.
2. Always respond in the same language as the user's question.
3. Format your responses clearly and professionally (e.g., using bullet points or numbered lists if appropriate).

You will be provided with context retrieved from a database along with the user's question. Use this context to provide the best possible answer.

<|end|>

<|user|>

Retrieved Data:
{context}

User Question: {query_text}

<|end|>

Your Answer:
"""
    return prompt


## Main function that sends the vector database's output to our RAG model

In [None]:
def query_with_llama(query_text, max_new_tokens=270, max_rounds=8):
    """Answer ``query_text`` with the RAG pipeline.

    Retrieves matching documents, builds the prompt and lets the model
    generate in rounds of at most ``max_new_tokens`` tokens, feeding the text
    produced so far back in until the answer looks finished.

    Args:
        query_text: The user's question.
        max_new_tokens: Token budget for each generation round.
        max_rounds: Upper bound on the number of generation rounds, so a model
            that never produces a stop condition cannot loop forever.

    Returns:
        The generated answer with surrounding whitespace removed, or
        ``"I do not know."`` when no prompt could be built.
    """
    # call the Retrieve function
    results = query_collection(query_text)

    # call the new prompt Constructor
    prompt = imp_clean_prompt(query_text, results)
    if not prompt:
        return "I do not know."

    stop_signal = "\n\n"
    generated_text = ""

    for _ in range(max_rounds):
        # return_full_text=False makes the pipeline return only the newly
        # generated continuation, so the prompt does not have to be sliced off.
        outputs = rag_pipeline(
            prompt + generated_text,
            max_new_tokens=max_new_tokens,
            return_full_text=False,
        )
        new_text = outputs[0]["generated_text"]

        # Only the very first chunk loses its leading whitespace. Stripping
        # later chunks would glue words together at the chunk boundary
        # ("supports" + " higher" -> "supportshigher").
        if not generated_text:
            new_text = new_text.lstrip()
        generated_text += new_text

        # Stop on a blank line, or when the chunk is short. The length check
        # is a heuristic: it compares characters against the token budget, so
        # it only fires when the model clearly ended well before the limit.
        if stop_signal in new_text or len(new_text) < max_new_tokens:
            break

    # Extract the final answer
    return generated_text.strip()

## Because it is a generative model, I evaluated it using semantic similarity

In [None]:
from sentence_transformers import SentenceTransformer, util

# Embedding model used to score how close a generated answer is to the expected one.
# NOTE(review): this is the same checkpoint name as `embedding_model` used for retrieval;
# reusing that instance would avoid a second load — confirm both cells always run together.
similarity_model = SentenceTransformer("sentence-transformers/gtr-t5-large")

def calc_similarity(expected, generated):
    """Return the cosine similarity of two texts as a plain Python number.

    Both texts are encoded in a single batch with ``similarity_model`` and the
    two resulting embeddings are compared with ``util.cos_sim``.
    """
    expected_vec, generated_vec = similarity_model.encode([expected, generated])
    score = util.cos_sim(expected_vec, generated_vec)
    return score.item()

def eval_semantics(evaluation_data, threshold=0.7):
    """Score the RAG model on a set of query / expected-answer pairs.

    Each query is answered with ``query_with_llama`` and the answer is compared
    with the expected text via ``calc_similarity``. An answer counts as a match
    when its similarity is strictly greater than ``threshold``.

    Counting rules (``"I do not know"`` marks an example without a real answer):
        * match and a real answer was expected      -> true positive
        * match but ``"I do not know"`` was expected -> false positive
        * no match and a real answer was expected   -> false negative
        * no match and ``"I do not know"`` expected  -> not counted

    Args:
        evaluation_data: Iterable of dicts with ``"query"`` and ``"expected"``
            keys. May be empty and need not support ``len()``.
        threshold: Similarity above which an answer counts as a match.

    Returns:
        Dict with ``accuracy`` (true positives / number of examples),
        ``average_similarity``, ``precision``, ``recall`` and ``f1_score``.
        All values are 0 when there are no examples.
    """
    unknown_answer = "I do not know"
    semantic_scores = []
    tp = fp = fn = 0

    for example in evaluation_data:
        expected = example["expected"]
        generated = query_with_llama(example["query"])

        similarity = calc_similarity(expected, generated)
        semantic_scores.append(similarity)

        answer_expected = expected != unknown_answer
        if similarity > threshold:
            if answer_expected:
                tp += 1
            else:
                fp += 1
        elif answer_expected:
            fn += 1

    # Count the processed examples instead of calling len(evaluation_data) so
    # that generators work too, and guard every ratio against an empty set.
    total = len(semantic_scores)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    accuracy = tp / total if total else 0  # Fraction of correct answers
    avg_similarity = sum(semantic_scores) / total if total else 0

    return {
        "accuracy": accuracy,
        "average_similarity": avg_similarity,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
    }


In [None]:
import json

# The evaluation set is a JSON file created from our dataset; each entry pairs a
# query with the expected response, which is compared with the model's response
# by semantic similarity.
# Read it explicitly as UTF-8 (the standard JSON encoding) so the result does not
# depend on the platform's default codec.
with open("eval.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)

metrics = eval_semantics(eval_data)

print(f"Accuracy: {metrics['accuracy'] * 100:.2f}%")
print(f"Average Semantic Similarity: {metrics['average_similarity'] * 100:.2f}%")
print(f"Precision: {metrics['precision'] * 100:.2f}%")
print(f"Recall: {metrics['recall'] * 100:.2f}%")
print(f"F1 Score: {metrics['f1_score'] * 100:.2f}%")


Accuracy: 96.33%
Average Semantic Similarity: 88.25%
Precision: 100.00%
Recall: 96.33%
F1 Score: 97.95%
