# Code for Evaluating the LLM Models

In [10]:
#Imports
import time
import pandas as pd
from llm import getChatChain
from app import load_documents_into_database
from langchain_community.llms import Ollama
from langchain.evaluation import load_evaluator
from langchain_community.vectorstores import Chroma

In [11]:
# Evaluates the accuracy of an LLM model against reference answers.
def evaluate(
    llm_model_name: str,
    db: Chroma,
    inicio: "float | None" = None,
) -> None:
    """Score a model's answers against the references in ``evaluate.csv``.

    For every row of ``evaluate.csv`` (columns ``question`` and ``answer``)
    the question is sent to the chat chain returned by ``getChatChain`` and
    the reply is graded by a ``labeled_score_string`` evaluator that uses the
    same Ollama model as judge, following the 1-10 rubric defined below.
    Each successful grading appends one ``model,score,elapsed_seconds`` line
    to ``Stats.csv``.

    Args:
        llm_model_name: Ollama model name, used for both the chat chain and
            the evaluator.
        db: Chroma vector store handed to ``getChatChain``.
        inicio: Start timestamp (as returned by ``time.time()``) used to
            compute the elapsed seconds written to ``Stats.csv``. When
            omitted, the moment this function is called is used.

    Returns:
        None. Results are printed and appended to ``Stats.csv``.
    """
    # Callers may omit the start time; fall back to "now" so the elapsed
    # column is still meaningful instead of failing with a TypeError.
    if inicio is None:
        inicio = time.time()

    accuracy_criteria = {
    "accuracy": """
        Score 1: The answer is completely irrelevant or incoherent in relation to the reference.
        Score 2: The answer is mostly irrelevant, with few or no correct parts.
        Score 3: The answer has some relevance but is mostly incorrect or out of context.
        Score 4: The answer has moderate relevance but contains several significant inaccuracies.
        Score 5: The answer has moderate relevance but contains some notable inaccuracies.
        Score 6: The answer is generally correct but contains a reasonable number of minor errors or omissions.
        Score 7: The answer is mostly correct and relevant but contains some minor errors or omissions.
        Score 8: The answer is very correct and relevant, with only small inaccuracies or omissions.
        Score 9: The answer is almost entirely accurate and relevant, with only one or two small inaccuracies or omissions.
        Score 10: The answer is completely accurate and perfectly aligns with the reference, with no errors or omissions."""
    }

    evaluator = load_evaluator(
        "labeled_score_string",
        criteria=accuracy_criteria,
        llm=Ollama(model=llm_model_name),
    )

    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")
    print("\n[INFO] Evaluating model: ", llm_model_name)
    for _, row in df.iterrows():
        question = row['question']
        reference_answer = row['answer']
        model_answer = chat(question=question)
        try:
            evaluation = evaluator.evaluate_strings(
                prediction=model_answer,
                reference=reference_answer,
                input=question
            )
        except ValueError as e:
            # Skip this question but report why grading failed
            # (presumably the judge's output could not be parsed -- confirm).
            print(f"\n[EXCEPTION] {e}")
            continue

        print(evaluation)

        # Append per question so partial results survive an interrupted run.
        with open("Stats.csv", "a", encoding="utf-8") as f:
            f.write(f"{llm_model_name},{evaluation['score']},{time.time() - inicio}\n")
        print("\n[QUESTION] " + evaluation['reasoning'],evaluation['score'])

# Mistral

In [12]:
# Evaluate Mistral in terms of time, precision, and accuracy.
inicio = time.time()
db = load_documents_into_database("mistral", "../Final PDF Files", True)
# Pass the start time: evaluate() uses it to log elapsed seconds per answer.
evaluate("mistral", db, inicio)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 25/25 [00:04<00:00,  6.22it/s]


Loading .md files


100%|██████████| 1/1 [00:00<00:00, 1014.34it/s]


Creating embeddings and loading documents into Chroma


KeyboardInterrupt: 

# Llama2

In [None]:
# Evaluate Llama2 in terms of time, precision, and accuracy.
inicio = time.time()
db = load_documents_into_database("llama2", "../Final PDF Files", True)
# Pass the start time: evaluate() uses it to log elapsed seconds per answer.
evaluate("llama2", db, inicio)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 2/2 [00:00<00:00, 97.64it/s]


Loading .md files


0it [00:00, ?it/s]

Creating embeddings and loading documents into Chroma






[INFO] Evaluating model:  llama2
Based on the provided research documents, Article 236 of the Portuguese Penal Code states that "Whoever, publicly and repeatedly, incites hatred against a people, intending to trigger a war, is punished with imprisonment from 6 months to 3 years."
[EXCEPTION] 
O Modelo demorou 37.46 segundos a gerar as respostas.


# Zephyr

In [None]:
# Evaluate Zephyr in terms of time, precision, and accuracy.
inicio = time.time()
db = load_documents_into_database("zephyr", "../Final PDF Files", True)
# Pass the start time: evaluate() uses it to log elapsed seconds per answer.
evaluate("zephyr", db, inicio)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 2/2 [00:00<00:00, 103.95it/s]


Loading .md files


0it [00:00, ?it/s]

Creating embeddings and loading documents into Chroma






[INFO] Evaluating model:  zephyr
Unfortunately, based on the provided context, it is unclear which article the speaker is referring to as "the other". The given conversation only mentions Articles 2034, 2037, and 2036 indirectly, as well as Article 69-b. Without further information or context, it is impossible to determine whether the speaker is referring to Article 236 of the Portuguese Criminal Code or another article altogether. Please provide more details or clarify which article is being referred to!
[EXCEPTION] 
O Modelo demorou 65.33 segundos a gerar as respostas.
