# Code for Evaluating the LLM Models

In [1]:
#Imports
import time
import pandas as pd
from llm import getChatChain
from app import load_documents_into_database
from langchain_community.llms import Ollama
from langchain.evaluation import load_evaluator
from langchain_community.vectorstores import Chroma

In [2]:
# Scores a model's answers against reference answers using a 1-10 accuracy rubric.
def evaluate(llm_model_name: str, db: Chroma, inicio: float) -> None:
    """Grade a model's answers to the questions listed in ``evaluate.csv``.

    For every row of ``evaluate.csv`` (columns ``question`` and ``answer``) the
    chat chain built on ``db`` is asked the question, and a LangChain
    ``labeled_score_string`` evaluator scores the reply against the reference
    answer using the rubric below. Each successful evaluation appends one line
    ``<model>,<score>,<seconds since inicio>`` to ``Stats.csv``.

    Note that the judge is ``Ollama(model=llm_model_name)``, i.e. the model
    under test also grades its own answers.

    Args:
        llm_model_name: Ollama model name, used for both the chat chain and
            the judge.
        db: Vector store handed to ``getChatChain``.
        inicio: ``time.time()`` timestamp from which elapsed seconds are
            measured for the ``Stats.csv`` rows.

    Returns:
        None. Results are only printed and appended to ``Stats.csv``.
    """
    accuracy_criteria = {
    "accuracy": """
        Score 1: The answer is completely irrelevant or incoherent in relation to the reference.
        Score 2: The answer is mostly irrelevant, with few or no correct parts.
        Score 3: The answer has some relevance but is mostly incorrect or out of context.
        Score 4: The answer has moderate relevance but contains several significant inaccuracies.
        Score 5: The answer has moderate relevance but contains some notable inaccuracies.
        Score 6: The answer is generally correct but contains a reasonable number of minor errors or omissions.
        Score 7: The answer is mostly correct and relevant but contains some minor errors or omissions.
        Score 8: The answer is very correct and relevant, with only small inaccuracies or omissions.
        Score 9: The answer is almost entirely accurate and relevant, with only one or two small inaccuracies or omissions.
        Score 10: The answer is completely accurate and perfectly aligns with the reference, with no errors or omissions."""
    }

    evaluator = load_evaluator(
        "labeled_score_string",
        criteria=accuracy_criteria,
        llm=Ollama(model=llm_model_name),
    )

    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")
    print("\n[INFO] Evaluating model: ", llm_model_name)
    for question, reference_answer in zip(df["question"], df["answer"]):
        # NOTE(review): this assumes chat() returns the answer text. The recorded
        # runs show the answer being printed as well; if chat() only prints and
        # returns None, the evaluator receives no prediction - confirm in llm.py.
        model_answer = chat(question=question)
        try:
            evaluation = evaluator.evaluate_strings(
                prediction=model_answer,
                reference=reference_answer,
                input=question
            )
        except ValueError as e:
            # Presumably raised when the judge's reply cannot be parsed into a
            # score (TODO confirm). Report the reason instead of hiding it, then
            # move on to the next question without writing a row.
            print(f"\n[EXCEPTION] {e}")
            continue

        print(evaluation)

        # Append per question so partial results survive an interrupted run.
        with open("Stats.csv", "a") as f:
            f.write(f"{llm_model_name},{evaluation['score']},{time.time() - inicio}\n")
        print("\n[QUESTION] " + evaluation['reasoning'],evaluation['score'])

# Mistral

In [3]:
# Evaluate Mistral and report the wall-clock time of the whole cell.
# The timer starts before the documents are loaded, so the reported duration
# covers database construction as well as the evaluation itself.
inicio = time.time()
db = load_documents_into_database(
    "mistral",
    "nomic-embed-text",
    "../Final PDF Files",
    True,  # NOTE(review): meaning of this positional flag is defined in app.py - confirm
)
evaluate("mistral", db, inicio)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 25/25 [00:04<00:00,  6.07it/s]


Loading .md files


100%|██████████| 1/1 [00:00<00:00, 1156.09it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  mistral
 The human chest has three parts: upper, middle, and lower. (Sources: Training\_Muscles.pdf)
[EXCEPTION] 
O Modelo demorou 46.08 segundos a gerar as respostas.


# Llama2

In [4]:
# Evaluate Llama2 and report the wall-clock time of the whole cell.
# The timer starts before the documents are loaded, so the reported duration
# covers database construction as well as the evaluation itself.
inicio = time.time()
db = load_documents_into_database(
    "llama2",
    "nomic-embed-text",
    "../Final PDF Files",
    True,  # NOTE(review): meaning of this positional flag is defined in app.py - confirm
)
evaluate("llama2", db, inicio)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 25/25 [00:04<00:00,  6.20it/s]


Loading .md files


100%|██████████| 1/1 [00:00<00:00, 1074.09it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  llama2
Based on the provided research documents, the answer to the question "How many parts are there in the human chest?" is:

There are 4 parts to the human chest:

1. Pectoralis major muscle
2. Pectoralis minor muscle
3. Ribcage (made up of 12 ribs)
4. Sternum (or breastbone)
[EXCEPTION] 
O Modelo demorou 59.76 segundos a gerar as respostas.


# Zephyr

In [5]:
# Evaluate Zephyr and report the wall-clock time of the whole cell.
# The timer starts before the documents are loaded, so the reported duration
# covers database construction as well as the evaluation itself.
inicio = time.time()
db = load_documents_into_database(
    "zephyr",
    "nomic-embed-text",
    "../Final PDF Files",
    True,  # NOTE(review): meaning of this positional flag is defined in app.py - confirm
)
evaluate("zephyr", db, inicio)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 25/25 [00:03<00:00,  6.28it/s]


Loading .md files


100%|██████████| 1/1 [00:00<00:00, 938.95it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  zephyr
The human chest is composed of three parts: upper, middle, and lower, according to the provided research document "Training_Muscles.pdf". The middle and lower portions make up 80% of the chest mass, so it makes sense to focus on working these parts with more sets in flat bench presses/flyes than incline ones for optimal training.
[EXCEPTION] 
O Modelo demorou 59.01 segundos a gerar as respostas.
