# Code for Evaluating the LLM Models

In [8]:
#Imports
import time
import pandas as pd
from llm import getChatChain
from app import load_documents_into_database
from langchain_community.llms import Ollama
from langchain.evaluation import load_evaluator
from langchain_community.vectorstores import Chroma
from langchain.evaluation import Criteria
import re, json
from typing import Dict, Any

In [9]:
def evaluate_mistral(llm_model_name: str, db: Chroma) -> None:
    """Grade a model's answers to the questions in ``evaluate.csv``.

    Every row of ``evaluate.csv`` (columns ``question`` and ``answer``) is sent
    to the chat chain, the reply is graded by a ``labeled_score_string``
    evaluator backed by an Ollama model of the same name, and one
    ``model,score,seconds`` line is appended to ``Stats.csv``.

    Args:
        llm_model_name: Ollama model name used for answering and for grading.
        db: Chroma vector store handed to ``getChatChain``.

    Returns:
        None. Results are appended to ``Stats.csv`` and echoed to stdout.
        Rows for which no rating can be recovered are skipped.
    """
    evaluator = load_evaluator(
        "labeled_score_string",
        criteria="correctness",
        llm=Ollama(model=llm_model_name),
    )
    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")

    print("\n[INFO] Evaluating model: ", llm_model_name)
    # The context manager guarantees the stats file is closed even if the
    # chat chain or the evaluator raises part-way through the run.
    with open("Stats.csv", "a") as f:
        for _, row in df.iterrows():
            inicio = time.time()

            question = row['question']
            reference_answer = row['answer']
            model_answer = chat(question=question)

            fim = time.time()
            # Only the answer generation is timed, not the grading step.
            tempo = round(float(fim - inicio), 2)

            try:
                evaluation = evaluator.evaluate_strings(
                    prediction=model_answer,
                    reference=reference_answer,
                    input=question
                )
            except ValueError as e:
                print(f"ValueError: {e}")
                # The error message embeds the raw grader output, so a
                # rating can often still be recovered from it below.
                evaluation = str(e)

            # Debugging print statement to inspect the evaluation output
            print(f"Evaluation output: {evaluation}")

            score = None
            if isinstance(evaluation, dict):
                # A successful evaluation is a dict, not a string; running a
                # regex on it directly would raise TypeError.
                # NOTE(review): LangChain documents a parsed "score" entry
                # next to "reasoning" -- confirm for the installed version.
                score = evaluation.get("score")
                text = str(evaluation.get("reasoning", ""))
            else:
                text = str(evaluation)

            if score is None:
                # Accept "Rating: 8", "Rating: [8]" and "Rating: [[8]]".
                match = re.search(r'Rating: \[?\[?(\d+)\]?\]?', text)
                if match is None:
                    print(f"Invalid format for evaluation output: {evaluation}")
                    continue
                score = int(match.group(1))

            print(f'\n[QUESTION] {question}')
            print(f'[SCORE] {score}')
            f.write(f"{llm_model_name},{score},{tempo}\n")

In [14]:
def extract_rating(evaluation: str) -> int:
    """
    Pull the numeric rating out of an evaluator response.

    Searches for ``Rating: `` followed by digits that may be wrapped in up to
    two square brackets on either side. Returns the digits as an int; when
    nothing matches, prints a diagnostic and returns None.
    """
    found = re.search(r'Rating: \[?\[?(\d+)\]?\]?', evaluation)
    if found is None:
        # No recognisable rating: report the offending text and signal failure.
        print(f"Invalid format for evaluation output: {evaluation}")
        return None
    return int(found.group(1))

def evaluate_llama2(llm_model_name: str, db: Chroma) -> None:
    """Grade a model's answers to the questions in ``evaluate.csv``.

    Every row of ``evaluate.csv`` (columns ``question`` and ``answer``) is sent
    to the chat chain, the reply is graded by a ``labeled_score_string``
    evaluator backed by an Ollama model of the same name, and one
    ``model,score,seconds`` line is appended to ``Stats.csv``.

    Args:
        llm_model_name: Ollama model name used for answering and for grading.
        db: Chroma vector store handed to ``getChatChain``.

    Returns:
        None. Results are appended to ``Stats.csv`` and echoed to stdout.
        Rows for which no rating can be recovered are skipped.
    """
    evaluator = load_evaluator(
        "labeled_score_string",
        criteria="correctness",
        llm=Ollama(model=llm_model_name),
    )
    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")

    print("\n[INFO] Evaluating model: ", llm_model_name)
    # The with-block closes the file on exit; no explicit close() is needed.
    with open("Stats.csv", "a") as f:
        for _, row in df.iterrows():
            inicio = time.time()

            question = row['question']
            reference_answer = row['answer']
            model_answer = chat(question=question)

            fim = time.time()
            # Only the answer generation is timed, not the grading step.
            tempo = round(float(fim - inicio), 2)

            try:
                evaluation = evaluator.evaluate_strings(
                    prediction=model_answer,
                    reference=reference_answer,
                    input=question
                )
            except ValueError as e:
                print(f"ValueError: {e}")
                # The error message embeds the raw grader output, so a
                # rating can often still be recovered from it below.
                evaluation = str(e)

            # Debugging print statement to inspect the evaluation output
            print(f"Evaluation output: {evaluation}")

            score = None
            if isinstance(evaluation, dict):
                # Prefer the evaluator's own parsed verdict: the text fallback
                # needs a literal "Rating: " prefix and would drop a reply
                # that only contains "[[n]]".
                # NOTE(review): LangChain documents a parsed "score" entry
                # next to "reasoning" -- confirm for the installed version.
                score = evaluation.get("score")
                # Serialise so the text fallback can search the dict contents.
                evaluation = json.dumps(evaluation)

            if score is None:
                score = extract_rating(evaluation)
            if score is None:
                continue

            print(f'\n[QUESTION] {question}')
            print(f'[SCORE] {score}')
            f.write(f"{llm_model_name},{score},{tempo}\n")

In [16]:
def extract_rating(evaluation: str) -> int:
    """
    Return the rating found after ``Rating: `` in the evaluation text.

    Bare, single-bracketed and double-bracketed numbers are all accepted.
    When no rating is present a diagnostic is printed and None is returned.
    """
    rating = None
    hit = re.search(r'Rating: \[?\[?(\d+)\]?\]?', evaluation)
    if hit:
        rating = int(hit.group(1))
    else:
        print(f"Invalid format for evaluation output: {evaluation}")
    return rating

def evaluate_zephyr(llm_model_name: str, db: Chroma) -> None:
    """Grade a model's answers to the questions in ``evaluate.csv``.

    For each ``question``/``answer`` row of ``evaluate.csv`` the chat chain
    produces a reply, a ``labeled_score_string`` evaluator backed by an Ollama
    model of the same name grades it, and a ``model,score,seconds`` line is
    appended to ``Stats.csv``.

    Args:
        llm_model_name: Ollama model name used for answering and for grading.
        db: Chroma vector store handed to ``getChatChain``.

    Returns:
        None. Results are appended to ``Stats.csv`` and echoed to stdout.
        Rows for which no rating can be recovered are skipped.
    """
    evaluator = load_evaluator(
        "labeled_score_string",
        criteria="correctness",
        llm=Ollama(model=llm_model_name),
    )
    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")

    print("\n[INFO] Evaluating model: ", llm_model_name)
    # Leaving the with-block closes the file, so no trailing close() call.
    with open("Stats.csv", "a") as f:
        for _, row in df.iterrows():
            inicio = time.time()

            question = row['question']
            reference_answer = row['answer']
            model_answer = chat(question=question)

            fim = time.time()
            # Elapsed time covers answer generation only, not the grading.
            tempo = round(float(fim - inicio), 2)

            try:
                evaluation = evaluator.evaluate_strings(
                    prediction=model_answer,
                    reference=reference_answer,
                    input=question
                )
            except ValueError as e:
                print(f"ValueError: {e}")
                # Keep the message: it carries the raw grader output, from
                # which a rating may still be extracted.
                evaluation = str(e)

            # Debugging print statement to inspect the evaluation output
            print(f"Evaluation output: {evaluation}")

            if isinstance(evaluation, dict):
                # Use the evaluator's parsed verdict when it is available;
                # the regex fallback requires a literal "Rating: " prefix and
                # would otherwise discard replies that only contain "[[n]]".
                # NOTE(review): LangChain documents a parsed "score" entry
                # next to "reasoning" -- confirm for the installed version.
                score = evaluation.get("score")
                if score is None:
                    score = extract_rating(json.dumps(evaluation))
            else:
                score = extract_rating(evaluation)

            if score is None:
                continue

            print(f'\n[QUESTION] {question}')
            print(f'[SCORE] {score}')
            f.write(f"{llm_model_name},{score},{tempo}\n")

# Mistral

In [5]:
# Evaluate Mistral in terms of time, precision and accuracy.
inicio = time.time()
db = load_documents_into_database(
    "mistral",
    "nomic-embed-text",
    "../Final PDF Files",
    True,
)
evaluate_mistral("mistral", db)
fim = time.time()
# Report the wall-clock duration of the load-and-evaluate run, in seconds.
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 25/25 [00:03<00:00,  6.32it/s]


Loading .md files


0it [00:00, ?it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  mistral
 I cannot answer that question with the provided research documents. The research only discusses muscle growth methods and warm-up exercises. There is no information about the number of parts in the human chest.ValueError: Invalid output:  Rating: 10

Explanation: The assistant correctly acknowledged that it could not provide an answer to the question based on the available information, and did not attempt to provide a incorrect or misleading response.. Output must contain a double bracketed string                 with the verdict between 1 and 10.
Evaluation output: Invalid output:  Rating: 10

Explanation: The assistant correctly acknowledged that it could not provide an answer to the question based on the available information, and did not attempt to provide a incorrect or misleading response.. Output must contain a double bracketed string                 with the verdict between 1 and 10.

[QUE

# Llama2

In [15]:
# Evaluate Llama2 in terms of time, precision and accuracy.
inicio = time.time()
db = load_documents_into_database(
    "llama2",
    "nomic-embed-text",
    "../Final PDF Files",
    True,
)
evaluate_llama2("llama2", db)
fim = time.time()
# Report the wall-clock duration of the load-and-evaluate run, in seconds.
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 25/25 [00:04<00:00,  6.23it/s]


Loading .md files


0it [00:00, ?it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  llama2
Based on the provided research, the human chest has 4 parts:

1. Pectoralis Major Muscle: This muscle is located in the chest area and is responsible for shoulder movement and breathing.
2. Pectoralis Minor Muscle: This muscle is located beneath the pectoralis major muscle and helps to rotate the scapula and move the arm.
3. Serratus Anterior Muscle: This muscle is located on the sides of the chest and helps to stabilize the shoulder blades and move the arms.
4. Rib Cage: The rib cage is made up of 12 pairs of ribs that provide protection for the organs in the chest area.

So, there are 4 parts to the human chest: Pectoralis Major Muscle, Pectoralis Minor Muscle, Serratus Anterior Muscle, and Rib Cage.Evaluation output: {'reasoning': 'Rating: [[8]]\n\nIn this response, the AI assistant provided accurate and factual information regarding the number of parts in the human chest. The assistant correctly

# Zephyr

In [17]:
# Evaluate Zephyr in terms of time, precision and accuracy.
inicio = time.time()
db = load_documents_into_database(
    "zephyr",
    "nomic-embed-text",
    "../Final PDF Files",
    True,
)
evaluate_zephyr("zephyr", db)
fim = time.time()
# Report the wall-clock duration of the load-and-evaluate run, in seconds.
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 25/25 [00:03<00:00,  6.31it/s]


Loading .md files


0it [00:00, ?it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  zephyr
There are several distinct anatomical structures that make up the human chest, as described in scientific research and anatomical sources. These include:

1. Thoracic Cage: This is a bony structure that protects and supports the heart, lungs, and other internal organs. It is made up of 12 pairs of ribs and the breastbone (sternum).

2. Ribcage: The ribcage consists of the thoracic cage and the 12 pairs of ribs that attach to it. It provides protection for the internal organs and helps with respiration by facilitating the expansion and contraction of the lungs during breathing.

3. Intercostal Muscles: These are a group of muscles located between each pair of ribs, which facilitate movement and expansion of the chest during respiration.

4. Diaphragm: This is a large muscle that separates the chest from the abdomen. It plays a crucial role in breathing by contracting and flattening to increase the vo