Notebook for calculating all metrics associated with the generation evaluation.

In [None]:
# Imports
from typing import List
import torch
import time
from typing import Tuple
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# ragas
from datasets import Dataset
from ragas.metrics import AnswerRelevancy, ContextRecall
from ragas.llms import llm_factory

# HuggingFace
from sentence_transformers import CrossEncoder
import evaluate

# DeepEval
from deepeval.scorer import Scorer

# Embedding Model
from ipynb.fs.defs.a_setup_llms import create_embedding_model

In [None]:
def calculate_rouge_score_bulk(generated_texts: List[str], target_texts: List[str], score_type: str) -> Tuple[List[float], float]:
    '''
    Calculates the ROUGE score of the given score type between each generated LLM text and its golden answer.

    Args:
        generated_texts: Texts produced by the LLM.
        target_texts: Golden answers, aligned index-by-index with generated_texts.
        score_type: One of "rouge1", "rouge2" or "rougeL".

    Returns:
        A tuple (scores, time_avg): one ROUGE score per text pair and the average
        number of seconds spent per pair. Empty input yields ([], 0.0).

    Raises:
        ValueError: If score_type is unsupported or the two lists differ in length.
    '''
    # ValueError is a subclass of Exception, so callers catching Exception keep working.
    if score_type not in ("rouge1", "rouge2", "rougeL"):
        raise ValueError("Error, raised exception: Wrong score type for rouge score given.")
    if len(generated_texts) != len(target_texts):
        raise ValueError(
            f"generated_texts and target_texts must have the same length "
            f"(got {len(generated_texts)} and {len(target_texts)})."
        )
    # Nothing to score; also avoids dividing by zero for the time average.
    if not generated_texts:
        return [], 0.0

    # perf_counter is monotonic, so the measurement is unaffected by system clock changes.
    start_time = time.perf_counter()
    scores = [
        Scorer.rouge_score(target=target_text, prediction=gen_text, score_type=score_type)
        for gen_text, target_text in zip(generated_texts, target_texts)
    ]
    time_avg = (time.perf_counter() - start_time) / len(generated_texts)

    return scores, time_avg

In [None]:
def calculate_response_length_bulk(generated_texts: List[str]) -> Tuple[float, float]:
    '''
    Calculates the average response length (number of words) over all generated texts,
    based on the NLTK word tokenizer (via the `evaluate` "word_length" measurement).

    Args:
        generated_texts: Texts produced by the LLM.

    Returns:
        A tuple (average_word_length, time_avg): the mean number of words per text and
        the average number of seconds spent per text. Empty input yields (0.0, 0.0).
    '''
    # No texts means no words; also avoids dividing by zero for the time average.
    if not generated_texts:
        return 0.0, 0.0

    # perf_counter is monotonic, so the measurement is unaffected by system clock changes.
    start_time = time.perf_counter()
    word_length = evaluate.load("word_length", module_type="measurement")
    results = word_length.compute(data=generated_texts)
    time_avg = (time.perf_counter() - start_time) / len(generated_texts)

    torch.cuda.empty_cache()

    return results["average_word_length"], time_avg

In [None]:
def calculate_hallucination_score_bulk(generated_texts: List[str], context_lists: List[List[str]]) -> Tuple[List[float], float]:
    '''
    Generates the hallucination score between each generated LLM text and its retrieved contexts.
    Is based on the Vectara Hallucination Evaluation Model.

    Every (context, generated text) pair is scored by the cross-encoder; the score of a
    generated text is the mean over all of its contexts.

    Args:
        generated_texts: Texts produced by the LLM.
        context_lists: For each generated text, the list of contexts retrieved for it.

    Returns:
        A tuple (avg_scores, time_avg): one averaged score per generated text and the
        average number of seconds spent per text. A text without any context gets
        float("nan") because no score can be computed for it. Empty input yields ([], 0.0).

    Raises:
        ValueError: If generated_texts and context_lists differ in length.
    '''
    # A mismatch would misalign the score slices below with their generated texts.
    if len(generated_texts) != len(context_lists):
        raise ValueError(
            f"generated_texts and context_lists must have the same length "
            f"(got {len(generated_texts)} and {len(context_lists)})."
        )
    if not generated_texts:
        return [], 0.0

    # perf_counter is monotonic, so the measurement is unaffected by system clock changes.
    start_time = time.perf_counter()

    # Create pairs of all generated_texts and the connected retrieved contexts
    pairs = []
    for gen, context_list in zip(generated_texts, context_lists):
        pairs.extend([context, gen] for context in context_list)

    if pairs:
        model = CrossEncoder('vectara/hallucination_evaluation_model')
        scores = model.predict(pairs, batch_size=8)
    else:
        # Nothing to score, so skip loading the model entirely.
        scores = []

    avg_scores = []
    index = 0
    for context_list in context_lists:
        num_contexts = len(context_list)
        if num_contexts == 0:
            # No contexts -> no scores to average; flag as undefined instead of dividing by zero.
            avg_scores.append(float("nan"))
            continue

        # The scores are laid out in the same order as the pairs were created,
        # so the next num_contexts entries belong to the current generated text.
        scores_contexts = scores[index:index + num_contexts]
        avg_scores.append(float(sum(scores_contexts) / num_contexts))
        index += num_contexts

    time_avg = (time.perf_counter() - start_time) / len(generated_texts)

    torch.cuda.empty_cache()

    return avg_scores, time_avg

In [None]:
import time
from sentence_transformers import CrossEncoder
from typing import List
from typing import Tuple
from FlagEmbedding import FlagReranker

def calculate_answer_relevancy_bulk(generated_texts: List[str], questions: List[str]) -> Tuple[List[float], float]:
    '''
    Generates the answer relevancy score between each generated LLM text and its question.
    Uses the BAAI/bge-reranker-base cross-encoder from HuggingFace.

    Args:
        generated_texts: Texts produced by the LLM.
        questions: Questions, aligned index-by-index with generated_texts.

    Returns:
        A tuple (scores, time_avg): one reranker score per (question, text) pair and the
        average number of seconds spent per pair. Empty input yields ([], 0.0).

    Raises:
        ValueError: If generated_texts and questions differ in length.
    '''
    if len(generated_texts) != len(questions):
        raise ValueError(
            f"generated_texts and questions must have the same length "
            f"(got {len(generated_texts)} and {len(questions)})."
        )
    # Nothing to score; also avoids loading the model and dividing by zero.
    if not generated_texts:
        return [], 0.0

    # perf_counter is monotonic, so the measurement is unaffected by system clock changes.
    start_time = time.perf_counter()

    reranker = FlagReranker('BAAI/bge-reranker-base')

    pairs = [[question, gen_text] for question, gen_text in zip(questions, generated_texts)]

    scores = reranker.compute_score(pairs, batch_size=16)
    # Some FlagEmbedding versions collapse a single-pair result to a bare number;
    # normalise so callers always receive a list.
    if isinstance(scores, (int, float)):
        scores = [scores]

    time_avg = (time.perf_counter() - start_time) / len(generated_texts)

    torch.cuda.empty_cache()

    return scores, time_avg

In [None]:
def calculate_answer_similarity_bulk(generated_texts: List[str], target_texts: List[str]) -> Tuple[List[float], float]:
    '''
    Generates the answer similarity between the generated LLM texts and target texts by using the
    fine-tuned ISO-bge model to compute the cosine similarity of each corresponding pair.

    Args:
        generated_texts: Texts produced by the LLM.
        target_texts: Golden answers, aligned index-by-index with generated_texts.

    Returns:
        A tuple (cosine_sim_scores, time_avg): a 1-D numpy array with one cosine similarity
        per text pair and the average number of seconds spent per pair. Empty input yields
        an empty array and 0.0.

    Raises:
        ValueError: If generated_texts and target_texts differ in length.
    '''
    if len(generated_texts) != len(target_texts):
        raise ValueError(
            f"generated_texts and target_texts must have the same length "
            f"(got {len(generated_texts)} and {len(target_texts)})."
        )
    # Nothing to embed; also avoids loading the model and dividing by zero.
    if not generated_texts:
        return np.array([], dtype=float), 0.0

    # perf_counter is monotonic, so the measurement is unaffected by system clock changes.
    start_time = time.perf_counter()

    embedding_model = create_embedding_model("Fine-tuned", "finetuned-ISO-27001_1024")
    embeddings_gen_texts = np.asarray(embedding_model.embed_documents(generated_texts), dtype=float)
    embeddings_target_texts = np.asarray(embedding_model.embed_documents(target_texts), dtype=float)

    # Only corresponding pairs are needed, so compute the row-wise cosine similarity
    # directly instead of building the full N x N matrix and taking its diagonal.
    dot_products = np.einsum("ij,ij->i", embeddings_gen_texts, embeddings_target_texts)
    norm_products = (
        np.linalg.norm(embeddings_gen_texts, axis=1)
        * np.linalg.norm(embeddings_target_texts, axis=1)
    )
    # A zero vector has similarity 0 with everything; avoid dividing by zero for it.
    norm_products[norm_products == 0.0] = 1.0
    cosine_sim_scores = dot_products / norm_products

    time_avg = (time.perf_counter() - start_time) / len(generated_texts)

    torch.cuda.empty_cache()

    return cosine_sim_scores, time_avg

In [None]:
def calculate_context_recall_bulk(contexts: List[List[str]], target_texts: List[str], questions: List[str]) -> Tuple[List[float], float]:
    '''
    This method computes the context recall which measures the extent to which the retrieved contexts align with the golden answer.
    The score is computed with the RAGAS ContextRecall metric, backed by the "gpt-3.5-turbo-0125" LLM.

    Args:
        contexts: For each question, the list of retrieved contexts.
        target_texts: Golden answers, aligned index-by-index with questions.
        questions: The questions that were asked.

    Returns:
        A tuple (scores, time_avg): the "context_recall" values returned by RAGAS and the
        average number of seconds spent per question. Empty input yields ([], 0.0).
    '''
    # Nothing to evaluate; avoids creating an LLM client and dividing by zero.
    if not questions:
        return [], 0.0

    # perf_counter is monotonic, so the measurement is unaffected by system clock changes.
    start_time = time.perf_counter()

    llm = llm_factory(model="gpt-3.5-turbo-0125")

    context_recall = ContextRecall(batch_size=10, llm=llm)
    dataset = Dataset.from_dict({"contexts": contexts, "ground_truths": target_texts, "question": questions})
    results = context_recall.score(dataset)
    scores = results["context_recall"]

    time_avg = (time.perf_counter() - start_time) / len(questions)

    return scores, time_avg

In [None]:
def calculate_answer_relevancy_RAGAS_bulk(questions: List[str], answers: List[str]) -> Tuple[List[float], float]:
    """
    Computes the answer relevancy score using RAGAS.

    Uses an LLM to generate multiple fitting questions based on the generated answer, then
    computes the cosine similarity between the generated questions and the given question.
    The underlying idea is that if the generated answer accurately answers the question,
    the LLM should be able to generate questions that align with the original one.
    Values range from 0 to 1, with 1 being the highest relevance value.

    Args:
        questions: The questions that were asked.
        answers: Generated answers, aligned index-by-index with questions.

    Returns:
        A tuple (scores, time_avg): the "answer_relevancy" values returned by RAGAS and the
        average number of seconds spent per question. Empty input yields ([], 0.0).
    """
    # Nothing to evaluate; avoids loading the models and dividing by zero.
    if not questions:
        return [], 0.0

    # perf_counter is monotonic, so the measurement is unaffected by system clock changes.
    start_time = time.perf_counter()

    embedding_model = create_embedding_model("Fine-tuned", "finetuned-ISO-27001_1024")
    llm = llm_factory(model="gpt-3.5-turbo-0125")

    answer_relevancy = AnswerRelevancy(batch_size=10, embeddings=embedding_model, llm=llm)
    dataset = Dataset.from_dict({"question": questions, "answer": answers})
    results = answer_relevancy.score(dataset)
    scores = results["answer_relevancy"]

    time_avg = (time.perf_counter() - start_time) / len(questions)

    return scores, time_avg