# Import real evaluation dataset

In [None]:
import pandas as pd

def load_outputs_from_csv(csv_file_path):
    """
    Load a semicolon-delimited, UTF-8 encoded CSV file into a DataFrame.

    Parameters:
    csv_file_path: Path of the CSV file to read.

    Returns:
    pd.DataFrame: The parsed file, or an empty DataFrame with the columns
    'question' and 'answer' when the file does not exist.
    """
    try:
        loaded = pd.read_csv(csv_file_path, sep=';', encoding='utf-8')
    except FileNotFoundError:
        # A missing file is not fatal: report it and hand back an empty frame
        print(f"No CSV file found at {csv_file_path}. Returning an empty DataFrame.")
        return pd.DataFrame(columns=["question", "answer"])

    print(f"Data successfully loaded from {csv_file_path}")
    return loaded

# Usage example - change the input file as needed
# `eval_df` is the notebook-wide evaluation set; later cells add one response column per model to it.
eval_df = load_outputs_from_csv("real_evaluation_set.csv")
print(len(eval_df))  # Number of evaluation rows loaded (0 if the file was missing)
display(eval_df.head())  # Display the first few rows of the loaded data

# Import documents dataset to have the original data for the Retrieval part of the RAG

In [None]:
# Load the original dataset for RAG
import pandas as pd
filename_all_data_dict = "./Files/final_dataset.csv"

# header=None makes pandas read the first CSV line as a regular data row and
# names= assigns the column labels 'file' and 'text' explicitly.
data_df = pd.read_csv(filename_all_data_dict, names = ['file', 'text'], header = None)
# Remove the row labelled 0 (presumably the file's own header line - confirm
# against the CSV). The remaining index labels therefore start at 1.
data_df = data_df.drop(index = 0)
data_df

# Create the Retrieval part of the RAG

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import DataFrameLoader
import pprint

# Wrap every row of data_df in a document whose page content is the 'text' column.
loader = DataFrameLoader(data_df, page_content_column="text")
docs_data = loader.load()

# Split the documents into chunks of at most 3000 characters with a 500-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
splits = text_splitter.split_documents(docs_data)
# Inspect the first six chunks and the total number of chunks.
pprint.pprint(splits[0:6])
pprint.pprint(len(splits))

In [None]:
from FlagEmbedding import BGEM3FlagModel
# Embedding model shared by the retrieval cells below (BAAI/bge-m3, loaded with use_fp16=True).
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Adapter that exposes the module-level ``model_fp16`` encoder as an embedding callable."""

    def embed_documents(self, texts):
        """Encode ``texts`` with ``model_fp16`` and return the 'dense_vecs' entry of its output."""
        encoded = model_fp16.encode(texts)
        return encoded['dense_vecs']

    def __call__(self, texts):
        """Allow the instance to be used as a plain function; delegates to ``embed_documents``."""
        return self.embed_documents(texts)

In [None]:
from langchain_community.vectorstores import FAISS
# Load the FAISS index previously saved under "local_model_index", embedding queries with M3EmbeddingFP16.
# NOTE(review): allow_dangerous_deserialization=True opts into unpickling the stored
# index data - only load index folders that were produced by a trusted source.
vectorstore = FAISS.load_local("local_model_index", M3EmbeddingFP16(), allow_dangerous_deserialization=True)
# Number of vectors stored in the index
vectorstore.index.ntotal

In [None]:
# Retriever that returns the 10 best-matching chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Prepare the Generative part of the RAG

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt
# The template is written in Italian. It asks the model to act as a customer
# assistant, to answer only from the retrieved documents ({context}), to be
# concise and explain the required actions step by step, to avoid repetition,
# and - when several questions are asked - to answer only those covered by the
# documentation. {question} receives the user's question about the Panthera software.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Evita troppe ripetizioni nella risposta fornita.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

# Chat prompt with the two placeholders {context} and {question}; shared by every RAG chain below.
prompt = ChatPromptTemplate.from_template(template)
prompt

# Post-processing
def format_docs(splits):
    """Return the ``page_content`` of every document in ``splits``, joined by blank lines."""
    contents = [chunk.page_content for chunk in splits]
    return "\n\n".join(contents)

In [None]:
import pandas as pd

def invoke_model_and_store_response(eval_df, model_invoke_func, response_column):
    """
    Invoke the model once per question and store each response in ``eval_df``.

    ``eval_df`` is modified in place and the same object is returned. Responses
    are written row by row, so the answers obtained before a failing model call
    remain stored in the DataFrame.

    Parameters:
    eval_df (pd.DataFrame): DataFrame containing the column 'question'.
    model_invoke_func: A function that takes a question as input and returns a model response.
    response_column (str): The column name to store the model responses (required, no default).

    Returns:
    pd.DataFrame: The updated DataFrame with model responses.

    Raises:
    KeyError: If ``eval_df`` has no 'question' column.
    """
    # Look the questions up first so a missing column fails before eval_df is touched
    questions = eval_df["question"]

    # Create the target column up front so it exists even when there are no rows,
    # and so it can hold arbitrary response objects (object dtype).
    if response_column not in eval_df.columns:
        eval_df[response_column] = None

    for idx, question in questions.items():
        # Store the response in the specified column for the corresponding row
        eval_df.at[idx, response_column] = model_invoke_func(question)

    return eval_df

# Evaluation auxiliary functions

In [None]:
# Evaluate RAG with RAG-evaluator dataset
from rag_evaluator import RAGEvaluator

# Initialize the evaluator (the RAGEvaluator class imported from the rag_evaluator package above)
evaluator = RAGEvaluator()

In [None]:
import pandas as pd

def evaluate_responses(eval_df, evaluator, model_response_df_column):
    """
    Score every model response in ``eval_df`` against its reference answer.

    Parameters:
    eval_df (pd.DataFrame): DataFrame containing the columns 'question', 'answer'
        and the column named by ``model_response_df_column``.
    evaluator: An object whose ``evaluate_all(question, response, reference)`` method
        returns the metrics for one row.
    model_response_df_column (str): Name of the column holding the responses to score.

    Returns:
    pd.DataFrame: One row of evaluation results per row of ``eval_df``.
    """
    per_row_scores = [
        evaluator.evaluate_all(
            record['question'],
            record[model_response_df_column],
            record['answer'],
        )
        for _, record in eval_df.iterrows()
    ]

    # One DataFrame row per evaluated response
    return pd.DataFrame(per_row_scores)

# Example usage
# results_df = evaluate_responses(eval_df, evaluator, 'model_response')
# display(results_df)

In [None]:
import pandas as pd

def aggregate_metrics(results_df):
    """
    Average each evaluation metric over all rows of ``results_df``.

    Parameters:
    results_df (pd.DataFrame): Per-row evaluation results containing the columns
        BLEU, ROUGE-1, ROUGE-2, ROUGE-L, BERT P, BERT R, BERT F1, Perplexity and Diversity.

    Returns:
    pd.DataFrame: A single-row DataFrame (index 0) with the mean of each metric.
    """
    metric_names = (
        "BLEU",
        "ROUGE-1",
        "ROUGE-2",
        "ROUGE-L",
        "BERT P",
        "BERT R",
        "BERT F1",
        "Perplexity",
        "Diversity",
    )

    # One mean per metric gives a single evaluation for the model
    averages = {name: results_df[name].mean() for name in metric_names}

    # A one-row DataFrame is easier to read and to combine than a plain dict
    return pd.DataFrame(averages, index=[0])

# Example usage
# aggregated_results_df = aggregate_metrics(results_df)
# display(aggregated_results_df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def evaluate_and_plot(eval_df, evaluator, model_response_df_column, title):
    """
    Evaluate model responses and plot the progression of evaluation metrics.

    Every row is scored with ``evaluator.evaluate_all`` and, after each row, the
    running mean of every tracked metric is recorded. Two side-by-side plots are
    shown: the full-range progression and the same curves zoomed to [0, 1.1].

    Parameters:
    eval_df (pd.DataFrame): DataFrame containing the columns 'question', 'answer'
        and the column named by ``model_response_df_column``.
    evaluator: An object that has an `evaluate_all` method for evaluating responses.
    model_response_df_column (str): Name of the column holding the responses to score.
    title (str): Title of the first (full-range) plot.

    Returns:
    None. The figure is displayed with ``plt.show()``.
    """
    tracked_metrics = ["BLEU", "ROUGE-1", "BERT P", "BERT R", "BERT F1", "Perplexity", "Diversity"]
    cumulative_metrics = dict.fromkeys(tracked_metrics, 0)

    # List to store running means
    running_means = []

    # Count the evaluated rows positionally: the index labels yielded by
    # iterrows() are not guaranteed to be 0..N-1 (filtered, re-ordered or
    # non-integer indexes), so they must not be used as the divisor.
    for n_evaluated, (_, row) in enumerate(eval_df.iterrows(), start=1):
        evaluation = evaluator.evaluate_all(
            row['question'],
            row[model_response_df_column],
            row['answer'],
        )

        # Update cumulative sums
        for metric in tracked_metrics:
            cumulative_metrics[metric] += evaluation[metric]

        # Mean of each metric over the rows evaluated so far
        running_means.append(
            {metric: cumulative_metrics[metric] / n_evaluated for metric in tracked_metrics}
        )

    # Naming the columns explicitly keeps them present when no rows were evaluated,
    # so an empty eval_df produces empty plots instead of a KeyError.
    running_means_df = pd.DataFrame(running_means, columns=tracked_metrics)
    evaluation_counts = running_means_df.index + 1

    # Create subplots for both plots
    _, axs = plt.subplots(1, 2, figsize=(12, 4))

    # First plot: Progression of evaluation metrics
    for metric in tracked_metrics:
        axs[0].plot(evaluation_counts, running_means_df[metric], marker='o', label=metric)

    axs[0].set_title(title)
    axs[0].set_xlabel('Number of Evaluation Points (N)')
    axs[0].set_ylabel('Mean Metric Value')
    axs[0].axhline(y=0, color='grey', linestyle='--')  # Horizontal reference line at y=0
    axs[0].grid()

    # Second plot: Zoomed in on values between 0 and 1
    for metric in tracked_metrics:
        axs[1].plot(evaluation_counts, running_means_df[metric], marker='o', label=metric)

    axs[1].set_title('Zoomed Progression of Evaluation Metrics')
    axs[1].set_xlabel('Number of Evaluation Points (N)')
    axs[1].set_ylabel('Mean Metric Value')
    axs[1].set_ylim(0, 1.1)  # Set y-axis limits to zoom
    axs[1].legend(loc='best', fontsize='small', frameon=True, borderpad=0.5, bbox_to_anchor=(1, 1))
    axs[1].grid()

    plt.tight_layout()  # Adjust the layout
    plt.show()

# Example usage
# evaluate_and_plot(eval_df, evaluator, 'model_response', 'Progression of Evaluation Metrics')


# Simple RAG with Llama3.2

In [None]:
from langchain_ollama import ChatOllama

# Llama 3.2 chat model served through Ollama, with temperature 0.
model_llama = ChatOllama(
    model="llama3.2",
    temperature=0
)

# Chain
# The incoming question is sent to the retriever (whose documents are joined by
# format_docs into {context}) and passed through unchanged as {question}; the
# filled prompt goes to the model and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama
    | StrOutputParser()
)

# Answer every evaluation question and store the replies in the 'model_response' column.
response_column = 'model_response'
eval_df = invoke_model_and_store_response(eval_df, rag_chain.invoke, response_column)
eval_df

# Simple RAG with Llama3.2 Instruct

In [None]:
from langchain_ollama.llms import OllamaLLM
# Llama 3.2 3B instruct (fp16) model served through Ollama, with temperature 0.
model_llama_instruct = OllamaLLM(model="llama3.2:3b-instruct-fp16", temperature=0)

# Same retrieval and prompt steps as the other chains; only the model differs.
rag_chain_llama_instruct = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama_instruct
    | StrOutputParser()
)

# Store the replies in the 'model_response_llama_instruct' column of eval_df.
response_column = "model_response_llama_instruct"
eval_df = invoke_model_and_store_response(eval_df, rag_chain_llama_instruct.invoke, response_column)
eval_df

# Simple RAG with GPT-4o

In [None]:
import os
from langchain_openai import ChatOpenAI
# Enable LangChain tracing and point it at the LangSmith endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The API keys must already be exported in the environment. Copying
# os.getenv(key) back into os.environ does nothing when the key is set and
# raises an opaque TypeError when it is missing, so check for presence
# explicitly and fail with an actionable message instead.
for _required_key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    if not os.getenv(_required_key):
        raise EnvironmentError(
            f"Environment variable {_required_key} is not set; export it before running this cell."
        )

In [None]:
# GPT-4o through the OpenAI chat API, with temperature 0.
model_gpt = ChatOpenAI(temperature=0, model="gpt-4o")

# Chain
# Same retrieval and prompt steps as the other chains; only the model differs.
rag_chain_gpt = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_gpt
    | StrOutputParser()
)

# Store the replies in the 'model_response_gpt' column of eval_df.
response_column = 'model_response_gpt'
eval_df = invoke_model_and_store_response(eval_df, rag_chain_gpt.invoke, response_column)
display(eval_df)

# Simple RAG with GPT-3.5-turbo

In [None]:
# GPT-3.5-turbo through the OpenAI chat API, with temperature 0.
model_old_gpt = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Chain
# Same retrieval and prompt steps as the other chains; only the model differs.
rag_chain_old_gpt = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_old_gpt
    | StrOutputParser()
)

# Store the replies in the 'model_response_old_gpt' column of eval_df.
response_column = 'model_response_old_gpt'
eval_df = invoke_model_and_store_response(eval_df, rag_chain_old_gpt.invoke, response_column)
display(eval_df)

# Models comparison on basic RAG

In [None]:
# BLEU should lie between 0 and 1, but the evaluator's BLEU values were observed
# to be higher. Load the `evaluate` BLEU metric so the score can be recomputed
# and cross-checked in the cells below.
import evaluate
bleu = evaluate.load("bleu")

In [None]:
# Evaluate the Llama3.2 responses stored in the 'model_response' column.
model_response_df_column_evaluated = 'model_response'
title = 'Progression of Evaluation Metrics for Llama3.2'
# Per-row metrics, then their means as a one-row DataFrame.
result_df_llama = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(result_df_llama)
metrics_llama = aggregate_metrics(result_df_llama)
# Replace the averaged per-row BLEU with a BLEU computed over all responses and
# references at once, whose value stays between 0 and 1.
metrics_llama['BLEU'] = bleu.compute(predictions=eval_df[model_response_df_column_evaluated], references=eval_df['answer'])['bleu']
display(metrics_llama)
# NOTE: evaluate_and_plot calls the evaluator again for every row, so the rows are scored a second time here.
evaluate_and_plot(eval_df, evaluator, model_response_df_column_evaluated, title)

In [None]:
# Evaluate the Llama3.2 Instruct responses stored in the 'model_response_llama_instruct' column.
model_response_df_column_evaluated = 'model_response_llama_instruct'
title = 'Progression of Evaluation Metrics for Llama3.2 Instruct'
# Per-row metrics, then their means as a one-row DataFrame.
result_df_llama_instruct = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(result_df_llama_instruct)
metrics_llama_instruct = aggregate_metrics(result_df_llama_instruct)
# Replace the averaged per-row BLEU with a BLEU computed over all responses and references at once.
metrics_llama_instruct['BLEU'] = bleu.compute(predictions=eval_df[model_response_df_column_evaluated], references=eval_df['answer'])['bleu']
display(metrics_llama_instruct)
# NOTE: evaluate_and_plot calls the evaluator again for every row, so the rows are scored a second time here.
evaluate_and_plot(eval_df, evaluator, model_response_df_column_evaluated, title)

In [None]:
# Evaluate the GPT-4o responses stored in the 'model_response_gpt' column.
model_response_df_column_evaluated = 'model_response_gpt'
title = 'Progression of Evaluation Metrics for GPT-4o'
# Per-row metrics, then their means as a one-row DataFrame.
result_df_gpt = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(result_df_gpt)
metrics_gpt = aggregate_metrics(result_df_gpt)
# Replace the averaged per-row BLEU with a BLEU computed over all responses and references at once.
metrics_gpt['BLEU'] = bleu.compute(predictions=eval_df[model_response_df_column_evaluated], references=eval_df['answer'])['bleu']
display(metrics_gpt)
# NOTE: evaluate_and_plot calls the evaluator again for every row, so the rows are scored a second time here.
evaluate_and_plot(eval_df, evaluator, model_response_df_column_evaluated, title)

In [None]:
# Evaluate the GPT-3.5-turbo responses stored in the 'model_response_old_gpt' column.
model_response_df_column_evaluated = 'model_response_old_gpt'
title = 'Progression of Evaluation Metrics for GPT-3.5-turbo'
# Per-row metrics, then their means as a one-row DataFrame.
result_df_old_gpt = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(result_df_old_gpt)
metrics_old_gpt = aggregate_metrics(result_df_old_gpt)
# Replace the averaged per-row BLEU with a BLEU computed over all responses and references at once.
metrics_old_gpt['BLEU'] = bleu.compute(predictions=eval_df[model_response_df_column_evaluated], references=eval_df['answer'])['bleu']
display(metrics_old_gpt)
# NOTE: evaluate_and_plot calls the evaluator again for every row, so the rows are scored a second time here.
evaluate_and_plot(eval_df, evaluator, model_response_df_column_evaluated, title)

# Mean evaluation metrics of all models

In [None]:
# Define model names and RAG types
# Each entry describes one experiment configuration and is extended with that
# model's aggregated metrics. The metrics objects are one-row DataFrames, so
# `**metrics_x` contributes one key per metric column, each mapped to that
# column (a pandas Series) rather than to a bare number.
model_data = [
    {'Model': 'llama3.2', 'RAG Type': 'Basic RAG', 'Question_rewriting' : False, 'Retriver_k' : 10, 'Prompt_engineering': True, 'Prompt_type': 'Zero-shot', **metrics_llama},
    {'Model': 'llama3.2_Instruct', 'RAG Type': 'Basic RAG', 'Question_rewriting' : False, 'Retriver_k' : 10, 'Prompt_engineering': True, 'Prompt_type': 'Zero-shot', **metrics_llama_instruct},
    {'Model': 'gpt-3.5-turbo', 'RAG Type': 'Basic RAG', 'Question_rewriting': False, 'Retriver_k' : 10, 'Prompt_engineering': True, 'Prompt_type': 'Zero-shot', **metrics_old_gpt},
    {'Model': 'gpt-4o', 'RAG Type': 'Basic RAG', 'Question_rewriting': False, 'Retriver_k' : 10, 'Prompt_engineering': True, 'Prompt_type': 'Zero-shot', **metrics_gpt},
]

# Create DataFrame
df_metrics = pd.DataFrame(model_data)

# Display the DataFrame
display(df_metrics)

In [None]:
# Specify the columns to apply the extraction function
columns_to_extract = ['BLEU', 'ROUGE-1', 'BERT P', 'BERT R', 'BERT F1', 'Perplexity', 'Diversity']

# Drop the unwanted column when it is present. aggregate_metrics in this notebook
# does not emit a 'Racial Bias' column, so a strict drop would raise KeyError.
df_metrics = df_metrics.drop(columns='Racial Bias', errors='ignore')

# Keep the second whitespace-separated token of each cell's string form.
# Presumably every cell holds a one-row Series printed as "0    <value> ...",
# which makes that token the metric value - confirm against the displayed table.
# The extracted values are strings, and re-running this cell on already
# extracted values yields NaN because they contain a single token.
for column in columns_to_extract:
    df_metrics[column] = df_metrics[column].astype(str).str.split().str[1]

display(df_metrics)

# Analyze the metrics further and decide which ones make sense to track

In [None]:
# Score the 'model_response' column against the reference answers with the BLEURT metric.
bleurt = evaluate.load("bleurt", module_type="metric")
results = bleurt.compute(predictions=eval_df["model_response"], references=eval_df["answer"])
print(results)

In [None]:
from bert_score import score
from statistics import mean

# Shared inputs for the stand-alone metric experiments in the following cells:
# the responses in the 'model_response' column and the reference answers.
candidates = eval_df["model_response"]
references = eval_df["answer"]

def evaluate_bert_score(candidates, references):
    """
    Compute BERTScore (language "it") for each candidate/reference pair.

    Returns:
    tuple: (precisions, recalls, f1s, mean precision, mean recall, mean f1),
    where the first three are the per-pair values returned by the metric.
    """
    metric = evaluate.load("bertscore")
    scores = metric.compute(predictions=candidates, references=references, lang="it")
    precision, recall, f1 = scores['precision'], scores['recall'], scores['f1']
    return precision, recall, f1, mean(precision), mean(recall), mean(f1)

evaluate_bert_score(candidates, references)


In [None]:
def evaluate_perplexity(response_column):
    """
    Compute GPT-2 perplexity for a collection of texts.

    Parameters:
    response_column: Either the name of a column of the notebook-level ``eval_df``
        (a ``str``), or the texts themselves (for example ``eval_df["model_response"]``).

    Returns:
    tuple: (per-text perplexities, mean perplexity) as reported by the
    `evaluate` "perplexity" metric.
    """
    perplexity = evaluate.load("perplexity", module_type="metric")

    # Score what the caller asked for instead of a hard-coded column
    if isinstance(response_column, str):
        predictions = eval_df[response_column]
    else:
        predictions = response_column

    results = perplexity.compute(model_id='gpt2',
                                 add_start_token=False,
                                 predictions=predictions)
    return results['perplexities'], results['mean_perplexity']

evaluate_perplexity(candidates)

In [None]:
# Verify if the BLEU score is correct, given that it should be from 0 to 1, but has higher values
import evaluate

def evaluate_bleu(candidates, references):
    """
    Compute BLEU for ``candidates`` against ``references`` with the `evaluate` metric.

    Returns the metric's full result dictionary. Its 'precisions' entry holds the
    precision score for each n-gram order and 'bleu' is derived from the
    geometric mean of those precisions.
    """
    metric = evaluate.load("bleu")
    scores = metric.compute(predictions=candidates, references=references)
    return scores

evaluate_bleu(candidates, references)

In [None]:
from rouge_score import rouge_scorer

def evaluate_rouge(candidates, references):
    """
    Compute ROUGE-1, ROUGE-2 and ROUGE-L (with stemming) for each pair.

    Returns:
    tuple: (per-pair score dictionaries, mean ROUGE-1 F-measure,
    mean ROUGE-2 F-measure, mean ROUGE-L F-measure).
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]

    def _mean_fmeasure(variant):
        # Average F-measure of one ROUGE variant over all scored pairs
        return sum([pair_scores[variant].fmeasure for pair_scores in rouge_scores]) / len(rouge_scores)

    rouge1 = _mean_fmeasure('rouge1')
    rouge2 = _mean_fmeasure('rouge2')
    rougeL = _mean_fmeasure('rougeL')
    return rouge_scores, rouge1, rouge2, rougeL

evaluate_rouge(candidates, references)

In [None]:
from nltk.util import ngrams

def evaluate_diversity(texts):
    """
    Return the ratio of unique word bigrams to the number of words in ``texts``.

    Parameters:
    texts: An iterable of strings, or a single string (treated as one text).
        Words are obtained by whitespace splitting and concatenated across texts.

    Returns:
    The number of distinct consecutive word pairs divided by the number of
    words, or 0 when there are no words.
    """
    # A bare string would otherwise be iterated character by character, which
    # scores character pairs instead of word pairs.
    if isinstance(texts, str):
        texts = [texts]

    all_tokens = [tok for text in texts for tok in text.split()]
    if not all_tokens:
        return 0

    # Consecutive token pairs, i.e. the bigrams of the token sequence
    unique_bigrams = set(zip(all_tokens, all_tokens[1:]))
    return len(unique_bigrams) / len(all_tokens)

# Score every response on its own. Each response is wrapped in a list because
# evaluate_diversity iterates over a collection of texts; passing the bare
# string would make it iterate over the characters instead of the words.
diversity = [evaluate_diversity([row]) for row in candidates]

display(diversity)
display(mean(diversity))

In [None]:
# RAG Evaluator implementation, change it for our case
import mauve
import torch
from statistics import mean
from rouge_score import rouge_scorer
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
from nltk.translate.chrf_score import sentence_chrf
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from textstat import flesch_reading_ease, flesch_kincaid_grade
from sklearn.metrics.pairwise import cosine_similarity
from mauve import compute_mauve
import nltk

class RAGEvaluator:
    """
    Score one generated response against its reference answer.

    Metrics: BLEU, ROUGE-1/2/L, BERTScore (precision, recall, F1), GPT-2
    perplexity of the response and a bigram-based diversity ratio. MAUVE,
    METEOR, CHRF and readability scores are currently not part of the result.
    """

    def __init__(self):
        self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
        # `evaluate` metric objects keyed by name, filled on first use
        self._metric_cache = {}

    def load_gpt2_model(self):
        """Load the pretrained 'gpt2' model and tokenizer used for perplexity."""
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

        return model, tokenizer

    def _get_metric(self, name):
        """Return the `evaluate` metric called ``name``, loading it only once per instance."""
        # evaluate_all runs once per row; reloading the metric on every call repeats
        # the same setup work each time, so the loaded object is kept and reused.
        # setdefault keeps this working for instances created without __init__.
        cache = self.__dict__.setdefault("_metric_cache", {})
        if name not in cache:
            cache[name] = evaluate.load(name)
        return cache[name]

    def evaluate_bert_score(self, candidates, references):
        """Return BERTScore (precision, recall, f1) of the FIRST candidate/reference pair, language "it"."""
        bertscore = self._get_metric("bertscore")
        results = bertscore.compute(predictions=candidates, references=references, lang="it")
        P = float(results['precision'][0])
        R = float(results['recall'][0])
        F1 = float(results['f1'][0])
        return P, R, F1

    def evaluate_perplexity(self, text):
        """
        Return the GPT-2 perplexity of ``text`` using a sliding window of stride 512.

        Returns ``nan`` when ``text`` tokenizes to zero tokens, because there is
        nothing to score in that case.
        """
        encodings = self.gpt2_tokenizer(text, return_tensors='pt')
        seq_len = encodings.input_ids.size(1)
        if seq_len == 0:
            return float('nan')

        max_length = self.gpt2_model.config.n_positions
        stride = 512
        lls = []
        for i in range(0, seq_len, stride):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, seq_len)
            trg_len = end_loc - i
            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Mask the leading context tokens with -100 so that only the last
            # trg_len tokens of the window contribute to the loss.
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = self.gpt2_model(input_ids, labels=target_ids)
                # outputs[0] is the loss of this window; scale it by the window's target length
                log_likelihood = outputs[0] * trg_len
            lls.append(log_likelihood)
        # The last window always ends at seq_len, so this is the total token count
        ppl = torch.exp(torch.stack(lls).sum() / seq_len)
        return ppl.item()

    def evaluate_bleu(self, candidates, references):
        """Return the 'bleu' value computed over all candidates and references."""
        bleu = self._get_metric("bleu")

        results = bleu.compute(predictions=candidates, references=references)
        return results['bleu']

    def evaluate_rouge(self, candidates, references):
        """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures (with stemming) over all pairs."""
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
        rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
        rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
        rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
        return rouge1, rouge2, rougeL

    def evaluate_diversity(self, texts):
        """
        Return the ratio of unique word bigrams to the number of words in ``texts``.

        ``texts`` is an iterable of strings; a single string is treated as one
        text. Returns 0 when there are no words.
        """
        # A bare string would otherwise be iterated character by character
        if isinstance(texts, str):
            texts = [texts]
        all_tokens = [tok for text in texts for tok in text.split()]
        if not all_tokens:
            return 0
        # Consecutive token pairs, i.e. the bigrams of the token sequence
        unique_bigrams = set(zip(all_tokens, all_tokens[1:]))
        return len(unique_bigrams) / len(all_tokens)

    def evaluate_all(self, question, response, reference):
        """
        Compute every metric for one response/reference pair.

        Parameters:
        question: The question that was asked (accepted but not used by any metric).
        response (str): The generated answer.
        reference (str): The reference answer.

        Returns:
        dict: Metric name -> value, with the keys 'BLEU', 'ROUGE-1', 'ROUGE-2',
        'ROUGE-L', 'BERT P', 'BERT R', 'BERT F1', 'Perplexity' and 'Diversity'.
        """
        candidates = [response]
        references = [reference]
        bleu = self.evaluate_bleu(candidates, references)
        rouge1, rouge2, rougeL = self.evaluate_rouge(candidates, references)
        bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
        perplexity = self.evaluate_perplexity(response)
        diversity = self.evaluate_diversity(candidates)
        return {
            "BLEU": bleu,
            "ROUGE-1": rouge1,
            "ROUGE-2": rouge2,
            "ROUGE-L": rougeL,
            "BERT P": bert_p,
            "BERT R": bert_r,
            "BERT F1": bert_f1,
            "Perplexity": perplexity,
            "Diversity": diversity,
        }

In [None]:
# Instantiate the RAGEvaluator class defined in the previous cell.
my_evaluator = RAGEvaluator()

In [None]:
# Score the responses with the local evaluator. The evaluated column is whatever
# an earlier cell last assigned to model_response_df_column_evaluated.
my_library_results = evaluate_responses(eval_df, my_evaluator, model_response_df_column_evaluated)

In [None]:
def evaluate_meteor(self, candidates, references):
    """
    Return the mean METEOR score over the candidate/reference pairs.

    Written with a ``self`` parameter although it is defined at module level in
    this file. Both texts of each pair are tokenized with ``word_tokenize``.
    """
    # Fetch the 'punkt' resource from NLTK before tokenizing
    nltk.download('punkt', quiet=True)

    total = 0
    pair_count = 0
    for reference_text, candidate_text in zip(references, candidates):
        total += meteor_score([word_tokenize(reference_text)], word_tokenize(candidate_text))
        pair_count += 1
    return total / pair_count
    
def evaluate_chrf(self, candidates, references):
    """
    Return the mean sentence-level chrF score over the candidate/reference pairs.

    Written with a ``self`` parameter although it is defined at module level in this file.
    """
    total = 0
    pair_count = 0
    for reference_text, candidate_text in zip(references, candidates):
        total += sentence_chrf(reference_text, candidate_text)
        pair_count += 1
    return total / pair_count
    
def evaluate_readability(self, text):
    """
    Return (Flesch reading ease, Flesch-Kincaid grade) for ``text``.

    Written with a ``self`` parameter although it is defined at module level in this file.
    """
    return flesch_reading_ease(text), flesch_kincaid_grade(text)
        
def evaluate_mauve(self,reference_texts, generated_texts):
    """
    Return the MAUVE score between reference and generated texts.

    Written with a ``self`` parameter although it is defined at module level in this file.
    """
    mauve_options = dict(
        p_text=reference_texts,   # List of reference texts
        q_text=generated_texts,   # List of generated texts
        device_id=0,              # GPU device ID; set to -1 for CPU
        max_text_length=1024,     # Maximum length of text to truncate
        verbose=False,            # Whether to print additional information
    )
    result = mauve.compute_mauve(**mauve_options)
    return result.mauve

# SemScore

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings over dimension 1, counting only unmasked positions.

    Parameters:
    model_output: Model output whose first element contains all token embeddings.
    attention_mask: Mask with one entry per token; it is expanded over the last
        embedding dimension, so positions with value 0 do not contribute.

    Returns:
    The masked sum of the embeddings divided by the mask total. The divisor is
    clamped to at least 1e-9 so a fully masked row does not divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total

# Sanity check of the embedding model on four single Italian words: print the
# similarity of every pair of words.
sentences = ["limone", "arancia", "macchina", "soldi"]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')
model = AutoModel.from_pretrained('BAAI/bge-m3')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p = 2, dim = 1)

# The embeddings are L2-normalized, so each dot product is a cosine similarity.
for i in range(0, len(sentences)):
    for j in range(0, len(sentence_embeddings)):
        print(
            sentences[j],
            sentences[i],
            (sentence_embeddings[j] @ sentence_embeddings[i]).item()
        )
    print("\n")

In [None]:
# Sentences we want sentence embeddings for
# Same word-pair similarity check as before, with the paraphrase-multilingual-MiniLM-L12-v2 model.
sentences = ["limone", "arancia", "macchina", "soldi"]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

import torch.nn.functional as F

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p = 2, dim = 1)
sentence_embeddings

# The embeddings are L2-normalized, so each dot product is a cosine similarity.
for i in range(0, len(sentences)):
    for j in range(0, len(sentence_embeddings)):
        print(
            sentences[j],
            sentences[i],
            (sentence_embeddings[j] @ sentence_embeddings[i]).item()
        )
    print("\n")

In [None]:
# Sentences we want sentence embeddings for
# Same word-pair similarity check as before, with the paraphrase-multilingual-mpnet-base-v2 model.
sentences = ["limone", "arancia", "macchina", "soldi"]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

import torch.nn.functional as F

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p = 2, dim = 1)
sentence_embeddings

# The embeddings are L2-normalized, so each dot product is a cosine similarity.
for i in range(0, len(sentences)):
    for j in range(0, len(sentence_embeddings)):
        print(
            sentences[j],
            sentences[i],
            (sentence_embeddings[j] @ sentence_embeddings[i]).item()
        )
    print("\n")

In [None]:
# Compare the reference answer of the row labelled 0 (first element) with the
# four model responses for that same row, using BAAI/bge-m3 embeddings.
sentences = [eval_df["answer"][0], eval_df["model_response"][0], eval_df["model_response_llama_instruct"][0], eval_df["model_response_old_gpt"][0], eval_df["model_response_gpt"][0]]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')
model = AutoModel.from_pretrained('BAAI/bge-m3')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

import torch.nn.functional as F

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p = 2, dim = 1)
sentence_embeddings

# Cosine similarity of each text with the reference answer (embedding 0).
# The first printed value compares the reference with itself.
for i in range(0, len(sentences)):
    print(
        (sentence_embeddings[0] @ sentence_embeddings[i]).item()
    )


In [None]:
# Compare the reference answer of the row labelled 0 (first element) with the
# four model responses for that same row, using paraphrase-multilingual-MiniLM-L12-v2 embeddings.
sentences = [eval_df["answer"][0], eval_df["model_response"][0], eval_df["model_response_llama_instruct"][0], eval_df["model_response_old_gpt"][0], eval_df["model_response_gpt"][0]]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

import torch.nn.functional as F

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p = 2, dim = 1)
sentence_embeddings

# Cosine similarity of each text with the reference answer (embedding 0).
# The first printed value compares the reference with itself.
for i in range(0, len(sentences)):
    print(
        (sentence_embeddings[0] @ sentence_embeddings[i]).item()
    )


In [None]:
# Compare the reference answer of the row labelled 0 (first element) with the
# four model responses for that same row, using paraphrase-multilingual-mpnet-base-v2 embeddings.
sentences = [eval_df["answer"][0], eval_df["model_response"][0], eval_df["model_response_llama_instruct"][0], eval_df["model_response_old_gpt"][0], eval_df["model_response_gpt"][0]]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

import torch.nn.functional as F

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p = 2, dim = 1)
sentence_embeddings

# Cosine similarity of each text with the reference answer (embedding 0).
# The first printed value compares the reference with itself.
for i in range(0, len(sentences)):
    print(
        (sentence_embeddings[0] @ sentence_embeddings[i]).item()
    )


# Extend SemScore to entire evaluation set

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm import tqdm

def compute_sem_score(eval_df, model_name, response_keys):
    """
    Computes cosine similarity between the reference answer and different model responses in eval_df.

    For every row, the reference answer and each response are tokenized as one batch,
    embedded with the requested model, pooled with the notebook's `mean_pooling` helper,
    L2-normalized and compared with a dot product (which equals cosine similarity for
    unit-length vectors).

    Parameters:
    - eval_df: DataFrame containing "answer" and model response columns
    - model_name: HuggingFace identifier used to load both the tokenizer and the model
    - response_keys: list of column names in eval_df for different model responses

    Returns:
    - cosine_similarities_df: DataFrame of individual cosine similarities, one row per
      example (in eval_df row order) and one column per response key
    - average_cosine_similarities_df: one-row DataFrame with the mean similarity of each
      response key

    Notes:
    - Rows are read by position, so eval_df may carry any index (filtered, shuffled, ...).
    - An empty eval_df yields an empty similarities frame and NaN averages.
    - Texts longer than the tokenizer's maximum length are truncated (truncation=True).
    - NOTE(review): the same pooling is applied whatever the model; some checkpoints
      (e.g. BAAI/bge-m3) document a different pooling for their dense embeddings -
      confirm that mean pooling is intended for every model passed here.
    """
    # Accept any iterable of column names (a tuple would otherwise be read as one key)
    response_keys = list(response_keys)

    # Load specified model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Reference answer first, then the responses, in the order given by response_keys
    texts = eval_df[["answer", *response_keys]]

    # One dict of {response_key: cosine similarity} per example
    cosine_similarities_list = []

    # Iterate positionally so a non-default index cannot mis-pair or break the lookup
    for sentences in tqdm(
        texts.itertuples(index=False, name=None),
        total=len(texts),
        desc="Processing examples",
    ):
        # Tokenize sentences
        encoded_input = tokenizer(list(sentences), padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform mean pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

        # Row 0 is the reference answer; the remaining rows follow response_keys
        reference_embedding = sentence_embeddings[0]
        cosine_similarities_list.append(
            {
                key: (reference_embedding @ response_embedding).item()
                for key, response_embedding in zip(response_keys, sentence_embeddings[1:])
            }
        )

    # Fixing the columns up front keeps the frame well-formed even with zero examples
    cosine_similarities_df = pd.DataFrame(
        cosine_similarities_list, columns=response_keys, dtype=float
    )

    # Calculate average similarity scores for each model response and store in a new DataFrame
    average_cosine_similarities_df = pd.DataFrame(cosine_similarities_df[response_keys].mean()).T

    # Return both the individual and average DataFrames
    return cosine_similarities_df, average_cosine_similarities_df

In [None]:
# Columns of eval_df that hold the answers produced by each generator model
response_keys = [
    "model_response",
    "model_response_llama_instruct",
    "model_response_old_gpt",
    "model_response_gpt",
]

# SemScore with the MiniLM sentence embedder
cosine_similarities_miniLM, average_cosine_similarities_miniLM = compute_sem_score(
    eval_df,
    model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    response_keys=response_keys,
)

# SemScore with the BGE-M3 embedder
cosine_similarities_bge, average_cosine_similarities_bge = compute_sem_score(
    eval_df,
    model_name='BAAI/bge-m3',
    response_keys=response_keys,
)

# SemScore with the mpnet sentence embedder
cosine_similarities_Mpnet, average_cosine_similarities_Mpnet = compute_sem_score(
    eval_df,
    model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
    response_keys=response_keys,
)

# Show, for each embedder in turn, the per-example scores followed by the averages
for heading, per_example_scores, average_scores in (
    (
        "Model sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        cosine_similarities_miniLM,
        average_cosine_similarities_miniLM,
    ),
    (
        "Model BAAI/bge-m3",
        cosine_similarities_bge,
        average_cosine_similarities_bge,
    ),
    (
        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        cosine_similarities_Mpnet,
        average_cosine_similarities_Mpnet,
    ),
):
    print("\n")
    print(heading)
    print("Individual Cosine Similarities DataFrame:")
    display(per_example_scores)
    print("\nAverage Cosine Similarities DataFrame:")
    display(average_scores)

In [None]:
# Tag each one-row averages frame (in place) with a short label for its embedding model
for embedder_label, averages_frame in (
    ('MiniLM', average_cosine_similarities_miniLM),
    ('BGE', average_cosine_similarities_bge),
    ('Mpnet', average_cosine_similarities_Mpnet),
):
    averages_frame['model'] = embedder_label

# Stack the three average DataFrames row-wise, renumbering the rows 0..2
combined_averages_df = pd.concat(
    [
        average_cosine_similarities_miniLM,
        average_cosine_similarities_bge,
        average_cosine_similarities_Mpnet,
    ],
    ignore_index=True,
)

# Display the combined DataFrame
print("Combined Average Cosine Similarities for Different Models, comparing embeddings for Italian SemScore:")
display(combined_averages_df)

# BERTScore Analysis compared to SemScore

In [None]:
# Score the "model_response_old_gpt" column of eval_df.
# RAGEvaluator, evaluate_responses and aggregate_metrics are defined outside this section.
# The evaluator created here is reused by the following evaluation cells, so run this cell first.
evaluator = RAGEvaluator()
# Name of the eval_df column whose responses are evaluated in this cell
model_response_df_column_evaluated = "model_response_old_gpt"
# Results returned by evaluate_responses for this column (presumably one entry per example - confirm)
my_library_results_old_gpt = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(my_library_results_old_gpt)
# Aggregated metrics; this mapping is later unpacked into the 'gpt-3.5-turbo' row of the metrics table
my_metrics_old_gpt = aggregate_metrics(my_library_results_old_gpt)
display(my_metrics_old_gpt)

In [None]:
# Score the "model_response_gpt" column of eval_df.
# Relies on `evaluator` having been created in an earlier cell.
model_response_df_column_evaluated = "model_response_gpt"
# Results returned by evaluate_responses for this column (presumably one entry per example - confirm)
my_library_results_gpt = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(my_library_results_gpt)
# Aggregated metrics; this mapping is later unpacked into the 'gpt-4o' row of the metrics table
my_metrics_gpt = aggregate_metrics(my_library_results_gpt)
display(my_metrics_gpt)

In [None]:
# Score the "model_response" column of eval_df.
# Relies on `evaluator` having been created in an earlier cell.
model_response_df_column_evaluated = "model_response"
# Results returned by evaluate_responses for this column (presumably one entry per example - confirm)
my_library_results_llama = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(my_library_results_llama)
# Aggregated metrics; this mapping is later unpacked into the 'llama3.2' row of the metrics table
my_metrics_llama = aggregate_metrics(my_library_results_llama)
display(my_metrics_llama)

In [None]:
# Score the "model_response_llama_instruct" column of eval_df.
# Relies on `evaluator` having been created in an earlier cell.
model_response_df_column_evaluated = "model_response_llama_instruct"
# Results returned by evaluate_responses for this column (presumably one entry per example - confirm)
my_library_results_llama_instruct = evaluate_responses(eval_df, evaluator, model_response_df_column_evaluated)
display(my_library_results_llama_instruct)
# Aggregated metrics; this mapping is later unpacked into the 'llama3.2_Instruct' row of the metrics table
my_metrics_llama_instruct = aggregate_metrics(my_library_results_llama_instruct)
display(my_metrics_llama_instruct)

In [None]:
# Experiment settings common to every evaluated generator
shared_settings = {
    'RAG Type': 'Basic RAG',
    'Question_rewriting': False,
    'Retriver_k': 10,
    'Prompt_engineering': True,
    'Prompt_type': 'Zero-shot',
}

# One record per generator: its name, the shared settings, then its aggregated metrics
model_data = [
    {'Model': generator_name, **shared_settings, **generator_metrics}
    for generator_name, generator_metrics in (
        ('llama3.2', my_metrics_llama),
        ('llama3.2_Instruct', my_metrics_llama_instruct),
        ('gpt-3.5-turbo', my_metrics_old_gpt),
        ('gpt-4o', my_metrics_gpt),
    )
]

# Create DataFrame
df_my_metrics = pd.DataFrame(model_data)

# Metric columns whose cells are reduced to a single token below
columns_to_extract = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERT P', 'BERT R', 'BERT F1', 'Perplexity', 'Diversity']

# Render each metric cell as a string and keep only its second whitespace-separated
# token; the stored values remain strings
for column in columns_to_extract:
    cell_text = df_my_metrics[column].astype(str)
    df_my_metrics[column] = cell_text.str.split().str[1]

display(df_my_metrics)

In [None]:
# Transpose the combined averages so each former column becomes a row and each
# former row (one per embedding model) becomes a column
sem_scores_for_model = combined_averages_df.T

In [None]:
# Use the values of the 'model' row as the column headers
sem_scores_for_model.columns = sem_scores_for_model.loc['model']
# Drop the last row - assumes 'model' is the final row (i.e. it was the last column
# before transposing); verify if the upstream column order changes.
# NOTE(review): re-running this cell would fail at .loc['model'] once that row is gone.
sem_scores_for_model = sem_scores_for_model[:-1]
sem_scores_for_model

In [None]:
# Replace the row labels with a fresh 0..n-1 index (old labels are discarded, not kept
# as a column) so the frame can be joined side by side with another default-indexed frame
sem_scores_for_model.reset_index(drop = True, inplace=True)
sem_scores_for_model

In [None]:
# Put the SemScore columns next to the other metrics. Column-wise concat pairs rows by
# index label, so this relies on both frames listing the generators in the same order
# under a 0..n-1 index - verify against how each frame was built.
concatenated_metrics = pd.concat([df_my_metrics, sem_scores_for_model], axis = 1)
concatenated_metrics

In [None]:
# Final comparison table: remove the BLEU, ROUGE and BERT precision/recall columns and
# keep every other column of concatenated_metrics
final_metrics = concatenated_metrics.drop(columns=["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "BERT P", "BERT R"])
final_metrics

# TonicValidate model evaluation

In [None]:
from llama_index.evaluation.tonic_validate import (
    TonicValidateEvaluator
)

In [None]:
#Evaluating multiple questions
questions = eval_df['question']

retrieved_context_lists = []
for i in range(0, len(eval_df)):
    context = retriever.invoke(eval_df['question'][i])
    single_context_list = []
    for elem in context:
        single_context_list.append(elem.page_content)

    retrieved_context_lists.append(single_context_list)

references = eval_df['answer']
responses = eval_df['model_response']

tonic_validate_evaluator = TonicValidateEvaluator()

scores_llama = await tonic_validate_evaluator.aevaluate_run(
    questions, references, retrieved_context_lists, responses
)

In [None]:
#Evaluating multiple questions
questions = eval_df['question']

retrieved_context_lists = []
for i in range(0, len(eval_df)):
    context = retriever.invoke(eval_df['question'][i])
    single_context_list = []
    for elem in context:
        single_context_list.append(elem.page_content)

    retrieved_context_lists.append(single_context_list)

references = eval_df['answer']
responses = eval_df['model_response_gpt']

tonic_validate_evaluator = TonicValidateEvaluator()

scores_gpt = await tonic_validate_evaluator.aevaluate_run(
    questions, references, retrieved_context_lists, responses
)

In [None]:
#Evaluating multiple questions
questions = eval_df['question']

retrieved_context_lists = []
for i in range(0, len(eval_df)):
    context = retriever.invoke(eval_df['question'][i])
    single_context_list = []
    for elem in context:
        single_context_list.append(elem.page_content)

    retrieved_context_lists.append(single_context_list)

references = eval_df['answer']
responses = eval_df['model_response_llama_instruct']

tonic_validate_evaluator = TonicValidateEvaluator()

scores_llama_instruct = await tonic_validate_evaluator.aevaluate_run(
    questions, references, retrieved_context_lists, responses
)

In [None]:
#Evaluating multiple questions
questions = eval_df['question']

retrieved_context_lists = []
for i in range(0, len(eval_df)):
    context = retriever.invoke(eval_df['question'][i])
    single_context_list = []
    for elem in context:
        single_context_list.append(elem.page_content)

    retrieved_context_lists.append(single_context_list)

references = eval_df['answer']
responses = eval_df['model_response_old_gpt']

tonic_validate_evaluator = TonicValidateEvaluator()

scores_old_gpt = await tonic_validate_evaluator.aevaluate_run(
    questions, references, retrieved_context_lists, responses
)