# Import synthetic dataset

In [None]:
import pandas as pd
from datasets import Dataset

# Load the evaluation dataset stored in the local "eval_dataset" directory
eval_dataset = Dataset.load_from_disk("eval_dataset")

# Convert to a pandas DataFrame; later cells read its "question", "answer"
# and "context" columns and add the model responses to it
eval_df = eval_dataset.to_pandas()
eval_df

# RAG evaluation on eval_dataset

In [None]:
import random

# Sample one evaluation row for manual inspection.
# random.randint(0, n) can return n itself, which is one past the last row and
# makes .loc raise KeyError; randrange(n) only draws from 0..n-1.
# NOTE: the label-based .loc lookup assumes eval_df keeps its default
# 0..n-1 index.
index = random.randrange(len(eval_df))
element = eval_df.loc[index]
element

In [None]:
# Number of rows (evaluation examples) in eval_df
print(len(eval_df))

In [None]:
import pprint

# Unpack the sampled row. Note that `response` holds the dataset's reference
# answer here (the "answer" column), not a model output.
question, response, referenced_context = (
    element[field] for field in ("question", "answer", "context")
)

# Show the question followed by its reference answer
pprint.pprint(question)
pprint.pprint(response)

# Basic RAG responses

In [None]:
# Load the original dataset for RAG
import pandas as pd

filename_all_data_dict = "./Files/final_dataset.csv"

# Read every line of the CSV as data under explicit column names, then drop
# the row labelled 0 (presumably the file's own header line - confirm against
# the CSV).
data_df = pd.read_csv(
    filename_all_data_dict,
    header=None,
    names=["file", "text"],
).drop(index=0)
data_df

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Build documents from data_df, taking each document's content from the
# "text" column
loader = DataFrameLoader(data_df, page_content_column="text")
docs_data = loader.load()

In [None]:
# Split the documents into overlapping chunks (chunk_size=3000,
# chunk_overlap=500), then preview the first six chunks and the chunk count.
# NOTE(review): in the visible code `splits` is only printed; the vector store
# used for retrieval is loaded from disk rather than built from these chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
splits = text_splitter.split_documents(docs_data)
pprint.pprint(splits[0:6])
pprint.pprint(len(splits))

In [None]:
from FlagEmbedding import BGEM3FlagModel

# Embedding model 'BAAI/bge-m3', created with use_fp16=True; it is read as a
# module-level global by the M3EmbeddingFP16 adapter
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Adapter exposing the module-level ``model_fp16`` as an embedding function.

    Instances are callable and also offer the ``embed_documents`` /
    ``embed_query`` method pair, so they can be handed to code that expects
    either a plain embedding callable or an embeddings-style object.
    """

    def embed_documents(self, texts):
        """Return the dense vectors computed by ``model_fp16`` for ``texts``.

        ``texts`` is forwarded unchanged to ``model_fp16.encode`` and only the
        ``'dense_vecs'`` entry of its result is returned.
        """
        return model_fp16.encode(texts)['dense_vecs']

    def embed_query(self, text):
        """Return the dense vector for a single query string.

        Uses the same path as ``__call__``.
        NOTE(review): assumes ``model_fp16.encode`` accepts a single string
        and returns one vector for it - confirm against FlagEmbedding.
        """
        return self.embed_documents(text)

    def __call__(self, texts):
        """Make the instance usable as a plain embedding function."""
        return self.embed_documents(texts)

In [None]:
from langchain_community.vectorstores import FAISS
# Load the FAISS index saved under "local_model_index", using the BGE-M3
# adapter as embedding function.
# SECURITY: allow_dangerous_deserialization=True lets the loader unpickle the
# stored index data; only load index files that come from a trusted source.
vectorstore = FAISS.load_local("local_model_index", M3EmbeddingFP16(), allow_dangerous_deserialization=True)
# Number of vectors held by the index (shown as the cell output)
vectorstore.index.ntotal

In [None]:
# Retriever over the vector store, configured to return k=4 chunks per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
from langchain_ollama import ChatOllama

# llama3.2 chat model accessed through ChatOllama; temperature=0 is used for
# the generator of the first RAG chain
model_llama = ChatOllama(
    model="llama3.2",
    temperature=0
)

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt (Italian text, sent verbatim to the model).
# It asks the model to act as a customer-support assistant for the Panthera
# software, to answer only from the supplied documents with concise,
# step-by-step guidance, and to say when extra questions are out of scope.
# Placeholders: {context} = retrieved documents, {question} = user question.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

# Chat prompt shared by both RAG chains in this notebook
prompt = ChatPromptTemplate.from_template(template)
# Bare expression in the middle of the cell: it has no effect on the chain
prompt

# Post-processing
def format_docs(splits):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [chunk.page_content for chunk in splits]
    return "\n\n".join(contents)

# Chain: the input question is sent to the retriever, whose documents are
# flattened by format_docs into {context}, and is also passed through
# unchanged as {question}; the filled prompt goes to the llama3.2 model and
# the reply is parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama
    | StrOutputParser()
)

In [None]:
# Generate a llama3.2 RAG answer for every evaluation question.
# The chain is invoked with the raw question string (RunnablePassthrough
# forwards it into the prompt), not with a dict.
for idx, question in eval_df["question"].items():
    response = rag_chain.invoke(question)

    # Keep the answer next to its question in the "model_response" column
    eval_df.at[idx, "model_response"] = response

# Display the updated DataFrame to verify
eval_df

In [None]:
# Extract a single data point to test on from the evaluation set
import random

# random.randint(0, n) can return n itself, which is one past the last row and
# makes .loc raise KeyError; randrange(n) only draws from 0..n-1.
# NOTE: the label-based .loc lookup assumes eval_df keeps its default
# 0..n-1 index.
index = random.randrange(len(eval_df))
element = eval_df.loc[index]
element

In [None]:
# Unpack the sampled row: its question, the reference answer from the dataset
# and the llama3.2 answer generated above
question, reference, response = (
    element[field] for field in ("question", "answer", "model_response")
)

In [None]:
# Show the sampled question, its reference answer and the llama3.2 answer
pprint.pprint(question)
pprint.pprint(reference)
pprint.pprint(response)

In [None]:
# Evaluate the RAG answers with the rag_evaluator package
from rag_evaluator import RAGEvaluator

# Initialize the evaluator; the same instance is reused by every evaluation
# cell in this notebook
evaluator = RAGEvaluator()

In [None]:
# Evaluate the sampled llama3.2 answer against its reference answer
pprint.pprint(question)
metrics = evaluator.evaluate_all(question, response, reference)

# Print the results (`metrics` is later unpacked with ** into a table row, so
# it is expected to be a mapping of metric name to score)
pprint.pprint(metrics)

In [None]:
import os
import warnings

# Tracing configuration for the chains run below
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The API keys must already be present in the environment.
# Re-assigning `os.environ[key] = os.getenv(key)` changes nothing when the key
# is set and raises "TypeError: str expected, not NoneType" when it is not, so
# missing keys are reported explicitly instead. A warning (not an exception)
# keeps the cells that do not need these keys runnable.
for _required_key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    if _required_key not in os.environ:
        warnings.warn(
            f"Environment variable {_required_key} is not set; "
            "calls that depend on it are expected to fail."
        )

In [None]:
# NOTE(review): built from the same template as `prompt`, but the chain
# defined in this cell pipes `prompt`, so this object is not used in the
# visible code.
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Post-processing
def format_docs(splits):
    """Return the documents' ``page_content`` values joined by blank lines."""
    parts = []
    for document in splits:
        parts.append(document.page_content)
    return "\n\n".join(parts)

# Chain: same retrieval and prompt as the llama3.2 chain, but the answer is
# generated by gpt-4o (temperature=0) and parsed into a plain string
rag_chain_gpt = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | ChatOpenAI(temperature=0, model="gpt-4o")
    | StrOutputParser()
)

In [None]:
# Generate a gpt-4o RAG answer for every evaluation question.
# The chain is invoked with the raw question string (RunnablePassthrough
# forwards it into the prompt), not with a dict.
for idx, question in eval_df["question"].items():
    response = rag_chain_gpt.invoke(question)

    # Keep the answer next to its question in the "model_response_gpt" column
    eval_df.at[idx, "model_response_gpt"] = response

# Display the updated DataFrame to verify
eval_df

In [None]:
# Re-read the sampled row now that it also carries the gpt-4o answer
element = eval_df.loc[index]

# The generation loops reuse the name `question` and leave it bound to the
# LAST question of the dataset, so restore the sampled row's question and
# reference before they are handed to the evaluator in the next cell.
question = element["question"]
reference = element["answer"]
response_gpt = element["model_response_gpt"]

In [None]:
# Evaluate the sampled gpt-4o answer against the reference answer.
# `question` and `reference` are notebook-level variables set in earlier cells.
pprint.pprint(question)
metrics_gpt = evaluator.evaluate_all(question, response_gpt, reference)

# Print the results
pprint.pprint(metrics_gpt)

In [None]:
import pandas as pd

# Save eval_df, including the answers of both models, to CSV
# (index=False: the row index is not written to the file)
eval_df.to_csv('eval_dataset_llama3.2_against_gpt.csv', index=False)

In [None]:
# Print the single-data-point metrics of both generators one after the other.
# The two headings go through pprint, so they are printed as quoted strings.
print("Compare models metrics:")
print("")
pprint.pprint("Metrics llama3.2 generator:")
pprint.pprint(metrics)
print("\n")
pprint.pprint("Metrics gpt-4o generator:")
pprint.pprint(metrics_gpt)

## Results of an evaluation of a single data point

In [None]:
# One table row per generator for the sampled data point: descriptive fields
# followed by the metric scores unpacked from `metrics` / `metrics_gpt`
model_data = [
    {'Model': 'llama3.2', 'RAG Type': 'Basic RAG', 'Question_rewriting' : False, **metrics},
    {'Model': 'gpt-4o', 'RAG Type': 'Basic RAG', 'Question_rewriting': False,  **metrics_gpt}
]

# Create DataFrame
df_metrics = pd.DataFrame(model_data)

# Display the DataFrame
display(df_metrics)

# Evaluation on all available data points

In [None]:
# Evaluate each llama3.2 entry - (question, response, reference).
# A comprehension is used so that the notebook-level `index`, `question`,
# `response` and `reference` of the sampled data point are not rebound by a
# loop variable.
results = [
    evaluator.evaluate_all(row['question'], row['model_response'], row['answer'])
    for _, row in eval_df.iterrows()
]

# Convert results to a DataFrame (one row per evaluated example)
results_df = pd.DataFrame(results)

# Display the final DataFrame with evaluations
display(results_df)

In [None]:
# Aggregate metrics to get a single evaluation for the model:
# the mean of every per-example score column of results_df
aggregated_results = {
    metric: results_df[metric].mean()
    for metric in (
        "BLEU",
        "ROUGE-1",
        "BERT P",
        "BERT R",
        "BERT F1",
        "Perplexity",
        "Diversity",
        "Racial Bias",
    )
}

# Single-row DataFrame for better readability
aggregated_results_df = pd.DataFrame(aggregated_results, index=[0])

# Display the aggregated results
display(aggregated_results_df)

In [None]:
# Evaluate each gpt-4o entry - (question, response, reference).
# A comprehension is used so that the notebook-level `index`, `question`,
# `response` and `reference` of the sampled data point are not rebound by a
# loop variable.
results = [
    evaluator.evaluate_all(row['question'], row['model_response_gpt'], row['answer'])
    for _, row in eval_df.iterrows()
]

# Convert results to a DataFrame (one row per evaluated example)
results_df_gpt = pd.DataFrame(results)

# Display the final DataFrame with evaluations
display(results_df_gpt)

In [None]:
# Aggregate metrics to get a single evaluation for the model:
# the mean of every per-example score column of results_df_gpt
aggregated_results_gpt = {
    metric: results_df_gpt[metric].mean()
    for metric in (
        "BLEU",
        "ROUGE-1",
        "BERT P",
        "BERT R",
        "BERT F1",
        "Perplexity",
        "Diversity",
        "Racial Bias",
    )
}

# Single-row DataFrame for better readability
aggregated_results_df_gpt = pd.DataFrame(aggregated_results_gpt, index=[0])

# Display the aggregated results
display(aggregated_results_df_gpt)

In [None]:
# Show the aggregated scores of both generators: llama3.2 first, then gpt-4o
display(aggregated_results_df)
display(aggregated_results_df_gpt)

In [None]:
# One table row per generator: descriptive fields followed by the mean metric
# scores unpacked from `aggregated_results` / `aggregated_results_gpt`
model_data_aggregated_eval = [
    {'Model': 'llama3.2', 'RAG Type': 'Basic RAG', 'Question_rewriting' : False, **aggregated_results},
    {'Model': 'gpt-4o', 'RAG Type': 'Basic RAG', 'Question_rewriting': False,  **aggregated_results_gpt}
]

# Create DataFrame
df_metrics_aggregated_data = pd.DataFrame(model_data_aggregated_eval)

# Display the DataFrame
display(df_metrics_aggregated_data)

In [None]:
# Side-by-side summary: single sampled data point vs. mean over the whole set
print("Evaluation for a single data point")
display(df_metrics)

# Report the actual number of evaluated rows rather than a hard-coded count,
# so the heading stays correct when the evaluation set changes size
print(f"Evaluation for more data points ({len(eval_df)}) - mean evaluation")
display(df_metrics_aggregated_data)

In [None]:
import matplotlib.pyplot as plt

# Running totals of each plotted metric over the rows evaluated so far
cumulative_metrics = {
    "BLEU": 0,
    "ROUGE-1": 0,
    "BERT P": 0,
    "BERT R": 0,
    "BERT F1": 0,
    "Perplexity": 0,
    "Diversity": 0,
}

# List to store running means
running_means = []

# Evaluate each llama3.2 answer and record the mean of every metric so far.
# The divisor is the 1-based row count from enumerate(), not the DataFrame
# index label, so the means stay correct whatever index eval_df carries.
# Nothing is appended to the notebook-level `results` list here, so the
# per-model results collected earlier stay untouched, and the sampled
# `index` / `question` / `response` / `reference` are not rebound.
for n_evaluated, (_, row) in enumerate(eval_df.iterrows(), start=1):
    evaluation = evaluator.evaluate_all(
        row['question'], row['model_response'], row['answer']
    )

    # Update cumulative sums
    for metric in cumulative_metrics:
        cumulative_metrics[metric] += evaluation[metric]

    # Snapshot of the means after n_evaluated rows
    running_means.append(
        {metric: total / n_evaluated for metric, total in cumulative_metrics.items()}
    )

# Create a DataFrame for running means; explicit columns keep the frame
# plottable even when eval_df is empty
running_means_df = pd.DataFrame(running_means, columns=list(cumulative_metrics))

# Plotting the evolution of metrics
plt.figure(figsize=(12, 8))

for metric in cumulative_metrics:
    plt.plot(running_means_df.index + 1, running_means_df[metric], marker='o', label=metric)

plt.title('Progression of Evaluation Metrics With more data for Llama3.2')
plt.xlabel('Number of Evaluation Points (N)')
plt.ylabel('Mean Metric Value')
plt.axhline(y=0, color='grey', linestyle='--')  # Optional: Add a horizontal line at y=0 for reference
plt.legend()
plt.grid()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Running totals of each plotted metric over the rows evaluated so far
cumulative_metrics = {
    "BLEU": 0,
    "ROUGE-1": 0,
    "BERT P": 0,
    "BERT R": 0,
    "BERT F1": 0,
    "Perplexity": 0,
    "Diversity": 0,
}

# List to store running means
running_means = []

# Evaluate each gpt-4o answer and record the mean of every metric so far.
# The divisor is the 1-based row count from enumerate(), not the DataFrame
# index label, so the means stay correct whatever index eval_df carries.
# Nothing is appended to the notebook-level `results` list here, so the
# per-model results collected earlier stay untouched, and the sampled
# `index` / `question` / `response` / `reference` are not rebound.
for n_evaluated, (_, row) in enumerate(eval_df.iterrows(), start=1):
    evaluation = evaluator.evaluate_all(
        row['question'], row['model_response_gpt'], row['answer']
    )

    # Update cumulative sums
    for metric in cumulative_metrics:
        cumulative_metrics[metric] += evaluation[metric]

    # Snapshot of the means after n_evaluated rows
    running_means.append(
        {metric: total / n_evaluated for metric, total in cumulative_metrics.items()}
    )

# Create a DataFrame for running means; explicit columns keep the frame
# plottable even when eval_df is empty
running_means_df = pd.DataFrame(running_means, columns=list(cumulative_metrics))

# Plotting the evolution of metrics
plt.figure(figsize=(12, 8))

for metric in cumulative_metrics:
    plt.plot(running_means_df.index + 1, running_means_df[metric], marker='o', label=metric)

plt.title('Progression of Evaluation Metrics With more data for GPT-4o')
plt.xlabel('Number of Evaluation Points (N)')
plt.ylabel('Mean Metric Value')
plt.axhline(y=0, color='grey', linestyle='--')  # Optional: Add a horizontal line at y=0 for reference
plt.legend()
plt.grid()
plt.show()
