# Required information
#### Make sure to fill this cell to run the file

In [None]:
import torch

# Pinecone API key as a string
PINECONE_API_KEY = ""

# Cohere API key as a string
COHERE_API_KEY = ""

# Directory where all the datasets created while running this file will be saved
datasets_saving_dir = ""

# Directory to the ground-truth dataset provided in the GitHub repository
ground_truth_df_dir = ""

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Loading the Neural Bridge RAG Dataset

In [None]:
import pandas as pd
splits = {'train': 'data/train-00000-of-00001-9df3a936e1f63191.parquet', 'test': 'data/test-00000-of-00001-af2a9f454ad1b8a3.parquet'}
df = pd.read_parquet("hf://datasets/neural-bridge/rag-dataset-12000/" + splits["train"])
df['words_count'] = df['context'].apply(lambda text: len(text.split()))
df = df.reset_index()
df['ground_truth'] = ""
df.head()

#### Basic info and stats of the dataset

In [None]:
min_length = min(df['words_count'])
max_length = max(df['words_count'])
avg_length = df['words_count'].mean()
med_length = df['words_count'].median()
print(f'min_length: {min_length}')
print(f'max_length: {max_length}')
print(f'avg_length: {avg_length}')
print(f'med_length: {med_length}')

#### Keeping only the rows whose texts are longer than 550 words, and printing info and stats

In [None]:
large_df = df[df['words_count'] > 550]

# Basic info about the df with large documents
min_length = min(large_df['words_count'])
max_length = max(large_df['words_count'])
avg_length = large_df['words_count'].mean()
med_length = large_df['words_count'].median()
print(f'min_length: {min_length}')
print(f'max_length: {max_length}')
print(f'avg_length: {avg_length}')
print(f'med_length: {med_length}')

#### Now we will create the Document objects from the dataframe, which are needed for the LangChain chunking methods and the vector database

In [None]:
# Create Document objects list from the df
from langchain.schema import Document
documents = []
for _, row in large_df.iterrows():
    doc = Document(
        page_content=row['context'],
        metadata={'idx': row['index'], 'num_words': row['words_count']},
    )
    documents.append(doc)

q_and_a_df = large_df[['index', 'question', 'answer']]

# Measuring chunking time and size

In [None]:
# Init a RecursiveChunker whose chunk sizes are measured in embedding-model tokens
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# NOTE(review): this line was annotated "512 tokens as input", but the model
# card for all-MiniLM-L6-v2 reports that inputs longer than 256 word pieces are
# truncated by default -- confirm that 500-token chunks are embedded in full.
embeddings_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings_model_tokenizer = AutoTokenizer.from_pretrained(embeddings_model_name)


def _count_tokens(text):
    """Return how many token ids the embedding tokenizer produces for *text*."""
    token_ids = embeddings_model_tokenizer.encode(text=text, truncation=False)
    return len(token_ids)


# Separator candidates handed to the splitter, listed from coarsest to finest.
_rec_separators = ['\n\n', '\n', '.', '?', '!', ';', ',', ' ', '']

rec_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=_rec_separators,
    length_function=_count_tokens,
)

In [None]:
# Init a SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embeddings_model = HuggingFaceEmbeddings(model_name=embeddings_model_name, model_kwargs={'device': device})

sem_text_splitter = SemanticChunker(embeddings=embeddings_model)

In [None]:
# Init a SemanticGraphChunker
from semantic_graph_chunker import SemanticGraphChunker
sem_g_text_splitter = SemanticGraphChunker()

#### Defining a function to measure chunking times and sizes for the chunking methods

In [None]:
# Function to measure chunking times

def measure_chunking_time_and_num_of_chunks(documents, text_splitter, intervals):
    """Chunk *documents* slice by slice and record the cumulative cost.

    ``intervals`` is a sequence of increasing document counts. For each entry
    only the documents between the previous entry and the current one are
    split, and the running totals are recorded, so every document is chunked
    at most once.

    Args:
        documents: Sliceable sequence of documents accepted by
            ``text_splitter.split_documents``.
        text_splitter: Object exposing ``split_documents(docs)`` that returns a
            list of chunk objects with a mutable ``metadata`` dict.
        intervals: Cumulative document counts at which to take a measurement.

    Returns:
        A ``(times, num_chunks, all_chunks)`` tuple: cumulative seconds spent
        in ``split_documents`` after each interval, cumulative number of chunks
        after each interval, and the list of every chunk produced. Each chunk
        gets ``metadata['chunk_index']`` set to its position in ``all_chunks``.
    """
    # Imported locally: the notebook only imports these in later cells, so a
    # top-to-bottom run would otherwise raise NameError here.
    import time

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; keep measuring without it.
        def tqdm(iterable, **_kwargs):
            return iterable

    times = []
    num_chunks = []
    all_chunks = []
    total_time = 0
    prev_interval = 0

    for interval in tqdm(intervals, desc="Processing intervals"):
        # Only process the documents added since the previous interval.
        docs = documents[prev_interval:interval]

        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()
        chunks = text_splitter.split_documents(docs)
        total_time += time.perf_counter() - start_time

        # Number chunks by their position in the overall output. Offsetting by
        # the *document* boundary (prev_interval) made indices from different
        # intervals collide whenever a slice produced more chunks than it had
        # documents.
        first_index = len(all_chunks)
        for offset, chunk in enumerate(chunks):
            chunk.metadata['chunk_index'] = first_index + offset
        all_chunks.extend(chunks)

        # Store cumulative results for this interval.
        times.append(total_time)
        num_chunks.append(len(all_chunks))

        prev_interval = interval

    print('Done chunking for all intervals')
    return times, num_chunks, all_chunks

In [None]:
# Documents intervals
# Cumulative document counts at which measurements are taken. Marks at or beyond
# the corpus size are dropped so the list stays strictly increasing and always
# ends at len(documents), even when there are 4500 documents or fewer.
_interval_marks = [10, 50, 100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500]
intervals = [mark for mark in _interval_marks if mark < len(documents)] + [len(documents)]


#### Executing the function for each of the chunking methods

In [None]:
# Measuring times for each chunking method
rec_times, rec_num_chunks, rec_chunks = measure_chunking_time_and_num_of_chunks(documents=documents,text_splitter=rec_text_splitter, intervals=intervals)

sem_g_times, sem_g_num_chunks, sem_g_chunks = measure_chunking_time_and_num_of_chunks(documents=documents,text_splitter=sem_g_text_splitter, intervals=intervals)

sem_times, sem_num_chunks, sem_chunks = measure_chunking_time_and_num_of_chunks(documents=documents,text_splitter=sem_text_splitter, intervals=intervals)

### Plotting results

In [None]:
# Plotting results for time
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.plot(intervals, rec_times, label="RecursiveChunker")
plt.plot(intervals, sem_times, label="SemanticChunker")
plt.plot(intervals, sem_g_times, label="SemanticGraphChunker")

# Add labels, title, and legend
plt.xlabel("Number of Documents")
plt.ylabel("Time (seconds)")
plt.title("Chunking Time Comparisons")
plt.legend(loc="upper left")

# Display the plot
plt.grid(True)
plt.show()

In [None]:
# Plotting results for number of chunks
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.plot(intervals, rec_num_chunks, label="RecursiveChunker")
plt.plot(intervals, sem_num_chunks, label="SemanticChunker")
plt.plot(intervals, sem_g_num_chunks, label="SemanticGraphChunker")

# Add labels, title, and legend
plt.xlabel("Number of Documents")
plt.ylabel("Number of Chunks")
plt.title("Comparing number of Chunks")
plt.legend(loc="upper left")

# Display the plot
plt.grid(True)
plt.show()

# Setting up vectorstores for each chunking method


In [None]:
# Setting up vectorstores


def create_vectorstore(documents, index_name):
    """Create (if needed) a Pinecone index and upload *documents* into it.

    The index is created with cosine similarity on AWS ``us-east-1`` when no
    index named ``index_name`` exists yet, and the function waits until
    Pinecone reports it as ready. Documents are then added in batches of 100.

    Note that the documents are added on every call, including when the index
    already existed before the call.

    Args:
        documents: Sliceable sequence of documents to embed and upload.
        index_name: Name of the Pinecone index to create or reuse.

    Returns:
        The ``PineconeVectorStore`` wrapping the index.
    """
    # Imported locally: the notebook only imports these names in later cells,
    # so calling this function in a top-to-bottom run raised NameError.
    import time

    from langchain_pinecone import PineconeVectorStore
    from pinecone import Pinecone, ServerlessSpec
    from tqdm import tqdm

    embedding_model = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': device},
    )
    pc = Pinecone(api_key=PINECONE_API_KEY)

    existing_names = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name not in existing_names:
        pc.create_index(
            name=index_name,
            # Embed a throwaway string to discover the vector dimension.
            dimension=len(embedding_model.embed_query('dummy')),
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region='us-east-1'),
        )
        # Poll until the freshly created index is reported as ready.
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)

    index = pc.Index(index_name)
    vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)

    # Upload sequentially in fixed-size batches, with a progress bar.
    batch_size = 100
    batch_starts = range(0, len(documents), batch_size)
    for start in tqdm(batch_starts, desc="Adding documents to Pinecone"):
        vectorstore.add_documents(documents=documents[start:start + batch_size])

    return vectorstore

#### Creating the Pinecone vectorstores

In [None]:
rec_vectorstore = create_vectorstore(documents=rec_chunks, index_name='rec-index')
sem_vectorstore = create_vectorstore(documents=sem_chunks, index_name='sem-index')
sem_g_vectorstore = create_vectorstore(documents=sem_g_chunks, index_name='sem-g-index')

# Creating the ground_truth df for each method
For each method, this dataframe extends the ground-truth rows with the ids of the chunks (created by that method) that contain a ground-truth reference for the question associated with the chunk's document


In [None]:
import os

import pandas as pd

# Honour the location configured in the first cell. It may point either at the
# spreadsheet itself or at the folder containing it; when it is left empty, fall
# back to the repository-relative default.
ground_truth_path = ground_truth_df_dir or 'evaluation_datasets/ground_truth.xlsx'
if os.path.isdir(ground_truth_path):
    ground_truth_path = os.path.join(ground_truth_path, 'ground_truth.xlsx')
ground_truth_df = pd.read_excel(ground_truth_path)

def _normalize_references(references):
    """Return the ground-truth *references* as a list of non-empty strings."""
    import ast

    if references is None:
        return []
    if isinstance(references, str):
        text = references.strip()
        # A spreadsheet cell can only hold a list in its textual form, e.g.
        # "['ref one', 'ref two']"; parse that back into a real list.
        if text.startswith('['):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return [str(item) for item in parsed if str(item).strip()]
        # Otherwise the whole string is one reference. Iterating it would
        # compare single characters and match almost every chunk.
        return [text] if text else []
    try:
        items = list(references)
    except TypeError:
        # Non-iterable scalars (e.g. NaN from an empty cell) carry no text.
        return []
    return [str(item) for item in items if str(item).strip()]


def create_method_ground_truth_df(chunks, ground_truth_df):
    """Attach to each ground-truth row the ids of the chunks that contain it.

    A chunk is relevant for a row when its ``metadata['idx']`` equals the row's
    ``index`` (compared as strings) and at least one of the row's
    ``ground_truth`` references occurs in the chunk text, ignoring case.

    Args:
        chunks: Iterable of chunk objects with ``page_content`` and a
            ``metadata`` dict holding ``idx`` and ``chunk_index``.
        ground_truth_df: DataFrame with ``index`` and ``ground_truth`` columns.
            ``ground_truth`` may be a list of strings, the textual form of such
            a list, or a single string.

    Returns:
        A copy of ``ground_truth_df`` with an added ``relevant_chunks_ids``
        column holding, per row, the list of matching ``chunk_index`` values.
    """
    method_df = ground_truth_df.copy()

    # Group the chunks by document id once instead of rescanning every chunk
    # for every row.
    relevant_keys = {str(value) for value in ground_truth_df['index']}
    chunks_by_doc = {}
    for chunk in chunks:
        key = str(chunk.metadata.get('idx', ''))
        if key in relevant_keys:
            chunks_by_doc.setdefault(key, []).append(chunk)
    total_relevant = sum(len(group) for group in chunks_by_doc.values())
    print(f"Total relevant chunks: {total_relevant}")

    relevant_chunks_ids = []
    for i, row in ground_truth_df.iterrows():
        idx = row['index']
        print(f"Checking idx: {idx}")

        relevant_chunks_for_row = chunks_by_doc.get(str(idx), [])
        print(f"Relevant chunks for row {i}: {len(relevant_chunks_for_row)}")

        references = [ref.lower() for ref in _normalize_references(row['ground_truth'])]
        chunks_ids = []
        for chunk in relevant_chunks_for_row:
            content = chunk.page_content.lower()
            # Keep the chunk when any reference is a substring of its text.
            if any(reference in content for reference in references):
                chunks_ids.append(chunk.metadata['chunk_index'])
        print(f"Relevant chunks ids: {len(chunks_ids)}")
        relevant_chunks_ids.append(chunks_ids)

    method_df['relevant_chunks_ids'] = relevant_chunks_ids
    return method_df


In [None]:
rec_ground_truth_df = create_method_ground_truth_df(rec_chunks,ground_truth_df)
sem_ground_truth_df = create_method_ground_truth_df(sem_chunks,ground_truth_df)
sem_g_ground_truth_df = create_method_ground_truth_df(sem_g_chunks,ground_truth_df)

#### Saving as Parquet file

In [None]:
rec_ground_truth_df.to_parquet(f'{datasets_saving_dir}/rec_ground_truth.parquet', index=False, engine='pyarrow')
sem_ground_truth_df.to_parquet(f'{datasets_saving_dir}/sem_ground_truth.parquet', index=False, engine='pyarrow')
sem_g_ground_truth_df.to_parquet(f'{datasets_saving_dir}/sem_g_ground_truth.parquet', index=False, engine='pyarrow')

# It is possible to start here if the Pinecone vectordbs are set


In [None]:
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
import torch

# loading the vectorstores again from pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': device})

index = pc.Index('rec-index')
rec_vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)

index = pc.Index('sem-index')
sem_vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)

index = pc.Index('sem-g-index')
sem_g_vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)

#### Setting up the retrievers, with k=20

In [None]:
# Set retrievers
rec_retriever = rec_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 20}
)

sem_retriever = sem_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 20}
)

sem_g_retriever = sem_g_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 20}
)

#### Loading the Parquet ground-truth files we created for each of the methods

In [None]:
# Load ground_truth datasets
import pandas as pd

rec_ground_truth_df = pd.read_parquet(f'{datasets_saving_dir}/rec_ground_truth.parquet', engine='pyarrow')
sem_ground_truth_df = pd.read_parquet(f'{datasets_saving_dir}/sem_ground_truth.parquet', engine='pyarrow')
sem_g_ground_truth_df = pd.read_parquet(f'{datasets_saving_dir}/sem_g_ground_truth.parquet', engine='pyarrow')


# Compute Recall@k, Precision@k


In [None]:
def recall_and_precision_at_k(retriever, ground_truth_df_row, k=5):
    """Compute Recall@k and Precision@k for one ground-truth row.

    A retrieved document counts as a hit when its ``metadata['idx']`` equals
    the row's ``index`` and its ``metadata['chunk_index']`` is listed in the
    row's ``relevant_chunks_ids``.

    Args:
        retriever: Object exposing ``invoke(query)`` that returns a ranked
            sequence of documents with a ``metadata`` dict.
        ground_truth_df_row: Mapping with ``question``, ``index`` and
            ``relevant_chunks_ids`` entries.
        k: Number of top-ranked documents to consider; must be positive.

    Returns:
        ``(recall, precision)``. Both are ``0.0`` when the row has no relevant
        chunks. Precision always divides by ``k``, even if the retriever
        returned fewer than ``k`` documents.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    query = ground_truth_df_row['question']
    query_id = ground_truth_df_row['index']

    # Materialise as a list: the column may come back from Parquet as an array.
    ground_truth_relevant_docs_idx = list(ground_truth_df_row['relevant_chunks_ids'])

    # Nothing can be recalled without relevant chunks. Return a pair (not a
    # bare float) so callers can always unpack, and skip the retrieval call.
    if not ground_truth_relevant_docs_idx:
        return 0.0, 0.0

    retrieved_at_k = retriever.invoke(query)[:k]

    hits = sum(
        1
        for doc in retrieved_at_k
        if doc.metadata['idx'] == query_id
        and doc.metadata['chunk_index'] in ground_truth_relevant_docs_idx
    )

    recall = hits / len(ground_truth_relevant_docs_idx)
    precision = hits / k
    return recall, precision


def get_mean_recall_and_precision(retriever, ground_truth_df, k_values):
    """Average Recall@k and Precision@k over every row, for each k.

    Args:
        retriever: Retriever forwarded to ``recall_and_precision_at_k``.
        ground_truth_df: DataFrame whose rows are scored one by one.
        k_values: Iterable of cut-offs to evaluate.

    Returns:
        ``(recall_scores, precision_scores)``: two lists aligned with
        ``k_values`` holding the per-k means.
    """
    row_count = len(ground_truth_df)
    recall_scores, precision_scores = [], []

    for k in k_values:
        # Score every question at this cut-off.
        per_row = [
            recall_and_precision_at_k(retriever, question_row, k)
            for _, question_row in ground_truth_df.iterrows()
        ]
        recall_total = sum(recall for recall, _ in per_row)
        precision_total = sum(precision for _, precision in per_row)

        recall_scores.append(recall_total / row_count)
        precision_scores.append(precision_total / row_count)

    return recall_scores, precision_scores

#### Executing the recall and precision calculation function, for each of the chunking methods

In [None]:
k_values = [1,3,5,10,15,20]

rec_recalls, rec_precision = get_mean_recall_and_precision(rec_retriever, rec_ground_truth_df, k_values)
sem_recalls, sem_precision = get_mean_recall_and_precision(sem_retriever, sem_ground_truth_df, k_values)
sem_g_recalls, sem_g_precision = get_mean_recall_and_precision(sem_g_retriever, sem_g_ground_truth_df, k_values)


### Plotting Recall@k and Precision@k results

In [None]:
# Plotting results for Precision@k
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.plot(k_values, rec_precision, label="RecursiveChunker")
plt.plot(k_values, sem_precision, label="SemanticChunker")
plt.plot(k_values, sem_g_precision, label="SemanticGraphChunker")

# Add labels, title, and legend
plt.xlabel("K value")
plt.ylabel("Mean Precision@k score")
plt.title("Comparing Precision@k")
plt.legend(loc="upper right")

# Display the plot
plt.grid(True)
plt.show()

In [None]:
# Plotting results for Recall@k
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.plot(k_values, rec_recalls, label="RecursiveChunker")
plt.plot(k_values, sem_recalls, label="SemanticChunker")
plt.plot(k_values, sem_g_recalls, label="SemanticGraphChunker")

# Add labels, title, and legend
plt.xlabel("K value")
plt.ylabel("Mean Recall@k score")
plt.title("Comparing Recall@k")
plt.legend(loc="upper left")

# Display the plot
plt.grid(True)
plt.show()

# Generate answers for questions in the dataframes
This is the execution of the RAG pipeline, and the settings for the RAGAS evaluation

#### Defining the prompt for the generator LLM in the pipeline

In [None]:
from langchain.prompts import PromptTemplate


template = """
You are an intelligent assistant. Answer the query strictly based on the given documents.

Question: {query}

Documents:
{documents}

Instructions:
- Respond **only** based on the provided information.
- If the information is insufficient, only respond "Not enough information to answer."
"""

prompt = PromptTemplate.from_template(template)


#### Setting up the LLM from Cohere

In [None]:
from langchain_community.llms import Cohere

llm = Cohere(
    model="command-xlarge-nightly",  # Cohere's large model for Q&A with extended context
    temperature=0.3,                # Control the randomness of the responses
    max_tokens=180,                # Set the maximum response length
    cohere_api_key=COHERE_API_KEY
)

#### Creating a LangChain chain function to generate response

In [None]:
def generate_answer(query, docs):
    """Run the prompt -> LLM -> string-parser chain for *query* over *docs*.

    Each document's ``page_content`` is labelled ``Document <n>:`` (numbered
    from 1) and the labelled texts are joined with blank lines before being
    substituted into the prompt together with the query.

    Returns:
        The value produced by invoking the chain.
    """
    from langchain.schema import StrOutputParser

    labelled_docs = []
    for position, doc in enumerate(docs, start=1):
        labelled_docs.append(f'Document {position}:\n{doc.page_content}')
    context_block = '\n\n'.join(labelled_docs)

    pipeline = prompt | llm | StrOutputParser()
    payload = {'query': query, 'documents': context_block}
    return pipeline.invoke(payload)


## Creating the dataset for RAG evaluation by generating answers to the questions, using the RAG pipeline of retriever followed by generator

In [None]:
from tqdm import tqdm

def creat_evaluation_df(retriever, ground_truth_df):
    """Build the evaluation DataFrame by answering every ground-truth question.

    For each row, the question is sent to ``retriever``; the retrieved
    documents are passed to ``generate_answer`` and both the answer and the
    documents' texts are recorded.

    Args:
        retriever: Object exposing ``invoke(query)``.
        ground_truth_df: DataFrame with a ``question`` column.

    Returns:
        A copy of ``ground_truth_df`` with two added columns:
        ``retrieved_documents`` (list of page contents per row) and
        ``generated_answer``.
    """
    answers = []
    contexts = []

    # Progress bar over the rows; total is given because iterrows() has no len.
    rows = tqdm(
        ground_truth_df.iterrows(),
        total=len(ground_truth_df),
        desc="Processing queries",
    )
    for _, row in rows:
        question = row['question']
        hits = retriever.invoke(question)

        answers.append(generate_answer(query=question, docs=hits))
        contexts.append([hit.page_content for hit in hits])

    result = ground_truth_df.copy()
    result['retrieved_documents'] = contexts
    result['generated_answer'] = answers
    return result


#### Executing the function

In [None]:
rec_evaluation_df = creat_evaluation_df(rec_retriever, rec_ground_truth_df)
sem_evaluation_df = creat_evaluation_df(sem_retriever, sem_ground_truth_df)
sem_g_evaluation_df = creat_evaluation_df(sem_g_retriever, sem_g_ground_truth_df)

#### Saving the evaluation datasets as Parquet files

In [None]:
rec_evaluation_df.to_parquet(f'{datasets_saving_dir}/rec_evaluation.parquet', index=False, engine='pyarrow')
sem_evaluation_df.to_parquet(f'{datasets_saving_dir}/sem_evaluation.parquet', index=False, engine='pyarrow')
sem_g_evaluation_df.to_parquet(f'{datasets_saving_dir}/sem_g_evaluation.parquet', index=False, engine='pyarrow')

### Loading the evaluation datasets.
 It is possible to start from here, for the RAGAS results only, if the evaluation datasets are saved

In [None]:
import pandas as pd

# Read the evaluation sets back from datasets_saving_dir, the directory the
# saving cell writes them to, instead of a hard-coded relative folder. This
# keeps the save and load steps pointing at the same files.
rec_evaluation_df = pd.read_parquet(f'{datasets_saving_dir}/rec_evaluation.parquet', engine='pyarrow')
sem_evaluation_df = pd.read_parquet(f'{datasets_saving_dir}/sem_evaluation.parquet', engine='pyarrow')
sem_g_evaluation_df = pd.read_parquet(f'{datasets_saving_dir}/sem_g_evaluation.parquet', engine='pyarrow')


# Evaluate using RAGAS


#### Preprocessing function to prepare the evaluation dataframe to meet the RAGAS requirements

In [None]:
from datasets import Dataset
import numpy as np
from io import StringIO

def preprocess_dataframe(df):
    """
    Shape an evaluation DataFrame into the column layout used for RAGAS.

    Steps:
    1. Keep only the question, generated answer, retrieved documents and
       reference answer columns, renaming them to 'question', 'answer',
       'retrieved_contexts' and 'ground_truth'.
    2. Turn numpy arrays in 'retrieved_contexts' into plain lists.
    3. Replace any StringIO cell in those four columns with its string value.

    The input DataFrame is not modified; a new one is returned.
    """
    def _array_to_list(value):
        # Only numpy arrays are converted; everything else passes through.
        if isinstance(value, np.ndarray):
            return value.tolist()
        return value

    def _unwrap_stringio(value):
        # Only StringIO buffers are unwrapped; everything else passes through.
        if isinstance(value, StringIO):
            return value.getvalue()
        return value

    source_columns = ['question', 'generated_answer', 'retrieved_documents', 'answer']
    renamed = {
        'generated_answer': 'answer',
        'answer': 'ground_truth',
        'retrieved_documents': 'retrieved_contexts',
    }
    df = df[source_columns].rename(columns=renamed)

    df['retrieved_contexts'] = df['retrieved_contexts'].apply(_array_to_list)

    for column in ('question', 'answer', 'retrieved_contexts', 'ground_truth'):
        df[column] = df[column].apply(_unwrap_stringio)

    return df


In [None]:
# Apply preprocessing to all DataFrames
rec_evaluation_df = preprocess_dataframe(rec_evaluation_df)
sem_evaluation_df = preprocess_dataframe(sem_evaluation_df)
sem_g_evaluation_df = preprocess_dataframe(sem_g_evaluation_df)

### Setting up the RAGAS evaluator LLM from Cohere

In [None]:
from langchain_cohere import ChatCohere, CohereEmbeddings

# Initialize Cohere models directly
llm = ChatCohere(model="command-xlarge-nightly", cohere_api_key=COHERE_API_KEY)
embeddings = CohereEmbeddings(model="embed-english-v2.0", cohere_api_key=COHERE_API_KEY)

## RAGAS evaluation function

In [None]:
from ragas.metrics import  Faithfulness, AnswerRelevancy, AnswerCorrectness
from ragas import evaluate
import pandas as pd
import time
from datasets import Dataset


def ragas_evaluation(evaluation_df, llm, embeddings, k=40, metrics=None):
    """Score the first *k* rows of *evaluation_df* with RAGAS, one row at a time.

    Each row is wrapped in its own single-item dataset and passed to
    ``evaluate``; the per-row result frames are printed as they arrive and
    finally concatenated.

    Args:
        evaluation_df: DataFrame already shaped for RAGAS.
        llm: Evaluator LLM forwarded to ``evaluate``.
        embeddings: Embedding model forwarded to ``evaluate``.
        k: Number of leading rows to evaluate.
        metrics: RAGAS metric instances; defaults to Faithfulness,
            AnswerRelevancy and AnswerCorrectness.

    Returns:
        A DataFrame with one row per evaluated item, or an empty DataFrame
        when there was nothing to evaluate.
    """
    if metrics is None:
        metrics = [
            Faithfulness(),
            AnswerRelevancy(),
            AnswerCorrectness(),
        ]

    # Score columns to echo per item. They are derived from the metrics in use
    # so that a custom ``metrics`` list does not fail on a hard-coded column.
    # NOTE(review): relies on each metric exposing its column name as ``.name``.
    metric_names = [getattr(metric, 'name', None) for metric in metrics]

    # Convert DataFrame to Dataset
    dataset = Dataset.from_pandas(evaluation_df[:k])

    evaluations = []
    for item in dataset:
        # Wrap the single item in a Dataset of its own.
        single_item_dataset = Dataset.from_pandas(pd.DataFrame([item]))

        item_evaluation = evaluate(
            metrics=metrics,
            dataset=single_item_dataset,
            llm=llm,
            embeddings=embeddings,
        )

        item_evaluation_df = item_evaluation.to_pandas()
        shown_columns = [name for name in metric_names if name in item_evaluation_df.columns]
        print(item_evaluation_df[shown_columns])
        evaluations.append(item_evaluation_df)

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not evaluations:
        return pd.DataFrame()

    # Combine all individual DataFrames into one
    return pd.concat(evaluations, ignore_index=True)


### Creating the Recursive evaluation results data, and saving it

In [None]:
rec_evaluation_results_df = ragas_evaluation(rec_evaluation_df, llm, embeddings)
rec_evaluation_results_df.to_parquet(f'{datasets_saving_dir}/rec_evaluation_results.parquet', index=False, engine='pyarrow')

### Creating the Semantic evaluation results data, and saving it


In [None]:
sem_evaluation_results_df = ragas_evaluation(sem_evaluation_df, llm, embeddings)
sem_evaluation_results_df.to_parquet(f'{datasets_saving_dir}/sem_evaluation_results.parquet', index=False, engine='pyarrow')


### Creating the Semantic-Graph evaluation results data, and saving it


In [None]:
sem_g_evaluation_results_df = ragas_evaluation(sem_g_evaluation_df, llm, embeddings)
sem_g_evaluation_results_df.to_parquet(f'{datasets_saving_dir}/sem_g_evaluation_results.parquet', index=False, engine='pyarrow')


# Visualizing the ragas metrics


#### Loading the evaluation results dataframes

In [None]:
import pandas as pd
rec_evaluation_results_df = pd.read_parquet(f'{datasets_saving_dir}/rec_evaluation_results.parquet', engine='pyarrow')
sem_evaluation_results_df = pd.read_parquet(f'{datasets_saving_dir}/sem_evaluation_results.parquet', engine='pyarrow')
sem_g_evaluation_results_df = pd.read_parquet(f'{datasets_saving_dir}/sem_g_evaluation_results.parquet', engine='pyarrow')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


def process_and_visualize_from_dataframes(dataframes, column_names, methods):
    """
    Plot, for each metric column, the mean score of every method as a bar chart.

    Rows where any method has a NaN for the metric are dropped before the means
    are taken, so all methods are averaged over the same rows.

    Parameters:
    - dataframes: List of pandas DataFrames, one per method
      (e.g., [rec_df, sem_df, sem_g_df]).
    - column_names: List of column names to process
      (e.g., ["faithfulness", "answer_relevancy", "answer_correctness"]).
    - methods: List of method names, aligned with ``dataframes``
      (e.g., ["Recursive", "Semantic", "Semantic Graph"]). Any number of
      methods is supported.

    Raises:
    - ValueError: if there are fewer dataframes than method names.
    """
    if len(dataframes) < len(methods):
        raise ValueError(
            f"Got {len(methods)} method names but only {len(dataframes)} dataframes"
        )

    # Reuse the three base colours cyclically when there are more methods.
    palette = ["blue", "green", "orange"]
    bar_colors = [palette[i % len(palette)] for i in range(len(methods))]

    for column_name in column_names:
        # One column per method, all taken from the same metric.
        combined_df = pd.DataFrame({
            method: frame[column_name]
            for method, frame in zip(methods, dataframes)
        })

        # Drop rows with NaN values in any column
        cleaned_df = combined_df.dropna()

        mean_values = [cleaned_df[method].mean() for method in methods]

        # Create the bar plot
        plt.figure(figsize=(10, 6))
        plt.bar(methods, mean_values, alpha=0.7, color=bar_colors)

        # Title and labels
        metric_title = column_name.replace("_", " ").title()
        plt.title(f"Average {metric_title} Across Methods", fontsize=14)
        plt.ylabel(f"Average {metric_title}", fontsize=12)
        plt.xlabel("Methods", fontsize=12)
        plt.grid(alpha=0.5, linestyle='--', axis='y')
        plt.tight_layout()

        # Show the plot
        plt.show()

# Specify the column names and method names
column_names = ["faithfulness", "answer_relevancy", "answer_correctness"]
methods = ["Recursive", "Semantic", "Semantic Graph"]

# Call the function with the loaded data
process_and_visualize_from_dataframes(
    dataframes=[rec_evaluation_results_df, sem_evaluation_results_df, sem_g_evaluation_results_df],
    column_names=column_names,
    methods=methods
)
