Main notebook for evaluating the retrieval process of the RAG pipeline

In [None]:
from ipynb.fs.defs.a_setup_llms import create_embedding_model
from ipynb.fs.defs.b_retrieve_data_and_prompt import retrieve_contexts
from ipynb.fs.defs.b_retrieve_data_and_prompt import load_multiple_queries_list
from ipynb.fs.defs.b_retrieve_data_and_prompt import load_hyde_docs
from ipynb.fs.defs.c_evaluation_retrieval import generate_and_store_question_context_pairs
from ipynb.fs.defs.c_evaluation_retrieval import load_question_context_pairs
from ipynb.fs.defs.c_evaluation_retrieval import calculate_average_hit_rate
from ipynb.fs.defs.c_evaluation_retrieval import calculate_average_mrr
from ipynb.fs.defs.c_evaluation_retrieval import calculate_average_time
from ipynb.fs.defs.c_evaluation_retrieval import calculate_average_similarity_score_BGE

In [None]:
import chromadb
from langchain.vectorstores import Chroma
import os

# Open the persisted Chroma store located at CHROMA_PATH and collect the
# names of every collection it holds; later cells pick from this list.
new_client = chromadb.PersistentClient(
    path=os.environ.get("CHROMA_PATH"),
)
collections = [entry.name for entry in new_client.list_collections()]
print(collections)
print(len(collections))

In [None]:
# Uncomment the call below if QC pairs need to be generated for the given collections.
# generate_and_store_question_context_pairs(collections, 100)

In [None]:
from typing import List
import pandas as pd
import time
import torch

def _read_collection_metadata(vectordb, collection_name: str) -> dict:
    """Collect the index settings stored in the Chroma collection's metadata."""
    stored = vectordb._collection.metadata
    # Parent/child collections ("PC_" in the name) keep their chunking
    # settings under the "*_parent" keys.
    suffix = "_parent" if "PC_" in collection_name else ""
    return {
        "file_type": stored["file_type"],
        "chunk_size": stored["chunk_size" + suffix],
        "chunk_overlap": stored["chunk_overlap" + suffix],
        "title_appended": stored["title_appended"],
        "embedding_model_provider": stored["embedding_model_provider"],
        "embedding_model_name": stored["embedding_model_name"],
    }


def _build_per_question_kwargs(retriever_method: str, qc_pairs, hyde_docs, multi_query_lists) -> list:
    """Return one dict of extra ``retrieve_contexts`` keyword arguments per question."""
    if "HyDE" in retriever_method:
        return [{"hyde_document": hyde_doc} for hyde_doc in hyde_docs]
    if "Multi_Query" in retriever_method:
        return [{"multiple_queries": queries} for queries in multi_query_lists]
    return [{} for _ in qc_pairs]


def evaluate_retrieval(collection_names: List[str], retriever_methods: List[str], k_neighbours: List[int], result_name_path: str, rerank_k: int=50, dense_percent: float = 0.5):
    """
    Evaluates the retrieval process of all given collections, retrieval methods and k_neighbours.

    For every (collection, retriever method, k) combination, each stored
    question is sent through ``retrieve_contexts``. The average hit rate, MRR,
    similarity score and duration are then computed, prepended as a new row to
    the results table, and the table is written to CSV immediately so partial
    results survive an interrupted run.

    Args:
        collection_names: Names of the Chroma collections to evaluate.
        retriever_methods: Retriever method names. Methods containing
            "Parent_Child" are only run on collections containing "PC_", and
            vice versa. Methods containing "HyDE" / "Multi_Query" use the
            pre-generated HyDE documents / query lists.
        k_neighbours: Values of ``k`` passed to ``retrieve_contexts``.
        result_name_path: CSV file name, relative to
            ``./../../evaluationResults/retrievalEval/``. An existing file is
            loaded and extended.
        rerank_k: Forwarded to ``retrieve_contexts``.
        dense_percent: Forwarded to ``retrieve_contexts``.

    Returns:
        The results DataFrame, including rows loaded from an existing CSV.
    """

    # Path to store results
    full_path = f"./../../evaluationResults/retrievalEval/{result_name_path}"
    # Create the target directory up front so the first to_csv call cannot fail
    # on a missing folder after an expensive retrieval pass.
    os.makedirs(os.path.dirname(full_path), exist_ok=True)
    torch.cuda.empty_cache()

    # Check if the df already exists, otherwise create a new one
    try:
        df = pd.read_csv(full_path)
    except FileNotFoundError:
        df = pd.DataFrame(columns=["index_name", "retriever_method", "number_retrieved_docs", "hit_rate", "mrr", "similarity", "duration"])

    # Use the same substring test as the per-method dispatch below, so every
    # method routed to the HyDE / Multi_Query branch has its inputs loaded.
    needs_hyde = any("HyDE" in method for method in retriever_methods)
    needs_multi_query = any("Multi_Query" in method for method in retriever_methods)

    # Go through each given collection
    for collection_name in collection_names:
        new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))
        # First open the collection without an embedding function, only to read its metadata.
        vectordb = Chroma(client=new_client, collection_name=collection_name)

        print("Starting with collection " + collection_name)

        metadata = _read_collection_metadata(vectordb, collection_name)

        # Create the embedding model based on metadata
        embedding_model = create_embedding_model(metadata["embedding_model_provider"], metadata["embedding_model_name"])
        vectordb = Chroma(client=new_client, collection_name=collection_name, embedding_function=embedding_model)
        # Load the question, context pairs associated with the index's metadata
        # NOTE(review): the literal "False" is always passed as the fourth
        # argument, regardless of the collection's title_appended value — confirm intended.
        qc_object = load_question_context_pairs(metadata["chunk_size"], metadata["chunk_overlap"], metadata["file_type"], "False")
        qc_pairs = qc_object["question_context_pairs"]

        # Load HyDe Docs and Multi Query lists for chunk size and overlap combination if that is the retriever method
        hyde_docs = []
        multi_query_lists = []
        # f-string formatting also works when metadata values are not stored as strings.
        settings_suffix = f"{metadata['chunk_size']}_{metadata['chunk_overlap']}_{metadata['file_type']}_{metadata['title_appended']}"
        if needs_hyde:
            hyde_docs = load_hyde_docs(True, "Hyde_" + settings_suffix)
        if needs_multi_query:
            multi_query_lists = load_multiple_queries_list(True, "Multi_Query_" + settings_suffix)

        is_parent_child_collection = "PC_" in collection_name

        # Iterate over all desired retrievers
        for retriever_method in retriever_methods:
            # Parent/child retrievers only fit parent/child collections and vice versa.
            if ("Parent_Child" in retriever_method) != is_parent_child_collection:
                continue
            print("Starting with retriever method " + retriever_method)

            torch.cuda.empty_cache()

            # The extra arguments depend only on the method, not on k.
            per_question_kwargs = _build_per_question_kwargs(retriever_method, qc_pairs, hyde_docs, multi_query_lists)
            if len(per_question_kwargs) < len(qc_pairs):
                # zip() below stops at the shorter input; make the truncation visible.
                print(f"Warning: only {len(per_question_kwargs)} pre-generated inputs for {len(qc_pairs)} questions with retriever method {retriever_method}; remaining questions are not retrieved.")

            # For each retriever go through all questions
            for k in k_neighbours:
                docs_list = []
                duration_list = []

                for index, (qc_pair, extra_kwargs) in enumerate(zip(qc_pairs, per_question_kwargs)):
                    # Lightweight progress indicator.
                    if index % 5 == 0:
                        print(index)
                    retrieved_docs, duration = retrieve_contexts(
                        vectordb=vectordb,
                        retrieval_method=retriever_method,
                        k=k,
                        query=qc_pair["question"],
                        rerank_k=rerank_k,
                        dense_percent=dense_percent,
                        **extra_kwargs,
                    )
                    docs_list.append(retrieved_docs)
                    duration_list.append(duration)

                torch.cuda.empty_cache()

                avg_hit_rate = calculate_average_hit_rate(docs_list, qc_pairs)
                avg_mrr = calculate_average_mrr(docs_list, qc_pairs)
                avg_similarity_score = calculate_average_similarity_score_BGE(docs_list, qc_pairs, metadata["chunk_size"])
                avg_duration = calculate_average_time(duration_list)

                new_row = pd.DataFrame({"index_name": collection_name, "retriever_method": retriever_method, "number_retrieved_docs": k, "hit_rate": avg_hit_rate, "mrr": avg_mrr, "similarity": avg_similarity_score, "duration": avg_duration}, index=[0])
                # Newest result first; persist after every row so progress is never lost.
                df = pd.concat([new_row, df], ignore_index=True)
                df.to_csv(full_path, index=False)

    return df

In [None]:
# Restrict this run to the first collection reported by the Chroma client.
collections_to_evaluate = [collections[0]]
print(collections_to_evaluate)

# Retrieval settings forwarded to evaluate_retrieval.
retriever_methods = ["Hybrid_Rerank_Cohere"]
k_neighbours = [8]
rerank_k, dense_percent = 50, 0.5

results_df = evaluate_retrieval(
    collection_names=collections_to_evaluate,
    retriever_methods=retriever_methods,
    k_neighbours=k_neighbours,
    result_name_path="Retrieval_Eval.csv",
    rerank_k=rerank_k,
    dense_percent=dense_percent,
)