Notebook for evaluating the retrieval. It both generates the evaluation datasets and provides methods for evaluating the retrieval results.

In [None]:
import json
from typing import List

def load_question_context_pairs(chunk_size: str, chunk_overlap: str, file_type: str, append_title: str) -> object:
    """
    Reads a previously stored set of QC pairs from the evaluation input folder.

    The file name is derived from the index settings as
    QC_<chunk_size>_<chunk_overlap>_<file_type>_<append_title>.json

    Returns the parsed JSON content. If the file is missing or does not hold
    valid JSON, a message is printed and an empty list is returned instead.
    """
    name_parts = ("QC", chunk_size, chunk_overlap, file_type, append_title)
    qc_file_name = "_".join(name_parts) + ".json"
    file_path = f"./../../evaluationInput/retrieval_eval/{qc_file_name}"

    try:
        with open(file_path, 'r') as handle:
            return json.load(handle)
    except FileNotFoundError:
        # No QC pairs have been stored for this combination of settings yet
        print(f"The file {file_path} does not exist.")
    except json.JSONDecodeError as decode_error:
        # The file exists but its content cannot be parsed
        print(f"Error decoding JSON in {file_path}: {decode_error}")

    # Both failure modes end up with the same empty result
    return []

def store_question_context_pairs(question_context_pairs: object, file_path: str):
    """
    Serializes the given QC pairs as JSON into file_path so they can be reused
    for later evaluation runs. An existing file at that path is overwritten.
    """
    with open(file_path, mode="w") as out_handle:
        json.dump(question_context_pairs, fp=out_handle)

In [None]:
import re
from ipynb.fs.defs.a_setup_llms import setup_llm
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)


def generate_question(input_text: str) -> str:
    """
    Asks GPT 3.5 Turbo for a single quiz question about the given context.

    The model output is split into lines, a leading enumeration is stripped from
    each line and only lines containing a question mark are kept. If exactly one
    such line remains it is returned, otherwise the unmodified model output is returned.
    """
    # Only the model itself is needed here; the second return value is not used
    llm, _ = setup_llm("OpenAI", "gpt-3.5-turbo", temperature=0)

    system_template = """You are a Professor. Your task is to setup exactly one question based on the provided context for an upcoming quiz/examination. The question should not contain options and not start with Q1. Restrict the questions to the context information provided and just return the question, never return an introduction or something prior to the question e.g. 'Question :'. In addition, never mention that the question is based on the context or "given context"""
    human_template = """Context information: {input_text}"""

    chat_prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ]
    )

    messages = chat_prompt.format_prompt(input_text=input_text).to_messages()
    generated_string = llm(messages).content

    candidates = []
    for raw_line in str(generated_string).strip().split("\n"):
        # Strip a leading enumeration such as "1.", "1)" or "1 "
        cleaned = re.sub(r"^\d+[\).\s]", "", raw_line).strip()
        # Keep the line only if it actually contains a question
        if cleaned and "?" in cleaned:
            candidates.append(cleaned)

    # Fall back to the raw output when the answer is not exactly one question
    return candidates[0] if len(candidates) == 1 else generated_string

In [None]:
import chromadb
from langchain.vectorstores import Chroma
import os
import random
from langchain.storage.file_system import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore


def generate_and_store_question_context_pairs(collection_names: List[str], k_total: int):
    """
    Generates up to k_total question context pairs for each collection name given. Stores them in the directory.

    The QC file is keyed by the index settings (chunk size, chunk overlap, file type, title appended) and not by
    the collection name. Pairs that are already stored for these settings are reused and only the missing number
    of pairs is generated.

    A collection is skipped, and processing continues with the next one, when
    - the collection does not exist,
    - the stored QC file already holds at least k_total pairs, or
    - more pairs are missing than the collection holds documents.

    If no pairs are stored yet and the collection holds fewer than k_total documents, one pair per document is generated.

    Data format: {"collection_name": "xyz", "question_context_pairs": [{"question": "xyz", "context": "xyz", "context_id": 1}, {"question": "xyz", "context": "xyz", "context_id": 2}]}

    Args:
        collection_names: Names of the Chroma collections to generate QC pairs for.
        k_total: Targeted total number of QC pairs per QC file.
    """
    new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))

    # For each collection create question context pairs
    for collection_name in collection_names:
        print("Starting to generate QC pairs for " + collection_name)
        try:
            new_client.get_collection(collection_name)
        except ValueError:
            print(f"Error: Collection {collection_name} does not exist. Will be skipped.")
            continue

        vectordb = Chroma(client=new_client, collection_name=collection_name)

        # Collections marked with "PC_" use a separate file store for their documents and keep
        # their chunk settings under the "*_parent" metadata keys.
        is_parent_child = "PC_" in collection_name
        key_suffix = "_parent" if is_parent_child else ""
        collection_metadata = vectordb._collection.metadata
        metadata = {
            "file_type": collection_metadata["file_type"],
            "chunk_size": collection_metadata["chunk_size" + key_suffix],
            "chunk_overlap": collection_metadata["chunk_overlap" + key_suffix],
            "title_appended": collection_metadata["title_appended"],
        }

        # Try to load QC pairs based on the same metadata.
        existing_q_c_object = load_question_context_pairs(
            metadata["chunk_size"], metadata["chunk_overlap"], metadata["file_type"], metadata["title_appended"]
        )
        existing_q_c_pairs = []
        if len(existing_q_c_object) > 0:
            existing_q_c_pairs = existing_q_c_object["question_context_pairs"]

        # Get the IDs for either the filestore, in the case of hierarchical retrieval, or for the vector database.
        if is_parent_child:
            # NOTE(review): the backslash separator is Windows specific. It is kept unchanged because the
            # store has to be found under the same path it was created with - confirm before switching to os.path.join.
            fs = LocalFileStore(os.environ.get("PARENT_DOC_PATH") + f"\\{collection_name}")
            store = create_kv_docstore(fs)
            existing_ids = list(store.yield_keys())
        else:
            existing_ids = vectordb.get()["ids"]

        # The document count bounds how many new pairs can be generated
        document_count = len(existing_ids)
        existing_count = len(existing_q_c_pairs)
        q_c_list = []

        if existing_count > 0 and existing_count >= k_total:
            # Enough pairs are stored already. Skip only this collection instead of leaving the function,
            # otherwise all remaining collections would silently be ignored.
            print(f"QC pairs for {collection_name} already complete. Will be skipped.")
            continue
        elif existing_count > 0:
            difference = k_total - existing_count
            if difference > document_count:
                # More pairs are missing than documents are available. Skip only this collection.
                print(f"Not enough documents in {collection_name} to generate the missing QC pairs. Will be skipped.")
                continue
            # Generate the difference in q,c pairs and store both new and old pairs
            pairs_to_generate = difference
            q_c_list.extend(existing_q_c_pairs)
        elif document_count >= k_total:
            # Nothing stored yet and enough documents available, generate k q,c pairs
            pairs_to_generate = k_total
        else:
            # Nothing stored yet and fewer documents than k, generate q,c pairs for all documents
            print("Test Case 2")
            pairs_to_generate = document_count

        print("Number of pairs to generate: " + str(pairs_to_generate))

        # Sample random ids and get the corresponding documents
        # NOTE(review): the sample is drawn from all ids, so a document that already has a stored pair
        # can be picked again when existing pairs are topped up.
        random_ids = random.sample(existing_ids, pairs_to_generate)
        if is_parent_child:
            documents_from_store = store.mget(random_ids)
            chosen_documents = {
                "ids": [doc.metadata["doc_ID"] for doc in documents_from_store],
                "metadatas": [doc.metadata for doc in documents_from_store],
            }
        else:
            chosen_documents = vectordb._collection.get(ids=random_ids)

        # Generate one question per chosen document and append the resulting pair to the list of all pairs
        for id_value, metadata_value in zip(chosen_documents["ids"], chosen_documents["metadatas"]):
            document_original_text = metadata_value["original_text"]
            question = generate_question(document_original_text)
            q_c_list.append({"question": question, "context": document_original_text, "context_id": id_value})

        # The same settings key names both the stored object and its file
        settings_key = "_".join(
            [metadata["chunk_size"], metadata["chunk_overlap"], metadata["file_type"], metadata["title_appended"]]
        )
        q_c_object = {
            "collection_name": settings_key,
            "question_context_pairs": q_c_list,
        }

        qc_file_name = "QC_" + settings_key + ".json"
        file_path = f"./../../evaluationInput/retrieval_eval/{qc_file_name}"
        print("Stored QC file name: " + qc_file_name)

        store_question_context_pairs(q_c_object, file_path)

In [None]:
from langchain.docstore.document import Document

def calculate_hit_rate(retrieved_docs: List[Document], q_c_pair: object) -> float:
    '''
    Determines whether the expected context was retrieved for one QC pair.

    Returns 1.0 if the "doc_ID" metadata of any retrieved document equals the
    "context_id" of the pair (compared as string and as raw value), otherwise 0.0.
    '''
    target_id = q_c_pair["context_id"]

    for candidate in retrieved_docs:
        candidate_id = candidate.metadata["doc_ID"]
        # The stored id may be a string while the document carries another type
        if str(candidate_id) == target_id or candidate_id == target_id:
            return 1.0

    return 0.0

def calculate_average_hit_rate(retrieved_docs_for_pairs: List[List[Document]], q_c_pairs: List[object]) -> float:
    '''
    Calculates the average hit rate over all QC pairs.
    Hit Rate = Is the expected ID inside the retrieved docs? Then return 1, otherwise 0

    retrieved_docs_for_pairs and q_c_pairs are matched by position, so both must have the same length.
    Returns 0.0 if there are no QC pairs at all.

    Raises an Exception if the two lists differ in length.
    '''
    if len(retrieved_docs_for_pairs) != len(q_c_pairs):
        raise Exception("Error at calculating average hit rate, raised exception: Different length of retrieved_docs_for_pairs and q_c_pairs")

    # Without any pair there is nothing to average, avoid dividing by zero
    if len(q_c_pairs) == 0:
        return 0.0

    hit_rates = [
        calculate_hit_rate(retrieved_doc_list, q_c_pair)
        for retrieved_doc_list, q_c_pair in zip(retrieved_docs_for_pairs, q_c_pairs)
    ]

    return sum(hit_rates) / len(hit_rates)

In [None]:
def calculate_mrr(retrieved_docs: List[Document], q_c_pair: object) -> float:
    '''
    Calculates the reciprocal rank for one QC pair.

    Returns 1 / rank of the first retrieved document whose "doc_ID" metadata equals the
    "context_id" of the pair (rank 1 is the first document), or 0.0 if no document matches.
    '''
    target_id = q_c_pair["context_id"]

    for rank, candidate in enumerate(retrieved_docs, start=1):
        candidate_id = candidate.metadata["doc_ID"]
        # The stored id may be a string while the document carries another type
        if str(candidate_id) == target_id or candidate_id == target_id:
            return 1 / rank

    return 0.0

def calculate_average_mrr(retrieved_docs_for_pairs: List[List[Document]], q_c_pairs: List[object]) -> float:
    '''
    Calculates the average MRR over all QC pairs.

    Mean Reciprocal Rank = Is the expected ID inside the retrieved docs and on which index? Based on the index return 1/(index+1).
    So if the retrieved doc was the first one (the most relevant one), return 1.

    retrieved_docs_for_pairs and q_c_pairs are matched by position, so both must have the same length.
    Returns 0.0 if there are no QC pairs at all.

    Raises an Exception if the two lists differ in length.
    '''
    if len(retrieved_docs_for_pairs) != len(q_c_pairs):
        raise Exception("Error at calculating average mrr, raised exception: Different length of retrieved_docs_for_pairs and q_c_pairs")

    # Without any pair there is nothing to average, avoid dividing by zero
    if len(q_c_pairs) == 0:
        return 0.0

    mrrs = [
        calculate_mrr(retrieved_doc_list, q_c_pair)
        for retrieved_doc_list, q_c_pair in zip(retrieved_docs_for_pairs, q_c_pairs)
    ]

    return sum(mrrs) / len(mrrs)

In [None]:
def calculate_average_time(durations: List[float]) -> float:
    '''
    Calculates the arithmetic mean of the given durations.
    Returns 0.0 if no durations are given.
    '''
    # Nothing was measured, avoid dividing by zero
    if len(durations) == 0:
        return 0.0

    return sum(durations) / len(durations)

In [None]:
from FlagEmbedding import FlagReranker

def _split_for_reranker(text: str, chunk_size: int) -> List[str]:
    """
    Splits a document text into 1, 2 or 4 consecutive parts depending on the chunk size of the index.

    Chunk sizes up to 1800 are not split, sizes up to 3600 are split into two parts and larger sizes into four parts.
    The last part always runs to the end of the text, so no characters are lost when the length is not evenly divisible.
    """
    # NOTE(review): the thresholds presumably keep each part within the input limit of the reranker - confirm.
    if chunk_size <= 1800:
        return [text]

    part_count = 2 if chunk_size <= 3600 else 4
    part_size = len(text) // part_count
    parts = [text[part_size * i:part_size * (i + 1)] for i in range(part_count - 1)]
    parts.append(text[part_size * (part_count - 1):])
    return parts


def calculate_average_similarity_score_BGE(retrieved_docs_for_pairs: List[List[Document]], q_c_pairs: List[object], chunk_size) -> float:
    """
    Calculates how relevant the retrieved k documents are for the given question. Uses the bge-reranker-base cross-encoder.

    Every retrieved document is scored against the question of its QC pair. For larger chunk sizes a document is
    split into two or four parts which are scored separately. The result is the average over all computed scores.

    Args:
        retrieved_docs_for_pairs: Retrieved documents per QC pair, matched to q_c_pairs by position.
        q_c_pairs: The QC pairs, only the "question" entry is used here.
        chunk_size: Chunk size of the evaluated index, anything int() accepts.

    Returns:
        The average score, or 0.0 if there is no document to score.

    Raises an Exception if the two lists differ in length.
    """
    if len(retrieved_docs_for_pairs) != len(q_c_pairs):
        raise Exception("Error at calculating average similarity score, raised exception: Different length of retrieved_docs_for_pairs and q_c_pairs")

    # Convert once instead of for every document
    chunk_size_value = int(chunk_size)

    pairs = []
    for retrieved_doc_list, q_c_pair in zip(retrieved_docs_for_pairs, q_c_pairs):
        question = q_c_pair["question"]
        for doc in retrieved_doc_list:
            for part in _split_for_reranker(doc.page_content, chunk_size_value):
                pairs.append([question, part])

    # Nothing to score: skip loading the model and avoid dividing by zero
    if len(pairs) == 0:
        return 0.0

    reranker = FlagReranker('BAAI/bge-reranker-base')

    # Calculate score for each doc
    scores = reranker.compute_score(pairs, batch_size=16)
    # A single pair may come back as a bare number instead of a list, depending on the FlagEmbedding version
    if isinstance(scores, (int, float)):
        scores = [scores]

    return sum(scores) / len(scores)