Main notebook for evaluating the generation process of the RAG pipeline.

In [None]:
from ipynb.fs.defs.a_setup_llms import setup_llm
from ipynb.fs.defs.a_setup_llms import create_embedding_model
from ipynb.fs.defs.b_retrieve_data_and_prompt import retrieve_contexts
from ipynb.fs.defs.b_retrieve_data_and_prompt import is_context_size_valid
from ipynb.fs.defs.b_retrieve_data_and_prompt import load_multiple_queries_list
from ipynb.fs.defs.b_retrieve_data_and_prompt import load_hyde_docs
from ipynb.fs.defs.b_retrieve_data_and_prompt import prompt_LLM
from ipynb.fs.defs.d_evaluation_generation import calculate_rouge_score_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_response_length_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_hallucination_score_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_answer_relevancy_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_answer_similarity_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_context_recall_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_answer_relevancy_RAGAS_bulk
import chromadb
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import os
from typing import List
import pandas as pd
import json
import time
from fastapi.encoders import jsonable_encoder
import os

In [None]:
def load_retrieved_docs(k):
    """
    Rebuilds the cached retrieval results as Document objects.

    The cache is the JSON file retrieved_docs_k_20.json, which holds one list of
    serialized documents per entry. Each serialized document is a mapping that is
    passed to Document as keyword arguments.

    k: How many documents to keep from the front of every cached list.
    Returns a list with one list of Document objects per cached entry.
    """
    cache_path = './../../evaluationInput/generation_eval/retrieved_docs_k_20.json'
    with open(cache_path, 'r') as cache_file:
        cached_doc_lists = json.load(cache_file)

    # Keep only the first k documents of every cached list.
    return [
        [Document(**raw_doc) for raw_doc in raw_docs[:k]]
        for raw_docs in cached_doc_lists
    ]

In [None]:
# Evaluate the generation for every given index / retrieval method / k combination and every given LLM on the golden questions.
# Testing all possibilities is not viable, so the number of combinations has to be reduced --> only use cheap retrieval strategies (without extra LLM call) when sweeping all possibilities.

# Suffixes of the per-subset summary columns, in the order they appear in the summary CSV.
_SUBSET_SUFFIXES = (
    "first_Kersten",
    "second_Kersten",
    "third_others",
    "fourth_template_and_new_ISO",
    "fundamentals",
    "clause",
    "risk_management",
    "audits",
    "control",
    "new_iso",
)

# Metrics that get an overall column plus one column per subset. "response_length"
# is a single column that sits between the two groups in the summary CSV.
_ROUGE_METRICS = ("rouge1", "rouge2", "rougeL")
_OTHER_SUBSET_METRICS = (
    "hallucination",
    "answer_relevancy",
    "answer_similarity",
    "context_recall",
    "RAGAS_relevancy",
)

# Leading identification columns of the summary CSV.
_SUMMARY_ID_COLUMNS = (
    "index_name",
    "retriever_method",
    "number_retrieved_docs",
    "llm_name",
    "max_context_size",
    "temperature",
    "prompt_version",
    "num_questions_answered",
    "percent_questions_answered",
)

# Per-question durations that are averaged into the summary row.
_PIPELINE_DURATION_COLUMNS = (
    "duration_retrieval",
    "duration_generation",
    "duration_context_size",
    "duration_total",
)

# Durations of the metric calculations themselves.
_METRIC_DURATION_COLUMNS = (
    "rouge1_dur",
    "rouge2_dur",
    "rougeL_dur",
    "response_length_dur",
    "hallucination_dur",
    "answer_relevancy_dur",
    "answer_similarity_dur",
    "context_recall_dur",
    "RAGAS_relevancy_dur",
    "duration_all_metrics",
)

# Columns of the per-question result file written for every index/LLM combination.
_PER_QUESTION_COLUMNS = [
    "index_name", "retriever_method", "number_retrieved_docs", "llm_name", "max_context_size",
    "temperature", "prompt_version", "question", "retrieved_contexts", "golden_answer",
    "generated_answer", "context_size_valid", "rouge1", "rouge2", "rougeL", "hallucination",
    "answer_relevancy", "answer_similarity", "context_recall", "RAGAS_relevancy",
    "duration_retrieval", "duration_generation", "duration_context_size", "duration_total",
]

# Row positions of the golden questions per topic (research process). The largest
# position is 163, so `.iloc` raises an IndexError if fewer than 164 questions were evaluated.
_TOPIC_ROW_POSITIONS = {
    "fundamentals": [0, 13, 25, 26, 47, 48, 49, 50, 51, 52, 53, 54, 56, 62, 66, 68, 72, 78, 79, 81, 86, 97, 98, 102, 103, 104, 105, 107, 113, 117, 119, 124, 130, 131, 133, 134, 139, 150, 151, 155],
    "clause": [1, 2, 6, 7, 8, 9, 10, 12, 55, 58, 71, 73, 74, 76, 80, 83, 87, 89, 94, 106, 109, 123, 125, 126, 128, 132, 136, 140, 142, 147],
    "risk_management": [3, 4, 5, 11, 57, 59, 63, 75, 77, 82, 88, 99, 100, 101, 108, 110, 114, 127, 129, 135, 141, 152, 153, 154],
    "audits": [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 85, 90, 138, 143],
    "control": [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 60, 61, 64, 65, 67, 69, 70, 84, 91, 92, 93, 95, 96, 111, 112, 115, 116, 118, 120, 121, 122, 137, 144, 145, 146, 148, 149, 156],
    "new_iso": [157, 158, 159, 160, 161, 162, 163],
}


def _summary_columns():
    """Returns the column names of the summary CSV in their stored order."""
    columns = list(_SUMMARY_ID_COLUMNS)
    for metric in _ROUGE_METRICS:
        columns.append(metric)
        columns.extend(f"{metric}_{suffix}" for suffix in _SUBSET_SUFFIXES)
    columns.append("response_length")
    for metric in _OTHER_SUBSET_METRICS:
        columns.append(metric)
        columns.extend(f"{metric}_{suffix}" for suffix in _SUBSET_SUFFIXES)
    columns.extend(_PIPELINE_DURATION_COLUMNS)
    columns.extend(_METRIC_DURATION_COLUMNS)
    return columns


def _rounded_valid_means(frame):
    """Averages the numeric columns over the rows with context_size_valid == True, rounded to 3 decimals."""
    answered_rows = frame[frame["context_size_valid"]]
    return round(answered_rows.select_dtypes(include="number").mean(), 3)


def _subset_means(per_question_df):
    """Returns the rounded means of every question subset, keyed by the subset's column suffix."""
    subsets = {
        # Subsets by origin of the question: consecutive row ranges of the golden set.
        "first_Kersten": per_question_df.head(51),
        "second_Kersten": per_question_df.iloc[51:102],
        "third_others": per_question_df.iloc[102:155],
        "fourth_template_and_new_ISO": per_question_df.tail(9),
    }
    # Subsets by topic of the question: explicit row positions.
    for topic, positions in _TOPIC_ROW_POSITIONS.items():
        subsets[topic] = per_question_df.iloc[positions]
    return {suffix: _rounded_valid_means(frame) for suffix, frame in subsets.items()}


def _metric_breakdown(metric, overall_means, subset_means):
    """Returns the summary cells of one metric: the overall mean followed by the mean of every subset."""
    cells = {metric: overall_means[metric]}
    for suffix in _SUBSET_SUFFIXES:
        cells[f"{metric}_{suffix}"] = subset_means[suffix][metric]
    return cells


def _generate_answer(llm, llm_comb, retrieved_docs, question, prompt_version):
    """
    Answers one question, provided the prompt fits into the configured context size.

    Returns (answer, context_size_valid, duration_generation, duration_context_check).
    If the context size is exceeded, the answer is "" and the generation duration is 0.
    """
    start_time_context_size = time.time()
    if is_context_size_valid(llm_comb["model_provider"], llm_comb["model_name"], retrieved_docs, question, llm_comb["max_context_size"]):
        answer, duration_generation, _sources = prompt_LLM(retrieved_docs, prompt_version, llm, question, llm_comb["model_provider"])
        context_size_valid = True
    else:
        # Exceeds context window size. The generation duration has to be set explicitly here:
        # otherwise it would be undefined for the first question or carry over from the previous one.
        answer = ""
        duration_generation = 0
        context_size_valid = False
    elapsed = time.time() - start_time_context_size
    # The measured span includes the generation itself, so subtract it to get the pure check duration.
    return answer, context_size_valid, duration_generation, elapsed - duration_generation


def evaluate_generation(retriever_index_k_combinations, llms_dict, result_name_path: str, calculate_context_recall: bool, calculate_RAGAS_relevancy: bool, first_go: bool = False, prompt_version: int = 1, description: str = "Standard"):
    """
    Evaluates the generation process of the RAG pipeline for all given retriever and index combinations, as well as LLMs.

    For every combination and LLM, a per-question CSV is written to the generation_single folder, and one
    row with the averaged metrics is prepended to the summary CSV in the generation_total folder. Averages
    only consider questions whose prompt fit into the context size.

    retriever_index_k_combinations: Array of combinations in the form: {"index_name": "Cohere_v3_l2_All_512_64_False", "retriever_method": "Dense", "number_retrieved_docs": "3"}
    llms_dict: Array of llm combinations in the form: {"model_provider": "..", "model_name": "..", "temperature": "..", "max_context_size": 1000}
    result_name_path: Where to store the final average results of all indexes
    calculate_context_recall: Whether to calculate the context_recall or not (is expensive). If not, it is stored as 0.
    calculate_RAGAS_relevancy: Whether to calculate the RAGAS relevancy score or not (is expensive). If not, it is stored as 0.
    first_go: Whether it is the first run or not. On the first run the documents are retrieved and stored together with the retrieval durations; otherwise the stored documents and durations are loaded again. It would be wise to set a higher k on the first run, and after that all k's lower than that can be evaluated.
    prompt_version: Which prompt version to use (defaults to 1)
    description: Appended to the index name in the summary and to the file name of the per-question results.

    Returns the summary dataframe, with the most recently evaluated combination in the first row.
    """
    full_path_all = f"./../../evaluationResults/generationEval/generation_total/RAG/{result_name_path}"

    # Create new folder for the result_name_path for the storing of single document information
    os.makedirs(f"./../../evaluationResults/generationEval/generation_single/RAG/{result_name_path}", exist_ok=True)

    # Check if the df already exists, otherwise create a new one
    try:
        df = pd.read_csv(full_path_all)
    except FileNotFoundError:
        df = pd.DataFrame(columns=_summary_columns())

    # Go through each given collection
    for combination in retriever_index_k_combinations:
        print(combination)
        retriever_method = combination["retriever_method"]
        new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))

        # Open the collection once without embedding function to read which embedding model it was built with
        vectordb = Chroma(client=new_client, collection_name=combination["index_name"])
        collection_metadata = vectordb._collection.metadata
        embedding_model = create_embedding_model(collection_metadata["embedding_model_provider"], collection_metadata["embedding_model_name"])
        vectordb = Chroma(client=new_client, collection_name=combination["index_name"], embedding_function=embedding_model)

        # Load the golden question and answer pairs
        with open("./../../evaluationInput/generation_eval/golden_qa_set.json", 'r') as file:
            golden_qa_set = json.load(file)

        qa_pairs = golden_qa_set["qa_set"]
        question_set = [qa['question'] for qa in qa_pairs]
        golden_answer_set = [qa['golden_answer'] for qa in qa_pairs]

        # Multi query and HyDE retrieval need pre-generated inputs per question
        golden_multi_queries = None
        hyde_docs = None
        if "Multi_Query" in retriever_method:
            golden_multi_queries = load_multiple_queries_list(False, "golden_multi_queries")
        if "HyDE" in retriever_method:
            hyde_docs = load_hyde_docs(False, "golden_hyde_docs")

        # Iterate over the given llm combinations
        for llm_comb in llms_dict:
            print(llm_comb)
            # The context size from the llm combination is used, not the one reported by setup_llm
            llm, _reported_context_size = setup_llm(llm_comb["model_provider"], llm_comb["model_name"], llm_comb["temperature"])
            # Create a new dataframe for each combination of index, retriever, k and llm in which everything is stored
            df_index_llm_comb = pd.DataFrame(columns=_PER_QUESTION_COLUMNS)

            # Where to store results later on
            path_index_llm_comb = combination["index_name"] + "_" + retriever_method + "_k" + str(combination["number_retrieved_docs"]) + "_LLM_" + llm_comb["model_name"] + "_" + description + "_" + result_name_path
            full_path_index_llm_comb = f"./../../evaluationResults/generationEval/generation_single/RAG/{result_name_path}/{path_index_llm_comb}"

            # Create arrays to store all relevant data for bulk evaluation methods and dataframe
            retrieved_doc_lists = []
            generated_answers = []
            context_size_valids = []
            duration_retrievals = []
            duration_generations = []
            duration_check_context_size = []
            duration_totals = []

            # Load already stored retrieved documents, so the Cohere API does not need to be accessed every time
            if not first_go:
                retrieved_doc_lists = load_retrieved_docs(combination["number_retrieved_docs"])

                with open('./../../evaluationInput/generation_eval/durations_k_20.json', 'r') as json_file:
                    duration_retrievals = json.load(json_file)

            print("Length of doc lists ", len(retrieved_doc_lists))
            print("Length of durations ", len(duration_retrievals))
            print("Length of dataset ", len(qa_pairs))

            # Iterate over the golden pairs
            for index, qa_pair in enumerate(qa_pairs):
                if first_go:
                    # Special case: Retriever method is multi query or HyDE or a combination with these two,
                    # then the pre-generated inputs are passed along (multi query takes precedence)
                    extra_inputs = {}
                    if "Multi_Query" in retriever_method:
                        extra_inputs["multiple_queries"] = golden_multi_queries[index]
                    elif "HyDE" in retriever_method:
                        extra_inputs["hyde_document"] = hyde_docs[index]
                    retrieved_docs, duration_retrieval = retrieve_contexts(vectordb=vectordb, retrieval_method=retriever_method, k=combination["number_retrieved_docs"], query=qa_pair["question"], **extra_inputs)
                    retrieved_doc_lists.append(retrieved_docs)
                    duration_retrievals.append(duration_retrieval)
                else:
                    retrieved_docs = retrieved_doc_lists[index]
                    duration_retrieval = duration_retrievals[index]

                # Check if the retrieved documents and queries fit into the context size and answer if so
                answer, context_size_valid, duration_generation, duration_context_size = _generate_answer(llm, llm_comb, retrieved_docs, qa_pair["question"], prompt_version)
                generated_answers.append(answer)
                context_size_valids.append(context_size_valid)
                duration_generations.append(duration_generation)
                duration_totals.append(duration_retrieval + duration_generation)
                duration_check_context_size.append(duration_context_size)

                if (index + 1) % 50 == 0:
                    print("Completed Index ", index)

            # If its the first run, store the retrieved documents and durations, so the Cohere API does not need to be accessed every time
            if first_go:
                with open('./../../evaluationInput/generation_eval/retrieved_docs_k_20.json', 'w') as json_file:
                    json.dump(jsonable_encoder(retrieved_doc_lists), json_file, indent=4)

                with open('./../../evaluationInput/generation_eval/durations_k_20.json', 'w') as json_file_2:
                    json.dump(duration_retrievals, json_file_2, indent=4)

            # After reviewing all q,a pairs, do the bulk evaluation operations:
            # Transform List[List[Documents]] into List[List[str]]
            retrieved_docs_lists_texts = [
                [doc.metadata.get('original_text', '') for doc in doc_list]
                for doc_list in retrieved_doc_lists
            ]

            # Calculate evaluation metrics
            start_time = time.time()
            rouge1, rouge1_dur = calculate_rouge_score_bulk(generated_answers, golden_answer_set, "rouge1")
            rouge2, rouge2_dur = calculate_rouge_score_bulk(generated_answers, golden_answer_set, "rouge2")
            rougeL, rougeL_dur = calculate_rouge_score_bulk(generated_answers, golden_answer_set, "rougeL")
            print("Calculated rouge scores \n")
            response_length, response_length_dur = calculate_response_length_bulk(generated_answers)
            print("Calculated response length \n")
            hallucination, hallucination_dur = calculate_hallucination_score_bulk(generated_answers, retrieved_docs_lists_texts)
            print("Calculated hallucination score \n")
            answer_relevancy, answer_relevancy_dur = calculate_answer_relevancy_bulk(generated_answers, question_set)
            print("Calculated answer relevancy score \n")
            answer_similarity, answer_similarity_dur = calculate_answer_similarity_bulk(generated_answers, golden_answer_set)
            print("Calculated answer similarity score \n")

            if calculate_context_recall:
                context_recall, context_recall_dur = calculate_context_recall_bulk(retrieved_docs_lists_texts, golden_answer_set, question_set)
            else:
                context_recall = [0] * len(generated_answers)
                context_recall_dur = 0

            print("Calculated RAGAS context recall score \n")
            if calculate_RAGAS_relevancy:
                RAGAS_relevancy, RAGAS_relevancy_dur = calculate_answer_relevancy_RAGAS_bulk(question_set, generated_answers)
            else:
                RAGAS_relevancy = [0] * len(generated_answers)
                RAGAS_relevancy_dur = 0
            print("Calculated RAGAGS relevance score \n")

            end_time = time.time()
            # Average duration of all metric calculations per question
            duration_all_metrics_total = (end_time - start_time) / len(golden_answer_set)

            # Create dataframe from all the stored arrays
            for i, context_size_valid in enumerate(context_size_valids):
                # k, context size and temperature are stored as strings, which keeps them out of the numeric averages
                new_row = {
                    "index_name": combination["index_name"],
                    "retriever_method": retriever_method,
                    "number_retrieved_docs": str(combination["number_retrieved_docs"]),
                    "llm_name": llm_comb["model_name"],
                    "max_context_size": str(llm_comb["max_context_size"]),
                    "temperature": str(llm_comb["temperature"]),
                    "prompt_version": prompt_version,
                    "question": qa_pairs[i]["question"],
                    "retrieved_contexts": retrieved_doc_lists[i],
                    "golden_answer": qa_pairs[i]["golden_answer"],
                    "generated_answer": generated_answers[i],
                    "context_size_valid": context_size_valid,
                    "rouge1": rouge1[i],
                    "rouge2": rouge2[i],
                    "rougeL": rougeL[i],
                    "hallucination": hallucination[i],
                    "answer_relevancy": answer_relevancy[i],
                    "answer_similarity": answer_similarity[i],
                    "context_recall": context_recall[i],
                    "RAGAS_relevancy": RAGAS_relevancy[i],
                    "duration_retrieval": duration_retrievals[i],
                    "duration_generation": duration_generations[i],
                    "duration_context_size": duration_check_context_size[i],
                    "duration_total": duration_totals[i],
                }
                df_index_llm_comb.loc[len(df_index_llm_comb)] = new_row

            # Store the dataframe for single combination
            df_index_llm_comb.to_csv(full_path_index_llm_comb, index=False)

            # Average the metrics, only regard rows in which the question was actually answered. Disregard if context_size_valid = False
            questions_answered = df_index_llm_comb[df_index_llm_comb["context_size_valid"]].shape[0]
            question_answered_percent = round(questions_answered / df_index_llm_comb.shape[0], 3)
            overall_means = _rounded_valid_means(df_index_llm_comb)

            # Averages for specific types and topics of questions (research process)
            subset_means = _subset_means(df_index_llm_comb)

            # The insertion order below defines the column order of the summary
            new_row_all = {
                "index_name": combination["index_name"] + "_" + description,
                "retriever_method": retriever_method,
                "number_retrieved_docs": combination["number_retrieved_docs"],
                "llm_name": llm_comb["model_name"],
                "max_context_size": llm_comb["max_context_size"],
                "temperature": llm_comb["temperature"],
                "prompt_version": prompt_version,
                "num_questions_answered": questions_answered,
                "percent_questions_answered": question_answered_percent,
            }
            for metric in _ROUGE_METRICS:
                new_row_all.update(_metric_breakdown(metric, overall_means, subset_means))
            # Method already calculates the average (does not return List[Float])
            new_row_all["response_length"] = response_length
            for metric in _OTHER_SUBSET_METRICS:
                new_row_all.update(_metric_breakdown(metric, overall_means, subset_means))
            for duration_column in _PIPELINE_DURATION_COLUMNS:
                new_row_all[duration_column] = overall_means[duration_column]
            new_row_all.update({
                "rouge1_dur": round(rouge1_dur, 3),
                "rouge2_dur": round(rouge2_dur, 3),
                "rougeL_dur": round(rougeL_dur, 3),
                "response_length_dur": round(response_length_dur, 3),
                "hallucination_dur": round(hallucination_dur, 3),
                "answer_relevancy_dur": round(answer_relevancy_dur, 3),
                "answer_similarity_dur": round(answer_similarity_dur, 3),
                "context_recall_dur": round(context_recall_dur, 3),
                "RAGAS_relevancy_dur": round(RAGAS_relevancy_dur, 3),
                "duration_all_metrics": round(duration_all_metrics_total, 3),
            })

            # Put the new row on top of the overall df and save it after every LLM, so finished results survive a later failure
            new_row_all_df = pd.DataFrame(new_row_all, index=[0])
            df = pd.concat([new_row_all_df, df]).reset_index(drop=True)
            df.to_csv(full_path_all, index=False)

    return df

In [None]:
# Index / retrieval method / k setups whose generation should be evaluated.
best_retriever_combinations = [
    {
        "index_name": "Fine-tuned_finetuned-ISO-27001_1024_l2_All_1536_264_False_Kers",
        "retriever_method": "Hybrid_Rerank_Cohere",
        "number_retrieved_docs": 8,
    },
]

# LLMs to evaluate, listed as (provider, model name, maximum context size).
# Every model is run with temperature 0.
llms_dict = [
    {"model_provider": provider, "model_name": model_name, "temperature": 0, "max_context_size": context_size}
    for provider, model_name, context_size in [
        ("Replicate", "lLama2-7b-chat", 4096),
        ("Replicate", "lLama2-13b-chat", 4096),
        ("Replicate", "lLama2-70b-chat", 4096),
        ("OpenAI", "gpt-3.5-turbo", 4096),
        ("OpenAI", "gpt-3.5-turbo-16k", 16385),
        ("Mistral", "mixtral-8x7B-v0.1", 16000),
        ("HuggingFace", "flan-t5-large", 2048),
        ("OpenAI", "gpt-4", 8192),
    ]
]

# First run: documents are retrieved and stored, and both expensive RAGAS metrics are calculated.
df = evaluate_generation(
    retriever_index_k_combinations=best_retriever_combinations,
    llms_dict=llms_dict,
    result_name_path="Generation_Evaluation",
    calculate_context_recall=True,
    calculate_RAGAS_relevancy=True,
    first_go=True,
    prompt_version=1,
    description="First_Test",
)
df

In [None]:
import pandas as pd

# Do not truncate the wide summary table when it is displayed.
pd.set_option('display.max_columns', None)

# Load a stored summary of a previous evaluation run and display it.
stored_results_path = './../../evaluationResults/generationEval/generation_total/RAG/Third_Generation_Evaluation_Testing'
df = pd.read_csv(stored_results_path)
df