Main notebook for evaluating the generation process of the standalone LLM.

In [None]:
from ipynb.fs.defs.a_setup_llms import setup_llm
from ipynb.fs.defs.b_retrieve_data_and_prompt import prompt_LLM_only
from ipynb.fs.defs.d_evaluation_generation import calculate_rouge_score_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_response_length_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_answer_relevancy_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_answer_similarity_bulk
from ipynb.fs.defs.d_evaluation_generation import calculate_answer_relevancy_RAGAS_bulk

import os
import pandas as pd
import json
import time

In [None]:
# Names of the question subsets for which every metric is averaged separately.
# The order matters: it defines the column order of the summary CSV.
_SUBSET_NAMES = (
    "first_Kersten",
    "second_Kersten",
    "third_others",
    "fourth_template_and_new_ISO",
    "fundamentals",
    "clause",
    "risk_management",
    "audits",
    "control",
    "new_iso",
)

# Per-question metrics that are reported overall and per subset. They are split
# into two groups because "response_length" sits between them in the summary.
_ROUGE_METRICS = ("rouge1", "rouge2", "rougeL")
_ANSWER_METRICS = ("answer_relevancy", "answer_similarity", "RAGAS_relevancy")

# 0-based positions of the golden questions that belong to each topic.
_TOPIC_INDEXES = {
    "fundamentals": [0, 13, 25, 26, 47, 48, 49, 50, 51, 52, 53, 54, 56, 62, 66, 68, 72, 78, 79, 81, 86, 97, 98, 102, 103, 104, 105, 107, 113, 117, 119, 124, 130, 131, 133, 134, 139, 150, 151, 155],
    "clause": [1, 2, 6, 7, 8, 9, 10, 12, 55, 58, 71, 73, 74, 76, 80, 83, 87, 89, 94, 106, 109, 123, 125, 126, 128, 132, 136, 140, 142, 147],
    "risk_management": [3, 4, 5, 11, 57, 59, 63, 75, 77, 82, 88, 99, 100, 101, 108, 110, 114, 127, 129, 135, 141, 152, 153, 154],
    "audits": [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 85, 90, 138, 143],
    "control": [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 60, 61, 64, 65, 67, 69, 70, 84, 91, 92, 93, 95, 96, 111, 112, 115, 116, 118, 120, 121, 122, 137, 144, 145, 146, 148, 149, 156],
    "new_iso": [157, 158, 159, 160, 161, 162, 163],
}

# Columns of the per-LLM CSV that holds one row per golden question.
_PER_QUESTION_COLUMNS = [
    "llm_name", "max_context_size", "temperature", "prompt_version",
    "question", "golden_answer", "generated_answer",
    "rouge1", "rouge2", "rougeL",
    "answer_relevancy", "answer_similarity", "RAGAS_relevancy",
    "duration_generation",
]


def _summary_columns():
    """Return the column names of the summary CSV in their fixed order."""
    columns = [
        "llm_name", "max_context_size", "temperature", "prompt_version",
        "num_questions_answered", "percent_questions_answered",
    ]
    for metric in _ROUGE_METRICS:
        columns.append(metric)
        columns.extend(f"{metric}_{subset}" for subset in _SUBSET_NAMES)
    columns.append("response_length")
    for metric in _ANSWER_METRICS:
        columns.append(metric)
        columns.extend(f"{metric}_{subset}" for subset in _SUBSET_NAMES)
    columns.append("duration_generation")
    columns.extend(
        f"{name}_dur"
        for name in (*_ROUGE_METRICS, "response_length", *_ANSWER_METRICS)
    )
    columns.append("duration_all_metrics")
    return columns


def _rounded_numeric_mean(frame):
    """Return the mean of every numeric column of `frame`, rounded to 3 decimals."""
    return frame.select_dtypes(include="number").mean().round(3)


def _subset_means(per_question_df):
    """Return {subset name: rounded numeric column means} for every subset in _SUBSET_NAMES."""
    subsets = {
        # Contiguous ranges of the golden set
        "first_Kersten": per_question_df.head(51),
        "second_Kersten": per_question_df.iloc[51:102],
        "third_others": per_question_df.iloc[102:155],
        "fourth_template_and_new_ISO": per_question_df.tail(9),
    }
    # Topic subsets are selected by explicit row position
    for topic, positions in _TOPIC_INDEXES.items():
        subsets[topic] = per_question_df.iloc[positions]
    return {name: _rounded_numeric_mean(subsets[name]) for name in _SUBSET_NAMES}


def _metric_summary(metric, overall_means, subset_means):
    """Return the summary entries (overall value, then one per subset) for one metric."""
    summary = {metric: overall_means[metric]}
    for subset in _SUBSET_NAMES:
        summary[f"{metric}_{subset}"] = subset_means[subset][metric]
    return summary


def evaluate_generation_llm_only(llms_dict, result_name_path: str, calculate_RAGAS_relevancy: bool, prompt_version: int = 1):
    """
    Evaluates the generation process for all given standalone LLMs.

    For every LLM, each golden question is prompted, the evaluation metrics are
    calculated in bulk, the per-question results are written to their own CSV
    and one row of averaged results is prepended to the summary CSV.

    llms_dict: Array of llm combinations in the form: {"model_provider": "..", "model_name": "..", "temperature": "..", "max_context_size": 1000}
    result_name_path: Where to store the final average results of all indexes
    calculate_RAGAS_relevancy: Whether to calculate the RAGAS answer relevancy; if False, 0 is stored for every question and for its duration
    prompt_version: Which prompt version to use (defaults to 1)

    Returns the summary DataFrame (newest row first), as it was written to disk.

    Raises ValueError if the golden question/answer set is empty.
    Note: the topic subsets use fixed row positions up to 163, so the golden set
    must hold at least 164 pairs, otherwise the positional lookup raises IndexError.
    """
    # Path to store the total averaged results
    full_path_all = f"./../../evaluationResults/generationEval/generation_total/LLM_only/{result_name_path}"
    # Folder for the single (per LLM) results of this run
    single_results_dir = f"./../../evaluationResults/generationEval/generation_single/LLM_only/{result_name_path}"

    # Create both target folders up front so that no result is lost at write time
    os.makedirs(single_results_dir, exist_ok=True)
    os.makedirs(os.path.dirname(full_path_all), exist_ok=True)

    # Check if the df already exists, otherwise create a new one
    try:
        df = pd.read_csv(full_path_all)
    except FileNotFoundError:
        df = pd.DataFrame(columns=_summary_columns())

    # Load the golden question and answer pairs (explicit encoding: do not depend on the platform default)
    with open("./../../evaluationInput/generation_eval/golden_qa_set.json", 'r', encoding="utf-8") as file:
        golden_qa_set = json.load(file)

    qa_pairs = golden_qa_set["qa_set"]
    if not qa_pairs:
        # Fail early with a clear message instead of a ZeroDivisionError after the run
        raise ValueError("The golden question/answer set is empty, nothing to evaluate.")

    question_set = [qa["question"] for qa in qa_pairs]
    golden_answer_set = [qa["golden_answer"] for qa in qa_pairs]

    # Iterate over the given llm combinations
    for llm_comb in llms_dict:
        print(llm_comb)
        # setup_llm returns a second value that is not needed here; the context
        # size recorded in the results is the one given in llm_comb
        llm, _ = setup_llm(llm_comb["model_provider"], llm_comb["model_name"], llm_comb["temperature"])

        # Where to store results later on
        path_index_llm_comb = "LLM_" + llm_comb["model_name"] + "_" + result_name_path
        full_path_index_llm_comb = f"{single_results_dir}/{path_index_llm_comb}"

        # Collect the answers and generation durations for the bulk evaluation methods
        generated_answers = []
        duration_generations = []

        # Iterate over the golden questions
        for position, question in enumerate(question_set, start=1):
            answer, duration_generation = prompt_LLM_only(prompt_version, llm, question, llm_comb["model_provider"])
            generated_answers.append(answer)
            duration_generations.append(duration_generation)

            if position % 50 == 0:
                print("Completed Index ", position)

        # After reviewing all q,a pairs, do the bulk evaluation operations:
        # Calculate evaluation metrics
        start_time = time.time()

        rouge1, rouge1_dur = calculate_rouge_score_bulk(generated_answers, golden_answer_set, "rouge1")
        rouge2, rouge2_dur = calculate_rouge_score_bulk(generated_answers, golden_answer_set, "rouge2")
        rougeL, rougeL_dur = calculate_rouge_score_bulk(generated_answers, golden_answer_set, "rougeL")
        print("Calculated ROUGE")
        response_length, response_length_dur = calculate_response_length_bulk(generated_answers)
        print("Calculated response_length")
        answer_relevancy, answer_relevancy_dur = calculate_answer_relevancy_bulk(generated_answers, question_set)
        print("Calculated answer_relevancy")
        answer_similarity, answer_similarity_dur = calculate_answer_similarity_bulk(generated_answers, golden_answer_set)
        print("Calculated answer_similarity")

        if calculate_RAGAS_relevancy:
            RAGAS_relevancy, RAGAS_relevancy_dur = calculate_answer_relevancy_RAGAS_bulk(question_set, generated_answers)
        else:
            # Placeholder values so that the result layout stays the same
            RAGAS_relevancy = [0] * len(generated_answers)
            RAGAS_relevancy_dur = 0

        print("Calculated RAGAS_relevancy")
        end_time = time.time()
        # Average time spent on all metrics per question
        duration_all_metrics_total = (end_time - start_time) / len(golden_answer_set)

        # Build the per-question dataframe in one go (row-wise .loc appends are quadratic)
        per_question_rows = [
            {
                "llm_name": llm_comb["model_name"],
                "max_context_size": str(llm_comb["max_context_size"]),
                "temperature": str(llm_comb["temperature"]),
                "prompt_version": prompt_version,
                "question": qa_pairs[i]["question"],
                "golden_answer": qa_pairs[i]["golden_answer"],
                "generated_answer": generated_answer,
                "rouge1": rouge1[i],
                "rouge2": rouge2[i],
                "rougeL": rougeL[i],
                "answer_relevancy": answer_relevancy[i],
                "answer_similarity": answer_similarity[i],
                "RAGAS_relevancy": RAGAS_relevancy[i],
                "duration_generation": duration_generations[i],
            }
            for i, generated_answer in enumerate(generated_answers)
        ]
        df_index_llm_comb = pd.DataFrame(per_question_rows, columns=_PER_QUESTION_COLUMNS)

        # Store the dataframe for single combination
        df_index_llm_comb.to_csv(full_path_index_llm_comb, index=False)

        # Every golden question is prompted, so the share of answered questions
        # is always 1.0 here; the columns are kept so the summary layout is unchanged
        questions_answered = len(df_index_llm_comb)
        question_answered_percent = round(questions_answered / len(qa_pairs), 3)

        # Average the metrics over all questions and over each question subset
        overall_means = _rounded_numeric_mean(df_index_llm_comb)
        subset_means = _subset_means(df_index_llm_comb)

        new_row_all = {
            "llm_name": llm_comb["model_name"],
            "max_context_size": llm_comb["max_context_size"],
            "temperature": llm_comb["temperature"],
            "prompt_version": prompt_version,
            "num_questions_answered": questions_answered,
            "percent_questions_answered": question_answered_percent,
        }
        for metric in _ROUGE_METRICS:
            new_row_all.update(_metric_summary(metric, overall_means, subset_means))
        # Method already calculates the average (does not return List[Float])
        new_row_all["response_length"] = response_length
        for metric in _ANSWER_METRICS:
            new_row_all.update(_metric_summary(metric, overall_means, subset_means))
        new_row_all["duration_generation"] = overall_means["duration_generation"]
        new_row_all.update({
            "rouge1_dur": round(rouge1_dur, 3),
            "rouge2_dur": round(rouge2_dur, 3),
            "rougeL_dur": round(rougeL_dur, 3),
            "response_length_dur": round(response_length_dur, 3),
            "answer_relevancy_dur": round(answer_relevancy_dur, 3),
            "answer_similarity_dur": round(answer_similarity_dur, 3),
            "RAGAS_relevancy_dur": round(RAGAS_relevancy_dur, 3),
            "duration_all_metrics": round(duration_all_metrics_total, 3),
        })

        # Prepend the new row to the overall df and save it after every LLM,
        # so finished results survive a failure of a later combination
        new_row_all_df = pd.DataFrame(new_row_all, index=[0])
        df = pd.concat([new_row_all_df, df]).reset_index(drop=True)
        df.to_csv(full_path_all, index=False)

    return df

In [None]:
# Standalone LLM configurations to evaluate; add one entry per model/temperature combination.
llms_dict = [
    dict(
        model_provider="OpenAI",
        model_name="gpt-3.5-turbo-16k",
        temperature=0,
        max_context_size=16385,
    ),
]

# Run the evaluation (including the RAGAS relevancy metric) and store it under "Eval_1".
df = evaluate_generation_llm_only(llms_dict, "Eval_1", calculate_RAGAS_relevancy=True)
# Last expression of the cell: lets the notebook render the summary frame.
df

In [None]:
import pandas as pd

# Do not truncate the column list when the notebook renders the frame.
pd.options.display.max_columns = None

# Reload the stored summary of the "Eval_1" run and display it.
df = pd.read_csv(
    "./../../evaluationResults/generationEval/generation_total/LLM_only/Eval_1"
)
df