# Evaluating the chatbot

The ground truth data was manually created by the team and is stored in an Excel file. The file contains the following information:
1. Question
2. Correct answer
3. The IDs of the relevant chunks that should be returned by the retriever

In [1]:
import pandas as pd
import transformers
from retrying import retry
transformers.logging.set_verbosity_error()
from backend.evaluation.Evaluator import Evaluator
from backend.pipeline.DBHandler import DBHandler

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/student/.config/sagemaker/config.yaml


In [2]:
def _parse_chunk_ids(raw):
    """Turn one ``relevant_chunks_id`` cell into a clean list of ID strings.

    Args:
        raw: The cell value as read by pandas. Usually a comma-separated
            string, but a cell holding a single numeric ID is read as a
            number and a blank cell as NaN.

    Returns:
        list[str]: The IDs with surrounding whitespace removed and empty
        pieces (e.g. from a trailing comma) dropped. Empty for a blank cell.
    """
    if pd.isna(raw):
        return []
    # A lone numeric ID in a column that also has blanks is read as a float
    # (12 -> 12.0); convert back so the ID is "12" rather than "12.0".
    if isinstance(raw, float) and raw.is_integer():
        raw = int(raw)
    return [piece.strip() for piece in str(raw).split(",") if piece.strip()]


# NOTE(review): this retry wraps a local file read, so a missing or malformed
# workbook only fails after two 60-second waits. The "failed after retries"
# message in the evaluation loop suggests the retry may have been meant for the
# evaluation call instead — confirm before moving it.
@retry(stop_max_attempt_number=3, wait_fixed=60*1000)  # 3 attempts, 60 seconds between retries
def get_ground_truth_data(gt_file="./FAQ.xlsx"):
    """Load the manually created ground-truth Q&A set from an Excel file.

    The sheet must contain the columns ``question``, ``answer`` and
    ``relevant_chunks_id`` (comma-separated IDs of the chunks the retriever
    is expected to return for that question).

    Args:
        gt_file: Path to the Excel workbook.

    Returns:
        list[tuple]: One ``(question, answer, relevant_chunk_ids)`` tuple per
        row, where ``relevant_chunk_ids`` is a list of ID strings.

    Raises:
        KeyError: If one of the required columns is missing.
        Any exception raised by ``pd.read_excel`` (after the retries above).
    """
    data = pd.read_excel(gt_file)
    return [
        (question, answer, _parse_chunk_ids(raw_ids))
        for question, answer, raw_ids in zip(
            data["question"], data["answer"], data["relevant_chunks_id"]
        )
    ]

# Load the ground truth once (default path ./FAQ.xlsx); the evaluation loop
# below passes this same list to every configuration.
ground_truth_data = get_ground_truth_data()

In [3]:
def evaluate_chatbot(ground_truth_data, style, embedding_type, search_method, llm_name):
    """Evaluate one chatbot configuration and tag the results with it.

    Args:
        ground_truth_data: Ground-truth items handed to ``Evaluator.evaluate``.
        style: Answer style; an empty string means no style.
        embedding_type: Embedding identifier; selects the organisation
            ``maccabi_<embedding_type>`` in the DB handler.
        search_method: Search method passed to the DB handler.
        llm_name: Name of the LLM passed to the evaluator.

    Returns:
        The object returned by ``Evaluator.evaluate`` (presumably a DataFrame —
        TODO confirm), with ``style``, ``embedding_type``, ``search_method``
        and ``llm_name`` set on it.
    """
    handler = DBHandler(
        org_id=f'maccabi_{embedding_type}',
        user_id='evaluator',
        search_method=search_method,
    )
    scored = Evaluator(handler, style=style, llm_model_name=llm_name).evaluate(ground_truth_data)

    # An empty style is reported as 'neutral' in the results.
    style_label = 'neutral' if style == '' else style
    for column, value in (
        ('style', style_label),
        ('embedding_type', embedding_type),
        ('search_method', search_method),
        ('llm_name', llm_name),
    ):
        scored[column] = value
    return scored

Evaluate the chatbot's answers against the ground truth data. The following configuration dimensions are tested:
1. 3 LLMs
2. 3 embedding types
3. 2 search methods
4. 5 styles

In [4]:
# Configuration grid: every combination of the four lists below is evaluated.
styles = ['', 'kids', 'elderly', 'emoji', 'rhymes'] # empty string means no style
# Three separate identifiers (previously a single comma-joined string, which
# produced one invalid embedding type instead of three).
embedding_types = ['emb1', 'emb2', 'emb3'] # 1: models/text-embedding-004, 2: from HW1 ,3: models/embedding-001
llm_names = ['gemini-1.5-flash']
search_methods = ['approximate', 'exact']

# Accumulates the results of every configuration that completed successfully.
full_results = pd.DataFrame(columns=['style', 'embedding_type', 'search_method', 'llm_name', 'question', 'true_answer', 'chatbot_answer', 'cosine_similarity', 'correctness_score', 'faithfulness_score', 'retriever_scores'])

for style in styles:
    for embedding_type in embedding_types:
        for llm_name in llm_names:
            for search_method in search_methods:
                try:
                    results = evaluate_chatbot(ground_truth_data, style, embedding_type, search_method, llm_name)
                    full_results = pd.concat([full_results, results], ignore_index=True)
                    full_results.to_csv("full_results.csv", index=False) #re-save after each iteration to be safe :)
                except Exception as e:
                    # Best effort: a failing configuration is reported and skipped so
                    # the remaining ones still run. Name the configuration so the
                    # gap in the results can be traced back to it.
                    print(
                        "Function failed after retries "
                        f"(style={style!r}, embedding_type={embedding_type!r}, "
                        f"llm_name={llm_name!r}, search_method={search_method!r}): {e}"
                    )

In [5]:
# Preview the first 20 rows of the accumulated evaluation results.
full_results.head(20)

Unnamed: 0,style,embedding_type,search_method,llm_name,question,true_answer,chatbot_answer,cosine_similarity,correctness_score,faithfulness_score,retriever_scores
0,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,Where Psychodiagnostic diagnosis is provided?,At Maccabi Mental Health Clinics.,At Maccabi Mental Health Clinics.,1.0,0.666667,0.743116,"{'precision': 0.6, 'recall': 1.0, 'f1': 0.75}"
1,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,How much Examination and treatment for vision ...,The price varies depending on the medical inst...,The price varies depending on the medical inst...,0.778664,0.259555,0.71605,"{'precision': 0.8, 'recall': 0.8, 'f1': 0.8}"
2,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,Does doctor's referral is necessary to do a te...,A doctor's referral is necessary.,A doctor's referral is necessary.,1.0,0.666667,0.637192,"{'precision': 0.4, 'recall': 1.0, 'f1': 0.57143}"
3,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,Who is eligible to transportation for treatmen...,Children and youth members of the Maccabi are ...,Children and youth members of the Maccabi who ...,0.894024,0.52023,0.822125,"{'precision': 1.0, 'recall': 1.0, 'f1': 1.0}"
4,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,"Where can I vaccinate my baby for ""Pent-up""?",At milk drop stations of Maccabi\nIf your vill...,"You can vaccinate your baby for ""Pent-up"" at M...",0.81232,0.43744,0.838165,"{'precision': 0.4, 'recall': 1.0, 'f1': 0.57143}"
5,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,Is there aclinic for treatment of eating disor...,There is no clinic for treatment of eating dis...,"I'm sorry, I cannot answer this question. The ...",0.836515,0.612172,0.721907,"{'precision': 1.0, 'recall': 0.14706, 'f1': 0...."
6,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,What is the quarterly cost to visit a specialist?,Visit to a specialist - 36₪ quarterly deductible,36₪,0.713102,0.237701,0.535042,"{'precision': 0.4, 'recall': 0.5, 'f1': 0.44444}"
7,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,What is a “home visit”?,Maccabi members are entitled to video medical ...,"A ""home visit"" is a video call with a doctor o...",0.809305,0.269768,0.712754,"{'precision': 0.6, 'recall': 1.0, 'f1': 0.75}"
8,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,Can I come to a health institutions with a ser...,In accordance with the Law on the Rights of P...,It depends on the specific health institution ...,0.736839,0.578946,0.66272,"{'precision': 0.8, 'recall': 1.0, 'f1': 0.88889}"
9,neutral,models/text-embedding-004,approximate,gemini-1.5-flash,"I want to do clinical mammography, but I don't...","My Maccabi, gold and silver companies interest...",Asuta salons.,0.778071,0.259357,0.581229,"{'precision': 0.4, 'recall': 0.66667, 'f1': 0.5}"
