# Evaluating the chatbot

The ground truth data was manually created by the team and is stored in an Excel file. The file contains the following information:
1. Question
2. Correct answer
3. The IDs of the relevant chunks that the retriever should return, for each embedding type.

In [1]:
from backend.evaluation.Evaluator import Evaluator
from backend.pipeline.DBHandler import DBHandler

import pandas as pd
import transformers
from retrying import retry

transformers.logging.set_verbosity_error()

In [2]:
@retry(stop_max_attempt_number=3, wait_fixed=60*1000)  # 3 attempts, 60 seconds between retries
def get_ground_truth_data(emb, gt_file="./FAQ.xlsx"):
	"""Load the ground-truth question/answer pairs for one embedding type.

	Args:
		emb: Embedding identifier; selects the ``relevant_chunks_id_{emb}`` column.
		gt_file: Path to the Excel workbook holding the ground truth.

	Returns:
		A list of ``(question, answer, relevant_chunk_ids)`` tuples, where
		``relevant_chunk_ids`` is a list of ID strings with surrounding
		whitespace removed and empty entries dropped.
	"""
	data = pd.read_excel(gt_file)
	chunk_column = f"relevant_chunks_id_{emb}"
	QA_list = []
	for _, row in data.iterrows():
		# Rows with a missing value in ANY column are skipped.
		# NOTE(review): this also drops rows whose only gap is in another
		# embedding's chunk-ID column -- confirm that is intended.
		if row.isnull().values.any():
			continue
		raw_ids = row[chunk_column]
		# A cell holding a single ID is read by pandas as a number rather than
		# text; normalise whole-valued floats (e.g. 5.0) so the ID becomes "5".
		if isinstance(raw_ids, float) and raw_ids.is_integer():
			raw_ids = int(raw_ids)
		# Strip each ID individually so "1, 2" yields ["1", "2"], and drop
		# empty pieces produced by stray or trailing commas.
		relevant_chunks_id = [part.strip() for part in str(raw_ids).split(",") if part.strip()]
		QA_list.append((row["question"], row["answer"], relevant_chunks_id))
	return QA_list


In [3]:
def evaluate_chatbot(ground_truth_data, style, embedding_type, search_method, llm_name):
	"""Evaluate one configuration and tag the results with its settings.

	Builds a ``DBHandler`` for the organisation ``s_maccabi_{embedding_type}``
	and an ``Evaluator`` on top of it, runs the evaluation over
	``ground_truth_data`` and records the configuration on the result.

	Args:
		ground_truth_data: Ground-truth items passed straight to ``Evaluator.evaluate``.
		style: Answer style; the empty string is recorded as ``'neutral'``.
		embedding_type: Embedding identifier (also used in the organisation id).
		search_method: Search method forwarded to ``DBHandler``.
		llm_name: LLM model name forwarded to ``Evaluator``.

	Returns:
		The object returned by ``Evaluator.evaluate`` (presumably a DataFrame --
		TODO confirm) with ``style``, ``embedding_type``, ``search_method`` and
		``llm_name`` entries set.
	"""
	handler = DBHandler(
		org_id=f's_maccabi_{embedding_type}',
		user_id='evaluator',
		search_method=search_method,
	)
	scorer = Evaluator(
		handler,
		style=style,
		llm_model_name=llm_name,
		embedding_model_name=embedding_type,
	)
	results = scorer.evaluate(ground_truth_data)

	# Record which configuration produced these results.
	run_settings = {
		'style': 'neutral' if style == '' else style,
		'embedding_type': embedding_type,
		'search_method': search_method,
		'llm_name': llm_name,
	}
	for column, value in run_settings.items():
		results[column] = value
	return results

The chatbot's answers are evaluated against the ground truth data. The following configurations are tested:
1. 3 LLMs
2. 3 embedding types
3. 2 search methods
4. 5 styles

In [6]:
# Configuration grid. The commented-out entries are the remaining options of the
# full grid; uncomment them to include those configurations in the run.
#styles = ['', 'kids', 'elderly', 'emoji', 'rhymes']  # empty string means no style
styles = ['rhymes']
embedding_types = [
	#'emb1',  # models/text-embedding-004
	#'emb2',  # models/embedding-001
	'emb3',  # from HW1: SentenceTransformer('all-MiniLM-L6-v2')
]
llm_names = [
	'gemini-1.5-flash',
	#'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',
	#'mistralai/Mistral-7B-Instruct-v0.1'
]
search_methods = [
	'approximate',
	#'exact'
]

RESULT_COLUMNS = ['style', 'embedding_type', 'search_method', 'llm_name', 'question', 'true_answer', 'chatbot_answer',
				  'cosine_similarity', 'correctness_score', 'faithfulness_score', 'retriever_scores']

# Start from an empty frame so `full_results` exists even if every configuration fails.
full_results = pd.DataFrame(columns=RESULT_COLUMNS)
# Per-configuration result frames. Concatenating only these (and never the empty
# placeholder above) avoids pandas' deprecated empty-entry concat behaviour and
# keeps numeric columns from being upcast to object.
result_frames = []

for style in styles:
	str_style = style if style != '' else 'neutral'
	for embedding_type in embedding_types:
		ground_truth_data = get_ground_truth_data(embedding_type)
		for llm_name in llm_names:
			for search_method in search_methods:
				print(f"Style: {str_style}", end=" | ")
				print(f"embedding_type: {embedding_type}", end=" | ")
				print(f"llm_name: {llm_name}", end=" | ")
				print(f"search_method: {search_method} | ", end=" Status: ")
				try:
					results = evaluate_chatbot(ground_truth_data, style, embedding_type, search_method, llm_name)
					combined = pd.concat(result_frames + [results], ignore_index=True)
					# Only keep the frame once it is known to concatenate cleanly.
					result_frames.append(results)
					# Expected columns first (in their declared order), then any extras.
					ordered = RESULT_COLUMNS + [c for c in combined.columns if c not in RESULT_COLUMNS]
					full_results = combined.reindex(columns=ordered)
					full_results.to_csv("full_results.csv", index=False)  #re-save after each iteration to be safe :)
					print("Done")
				except Exception as e:
					# A failing configuration is reported and skipped so the rest of the grid still runs.
					print("Configuration failed")
					print(f"Error content: {e}", end="\n\n")

Style: rhymes | embedding_type: emb3 | llm_name: gemini-1.5-flash | search_method: approximate |  Status: Done


In [5]:
# Preview the accumulated evaluation results (at most the first 100 rows).
full_results.head(100)

Unnamed: 0,style,embedding_type,search_method,llm_name,question,true_answer,chatbot_answer,cosine_similarity,correctness_score,faithfulness_score,retriever_scores
0,neutral,emb3,approximate,gemini-1.5-flash,Where Psychodiagnostic diagnosis is provided?,At Maccabi Mental Health Clinics.,At Maccabi Mental Health Clinics.,1.000000,0.666667,0.680684,"{'precision': 0.4, 'recall': 1.0, 'f1': 0.57143}"
1,neutral,emb3,approximate,gemini-1.5-flash,How much Examination and treatment for vision ...,The price varies depending on the medical inst...,The price varies depending on the medical inst...,0.538133,0.179378,0.267973,"{'precision': 0.4, 'recall': 0.66667, 'f1': 0.5}"
2,neutral,emb3,approximate,gemini-1.5-flash,Does doctor's referral is necessary to do a te...,A doctor's referral is necessary.,A doctor's referral is necessary.,1.000000,0.666667,0.267384,"{'precision': 0.2, 'recall': 1.0, 'f1': 0.33333}"
3,neutral,emb3,approximate,gemini-1.5-flash,Who is eligible to transportation for treatmen...,Children and youth members of the Maccabi are ...,Children and youth members of the Maccabi who ...,0.936031,0.534233,0.652921,"{'precision': 0.6, 'recall': 0.75, 'f1': 0.66667}"
4,neutral,emb3,approximate,gemini-1.5-flash,"Where can I vaccinate my baby for ""Pent-up""?",At milk drop stations of Maccabi\nIf your vill...,"The ""Pent-up"" vaccine is called the pentavalen...",0.467885,0.322628,0.709212,"{'precision': 0.2, 'recall': 1.0, 'f1': 0.33333}"
...,...,...,...,...,...,...,...,...,...,...,...
95,kids,emb3,exact,meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo,Is there aclinic for treatment of eating disor...,There is no clinic for treatment of eating dis...,Is there a clinic for the treatment of eating ...,0.945627,0.648542,0.537951,"{'precision': 1.0, 'recall': 0.15625, 'f1': 0...."
96,kids,emb3,exact,meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo,What is the quarterly cost to visit a specialist?,Visit to a specialist - 36₪ quarterly deductible,The quarterly cost to visit a specialist is 36₪.,0.673927,0.557976,0.397897,"{'precision': 0.0, 'recall': 0.0, 'f1': 0}"
97,kids,emb3,exact,meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo,What is a “home visit”?,Maccabi members are entitled to video medical ...,"The user's question: What is a ""home visit""?\n...",0.391870,0.130623,0.487274,"{'precision': 0.0, 'recall': 0.0, 'f1': 0}"
98,kids,emb3,exact,meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo,Can I come to a health institutions with a ser...,In accordance with the Law on the Rights of P...,You can bring a service animal to a health ins...,0.706434,0.568811,0.673194,"{'precision': 0.6, 'recall': 1.0, 'f1': 0.75}"
