In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
import os

import pandas as pd
from tqdm import tqdm

# Resolve the directory two levels above the current working directory.
two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
parent_dir = os.path.abspath(two_levels_up)

# Put that directory at the front of the import path so the `src.*`
# imports that follow are resolved against it first.
sys.path[:0] = [parent_dir]

from src.app.assistant import rag_workflow, evaluate_relevance
from src.client_modules.llms.openai.azure_openai import AzureOpenAIClient
from src.client_modules.embeddings.azure_openai import AzureOpenAIEmbeddingModel

In [5]:
# Load the evaluation ground truth, draw a seeded (reproducible) subset of
# 200 rows, and convert it to a list of per-row dicts for the loops below.
ground_truth = pd.read_csv("../../data/evaluation_ground_truth.csv")
df_sample = ground_truth.sample(random_state=1, n=200)
sample = df_sample.to_dict("records")

In [6]:
# Point GENERATION_TEMPLATE_PATH at the templates directory (relative to the
# notebook's working directory).
# NOTE(review): this is set after `src.app.assistant` has already been
# imported; confirm the variable is read at call time rather than at import.
os.environ.update({"GENERATION_TEMPLATE_PATH": "../../templates/"})

In [None]:
# Evaluate the RAG pipeline: generate an answer for each sampled question,
# have the LLM judge its relevance, and collect the results.
# NOTE(review): the three labels below are not passed to any call in this
# cell; confirm how the client actually selects the 'gpt-4o' model.
llm_client_name, llm_model_name = 'openai', 'gpt-4o'
model_choice = 'openai/gpt-4o'
evaluations = []

llm_client = AzureOpenAIClient()
embedding_client = AzureOpenAIEmbeddingModel()

# NOTE(review): only the first 10 sampled records are evaluated here.
for row in tqdm(sample[:10]):
    query = row['question']
    completion = rag_workflow(query, llm_client, embedding_client=embedding_client)
    answer = completion.choices[0].message.content
    # evaluate_relevance returns three values; the third is not used here.
    verdict, rationale, _ = evaluate_relevance(query, answer, llm_client)
    evaluations.append(
        (row, answer, {"Relevance": verdict, "Explanation": rationale})
    )

In [27]:
# Flatten the (record, answer, evaluation) tuples into a tidy table with
# columns: answer, id, question, relevance, explanation.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

# (new column, dict-valued source column, key to extract)
extractions = [
    ('id', 'record', 'id'),
    ('question', 'record', 'question'),
    ('relevance', 'evaluation', 'Relevance'),
    ('explanation', 'evaluation', 'Explanation'),
]
for target, source, key in extractions:
    # Bind `key` as a default so each lambda uses its own key.
    df_eval[target] = df_eval[source].apply(lambda d, key=key: d[key])

# The nested dict columns are no longer needed once unpacked.
df_eval = df_eval.drop(columns=['record', 'evaluation'])

In [None]:
# Display the flattened evaluation table as the cell output.
df_eval

In [None]:
# Share of answers per relevance label (fractions rather than raw counts).
df_eval['relevance'].value_counts(normalize=True)

In [17]:
# Persist the gpt-4o evaluation results.
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError whenever the directory already exists (e.g. on a second
# run of the notebook).
os.makedirs('../../data/evaluation', exist_ok=True)
df_eval.to_csv('../../data/evaluation/rag-eval-gpt-4o.csv', index=False)

In [None]:
# Repeat the evaluation for the 'gpt-35-turbo' configuration, this time over
# the full sample, collecting results in `evaluations2`.
# NOTE(review): AzureOpenAIClient() is constructed exactly as in the gpt-4o
# cell and the labels below are not passed to any call; confirm that the
# model actually changes for this run.
llm_model_name, model_choice = 'gpt-35-turbo', 'openai/gpt-35-turbo'
evaluations2 = []

llm_client = AzureOpenAIClient()
embedding_client = AzureOpenAIEmbeddingModel()

for row in tqdm(sample):
    query = row['question']
    completion = rag_workflow(query, llm_client, embedding_client=embedding_client)
    answer = completion.choices[0].message.content
    # evaluate_relevance returns three values; the third is not used here.
    verdict, rationale, _ = evaluate_relevance(query, answer, llm_client)
    evaluations2.append(
        (row, answer, {"Relevance": verdict, "Explanation": rationale})
    )

In [14]:
# Flatten the gpt-35-turbo results into a tidy table with columns:
# answer, id, question, relevance, explanation.
# Fix: build the frame from `evaluations2` (this run's results). The
# original used `evaluations`, which silently duplicated the gpt-4o results
# into the gpt-35-turbo table and its exported CSV.
df_eval2 = pd.DataFrame(evaluations2, columns=['record', 'answer', 'evaluation'])

# Pull the identifying fields out of each ground-truth record dict.
df_eval2['id'] = df_eval2.record.apply(lambda d: d['id'])
df_eval2['question'] = df_eval2.record.apply(lambda d: d['question'])

# Pull the judge's verdict and its explanation out of each evaluation dict.
df_eval2['relevance'] = df_eval2.evaluation.apply(lambda d: d['Relevance'])
df_eval2['explanation'] = df_eval2.evaluation.apply(lambda d: d['Explanation'])

# The nested dict columns are no longer needed once unpacked.
del df_eval2['record']
del df_eval2['evaluation']

In [None]:
# Share of answers per relevance label (fractions rather than raw counts).
df_eval2['relevance'].value_counts(normalize=True)

In [19]:
# Persist the gpt-35-turbo evaluation results.
# Create the output directory here as well (idempotently), so this cell does
# not depend on an earlier cell having created it.
os.makedirs('../../data/evaluation', exist_ok=True)
df_eval2.to_csv('../../data/evaluation/rag-eval-gpt-35-turbo.csv', index=False)