In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
# OPENAI_API_KEY is read from the environment in a later cell.
load_dotenv()

True

In [None]:
import os
import json
import time

from datasets import Dataset
from ragas.metrics import answer_correctness
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.run_config import RunConfig
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [13]:
# Load the previously saved Q&A results; later cells index `results` by column name.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open("./data/qna_results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

In [None]:
# The API key is looked up in the environment rather than written into the notebook.
api_key = os.getenv("OPENAI_API_KEY")

# Chat model, wrapped so it can be passed to ragas as the evaluator LLM.
chat_model = ChatOpenAI(
    model="gpt-4o",
    api_key=api_key,
)
evaluator_llm = LangchainLLMWrapper(chat_model)

# Embedding model handed to the evaluation alongside the LLM.
embedder = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=api_key,
)

# Only two workers, so the run stays below the tokens-per-minute throughput limit.
run_config = RunConfig(max_workers=2)

In [15]:
# Sanity check: number of entries under the "question" key of the loaded results.
len(results["question"])

200

In [None]:
# Score every generated-answer column against the reference answers with the
# answer_correctness metric, writing one JSON file of scores per column.

# Seconds to wait after each evaluation run, as a crude rate-limiting guard.
RATE_LIMIT_PAUSE_S = 10

# Answer columns are the keys of `results` whose name ends in "text" or "image".
answers_keys = [k for k in results if k.endswith(("text", "image"))]
for answer_key in answers_keys:
    data_samples = {
        'question': results["question"],
        'answer': results[answer_key],
        # The "answer" key of `results` is used as the reference for scoring.
        'ground_truth': results["answer"],
    }

    dataset = Dataset.from_dict(data_samples)
    eval_results = evaluate(
        dataset,
        llm=evaluator_llm,
        embeddings=embedder,
        metrics=[answer_correctness],
        run_config=run_config,
    )

    # NOTE(review): assumes eval_results.scores is JSON-serializable in the
    # installed ragas version — confirm.
    with open(f"./data/eval_{answer_key}.json", "w", encoding="utf-8") as f:
        json.dump(eval_results.scores, f)

    # Report before pausing so the summary is not held back by the sleep.
    print(f"{answer_key} | {eval_results}")
    time.sleep(RATE_LIMIT_PAUSE_S)