In [None]:
import sys
import os
from dotenv import load_dotenv

# must happen before other imports!
# Put the parent directory on sys.path (the later cells import from `src.*`),
# then read the two dotenv files that live next to it.
parent_dir = os.path.abspath("../")
sys.path.append(parent_dir)
load_dotenv(dotenv_path="../.env")
load_dotenv(dotenv_path="../.env.db")

In [None]:
# also should happen before other imports.
import logging

# force=True: basicConfig() is otherwise a silent no-op when the root logger
# already has handlers (e.g. the cell is re-run, or a handler was installed
# before this cell executed), which would leave the INFO level unapplied.
logging.basicConfig(level=logging.INFO, force=True)

In [None]:
from zoneinfo import ZoneInfo
from datetime import datetime
from src.backend.modules.search.llama_index import LlamaIndexExecutor
from src.backend.modules.ai_assistant.llm_interactor.llm_interactor_test import LLMInteractorTest
from src.backend.modules.asr.local_whisper_asr import LocalWhisperASR
from src.backend.modules.evaluation.run_tests.EvaluationPipeline import EvaluationPipeline
from src.backend.modules.llm.lm_studio_llm import LMStudioLLM
from src.backend.modules.srs.testsrs.testsrs import TestFlashcardManager


In [None]:
# define llms
# Both LLM objects use the same model name; they differ only in their default
# temperature and default max-token settings.
LLM_MODEL_NAME = "Meta-Llama-3.1-8B-Instruct"
task_llm = LMStudioLLM(model=LLM_MODEL_NAME, default_temperature=0.8, default_max_tokens=2048)
comparison_llm = LMStudioLLM(model=LLM_MODEL_NAME, default_temperature=0.0, default_max_tokens=50)

# Timestamp (Europe/Berlin, with UTC offset) used to give each run its own log file.
now = datetime.now(ZoneInfo("Europe/Berlin")).strftime("%Y-%m-%d %H:%M:%S %z")

# ':' is not a legal file-name character on Windows, so the log file name uses
# '-' in the time part. `now` itself is left unchanged for any other use.
log_file_stamp = now.replace(":", "-")

# Create the log directory up front so a fresh checkout does not fail on a
# missing folder. NOTE(review): harmless if EvaluationPipeline already does this.
LOG_DIR = "../data/logs"
os.makedirs(LOG_DIR, exist_ok=True)

# The comparison LLM is reused for three roles: inside the LLM interactor,
# as the fuzzy-matching LLM, and as the LLM judge.
eval_pipeline = EvaluationPipeline(
    asr=LocalWhisperASR("openai/whisper-medium"),
    llm_interactor=LLMInteractorTest(TestFlashcardManager(), comparison_llm, LlamaIndexExecutor()),
    task_llm=task_llm,
    fuzzy_matching_llm=comparison_llm,
    llm_judge=comparison_llm,
    audio_recording_dir_path="../data/recording_data/fabian",
    default_max_messages=15,
    default_max_errors=4,
    max_stream_messages_per_chunk=5,
    max_stream_errors_per_chunk=3,
    verbose_task_execution=True,
    print_progress=True,
    log_file_path=f"{LOG_DIR}/{log_file_stamp} evaluation_log.json"
)

In [None]:
from src.backend.modules.evaluation.load_test_data.load_test_data import load_test_data

# Read the test definitions from the JSON file under ../tests/data.
test_data_path = "../tests/data/tests.json"
tests = load_test_data(test_data_path)

In [None]:
# Full run over every loaded test (left disabled; uncomment to execute it):
# eval_pipeline.evaluate(tests)

In [None]:
# Pick a single question-answering test by position so it can be run on its own.
qa_test_index = 51
some_q_a_test = tests.question_answering[qa_test_index]

# Bare expression: shows the selected test as the cell output.
some_q_a_test

In [None]:
# Run the pipeline on just the selected test; the API takes a list of tests.
selected_tests = [some_q_a_test]
RES = eval_pipeline.evaluate_individual_tests(selected_tests)

# Bare expression: shows the returned results as the cell output.
RES

In [None]:
# Only one test was evaluated, so look at the first entry of the results.
first_result = RES[0]
first_result.pretty_print()