In [None]:
import sys
import os
from dotenv import load_dotenv

# Bootstrap step — must run before any other import in the notebook.
# The parent directory goes onto sys.path so the `src.backend...` imports in
# later cells can be resolved; afterwards both dotenv files are loaded.
project_root = os.path.abspath("../")
sys.path.append(project_root)

for env_file in ("../.env", "../.env.db"):
    load_dotenv(env_file)

In [None]:
# Configure the root logger at INFO level. Like the path/env bootstrap, this
# should also happen before the project imports in the cells below.
# NOTE(review): presumably so that loggers created while those modules are
# imported already see the INFO configuration — confirm.
import logging

logging.basicConfig(level=logging.INFO)

In [None]:
from zoneinfo import ZoneInfo
from datetime import datetime

In [None]:
from src.backend.modules.asr.local_whisper_asr import LocalWhisperASR
from src.backend.modules.evaluation.run_tests.evaluation_pipeline import EvaluationPipeline
from src.backend.modules.llm.lm_studio_llm import LMStudioLLM

# LLM setup: both roles use the same model name and differ only in the
# default temperature and token budget.
LLM_MODEL_NAME = "Meta-Llama-3.1-8B-Instruct"
task_llm = LMStudioLLM(
    model=LLM_MODEL_NAME,
    default_temperature=0.05,
    default_max_tokens=2048,
)
comparison_llm = LMStudioLLM(
    model=LLM_MODEL_NAME,
    default_temperature=0.0,
    default_max_tokens=50,
)

# Timestamp in Europe/Berlin time (including UTC offset); it becomes part of
# the log file name below.
berlin_time = datetime.now(ZoneInfo("Europe/Berlin"))
now = berlin_time.strftime("%Y-%m-%d %H:%M:%S %z")

# Assemble the pipeline. `comparison_llm` serves both as the fuzzy-matching
# LLM and as the LLM judge.
whisper_asr = LocalWhisperASR("openai/whisper-medium")
evaluation_log_path = f"../data/logs/{now} evaluation_log.json"

eval_pipeline = EvaluationPipeline(
    asr=whisper_asr,
    task_llm=task_llm,
    fuzzy_matching_llm=comparison_llm,
    llm_judge=comparison_llm,
    audio_recording_dir_path="../data/recording_data/fabian",
    verbose_task_execution=True,
    print_progress=True,
    log_file_path=evaluation_log_path,
)

In [None]:
from src.backend.modules.evaluation.load_test_data.load_test_data import load_test_data

# Load the test definitions from the JSON file in the repository's tests folder.
TEST_DATA_PATH = "../tests/data/tests.json"
tests = load_test_data(TEST_DATA_PATH)

# Question Answering

In [None]:
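# Evaluation over the complete `tests` object; left commented out — the cells
# below evaluate sampled subsets instead. Uncomment to run it.
# eval_pipeline.evaluate(tests)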

In [None]:
# Pick one question-answering test by position and display it for inspection.
Q_A_EXAMPLE_INDEX = 51
some_q_a_test = tests.question_answering[Q_A_EXAMPLE_INDEX]

some_q_a_test

In [None]:
import pandas as pd

# Draw a fixed-size random sample of the question-answering tests; the fixed
# seed keeps the selection identical across runs.
Q_A_SAMPLE_SIZE = 30
Q_A_SAMPLE_SEED = 2308421
q_a_tests_series = pd.Series(tests.question_answering)
q_a_tests_sample = q_a_tests_series.sample(n=Q_A_SAMPLE_SIZE, random_state=Q_A_SAMPLE_SEED)

In [None]:
# Run the sampled question-answering tests through the pipeline and display
# the returned results.
q_a_tests_to_run = q_a_tests_sample.to_list()
Q_RES = eval_pipeline.evaluate_individual_tests(q_a_tests_to_run)

Q_RES

In [None]:
# Tally how many question-answering results carry each `crashed` value.
q_a_crashed_flags = [result.crashed for result in Q_RES]
pd.Series(q_a_crashed_flags).value_counts()

In [None]:
# Tally how many question-answering results carry each `passed` value.
q_a_passed_flags = [result.passed for result in Q_RES]
pd.Series(q_a_passed_flags).value_counts()

In [None]:
# Keep only the question-answering results whose `passed` flag is falsy.
failed = list(filter(lambda result: not result.passed, Q_RES))

In [None]:
# Show the first failed question-answering result, if there is one.
# `failed` is empty whenever every sampled test passed; indexing it blindly
# would raise IndexError and break a Restart & Run All. A conditional
# *expression* (not an if-statement) is used so that any value returned by
# pretty_print() is still displayed as the cell output, exactly as before.
failed[0].pretty_print() if failed else print("No failed question-answering tests to show.")

In [None]:
# Collect the question-answering results whose `crashed` flag is truthy and
# display how many there are.
RES_CRASHED = list(filter(lambda result: result.crashed, Q_RES))
len(RES_CRASHED)

In [None]:
# Show the crashed result at position 3 (the fourth one) when it exists.
# The number of crashes varies from run to run, so the fixed index is only
# used if the list is long enough; otherwise a short notice is printed instead
# of raising IndexError. A conditional *expression* keeps the original display
# behaviour of pretty_print()'s return value as the cell output.
(
    RES_CRASHED[3].pretty_print()
    if len(RES_CRASHED) > 3
    else print(f"Only {len(RES_CRASHED)} crashed test(s) available; no entry at index 3.")
)

# Interaction

In [None]:
import pandas as pd

# Sample a single interaction test (fixed seed for a repeatable pick), run it
# through the pipeline and display the returned results.
INTERACTION_SAMPLE_SIZE = 1
INTERACTION_SAMPLE_SEED = 2308421
interaction_tests_series = pd.Series(tests.interaction)
interaction_tests_sample = interaction_tests_series.sample(
    n=INTERACTION_SAMPLE_SIZE,
    random_state=INTERACTION_SAMPLE_SEED,
)
interaction_tests_to_run = interaction_tests_sample.to_list()
I_RES = eval_pipeline.evaluate_individual_tests(interaction_tests_to_run)

I_RES

In [None]:
# Tally how many interaction results carry each `crashed` value.
interaction_crashed_flags = [result.crashed for result in I_RES]
pd.Series(interaction_crashed_flags).value_counts()

In [None]:
# Pretty-print the first (and, with a sample size of one, expected only)
# interaction result.
first_interaction_result = I_RES[0]
first_interaction_result.pretty_print()