In [1]:
import sys
import os
from dotenv import load_dotenv

# must happen before other imports!
# Put the parent directory on sys.path so the `src.` package imports in later cells
# can be resolved. The membership check keeps the cell idempotent: re-running it no
# longer appends the same path again.
project_root = os.path.abspath("../")
if project_root not in sys.path:
    sys.path.append(project_root)

# Load both env files; paths are relative to the notebook's working directory.
load_dotenv("../.env")
load_dotenv("../.env.db")

True

In [2]:
# also should happen before other imports.
# Configures the root logger at INFO level.
# NOTE(review): presumably placed this early so that modules imported in later cells
# log through this configuration — confirm.
import logging

logging.basicConfig(level=logging.INFO)

In [3]:
from zoneinfo import ZoneInfo
from datetime import datetime

In [None]:
from src.backend.modules.asr.local_whisper_asr import LocalWhisperASR
from src.backend.modules.evaluation.run_tests.evaluation_pipeline import EvaluationPipeline
from src.backend.modules.llm.lm_studio_llm import LMStudioLLM

# Both LLM roles use the same model name; they differ only in temperature and token budget.
llm_model_name = "Meta-Llama-3.1-8B-Instruct"

# LLM handed to the pipeline as `task_llm`.
task_llm = LMStudioLLM(
    model=llm_model_name,
    default_temperature=0.05,
    default_max_tokens=2048,
)
# LLM handed to the pipeline as both `fuzzy_matching_llm` and `llm_judge`.
comparison_llm = LMStudioLLM(
    model=llm_model_name,
    default_temperature=0.0,
    default_max_tokens=50,
)

# Europe/Berlin timestamp (including UTC offset) embedded in this run's log file name.
# NOTE(review): the formatted value contains spaces and colons — confirm that is
# acceptable on every filesystem the logs are written to.
berlin_time = datetime.now(ZoneInfo("Europe/Berlin"))
now = berlin_time.strftime("%Y-%m-%d %H:%M:%S %z")

# Gather the pipeline arguments first, then build the pipeline from them.
pipeline_kwargs = dict(
    asr=LocalWhisperASR("openai/whisper-medium"),
    task_llm=task_llm,
    fuzzy_matching_llm=comparison_llm,
    llm_judge=comparison_llm,
    audio_recording_dir_path="../data/recording_data/fabian",
    verbose_task_execution=True,
    print_progress=True,
    log_file_path=f"../data/logs/{now} evaluation_log.json",
)
eval_pipeline = EvaluationPipeline(**pipeline_kwargs)

  from .autonotebook import tqdm as notebook_tqdm
2025-06-15 06:26:55.550432: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749961615.567014  290762 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749961615.572475  290762 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749961615.586278  290762 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749961615.586294  290762 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749961615.586296  290762

In [5]:
from src.backend.modules.evaluation.load_test_data.load_test_data import load_test_data

# Load the test definitions from the JSON file; later cells read
# `tests.question_answering` and `tests.interaction`.
test_data_path = "../tests/data/tests.json"
tests = load_test_data(test_data_path)

# Question Answering

In [6]:
# Evaluation over the complete loaded `tests` object, left disabled in this notebook;
# the cells below evaluate sampled subsets via `evaluate_individual_tests` instead.
# NOTE(review): presumably disabled because a full run is slow — confirm before re-enabling.
# eval_pipeline.evaluate(tests)

In [7]:
# Pick a single question-answering test by position and display it to see
# which fields a test record carries.
q_a_example_index = 51
some_q_a_test = tests.question_answering[q_a_example_index]

some_q_a_test

QuestionAnsweringTest(name='java_26', category='java', description='', environment=<src.backend.modules.srs.testsrs.testsrs.TestFlashcardManager object at 0x7e036eab6cc0>, queries=['What are the primary mechanisms for handling and managing exceptions in Java?'], expected_answer='Use try/catch blocks, possibly a finally block, or rethrow them.', sound_file_names=['java_26_0'])

In [8]:
import pandas as pd

# Wrap the question-answering tests in a Series so pandas can draw the sample;
# the fixed random_state makes the 30-test subset reproducible.
q_a_tests_all = pd.Series(tests.question_answering)
q_a_tests_sample = q_a_tests_all.sample(n=30, random_state=2308421)

In [None]:
# Run the pipeline on the sampled question-answering tests and keep the
# per-test results in Q_RES for the analysis cells below.
q_a_test_list = q_a_tests_sample.to_list()
Q_RES = eval_pipeline.evaluate_individual_tests(q_a_test_list)

Q_RES

In [13]:
# Count how many question-answering results have `crashed` set vs. unset.
q_a_crashed_flags = pd.Series([result.crashed for result in Q_RES])
q_a_crashed_flags.value_counts()

False    30
Name: count, dtype: int64

In [14]:
# Count how many question-answering results passed vs. did not pass.
q_a_passed_flags = pd.Series([result.passed for result in Q_RES])
q_a_passed_flags.value_counts()

True     28
False     2
Name: count, dtype: int64

In [15]:
# Keep only the question-answering results that did not pass, for inspection below.
failed = [result for result in Q_RES if not result.passed]

In [18]:
# Print the full report of the first failed question-answering test.
# The guard keeps the cell from raising IndexError on runs where every test passed.
if failed:
    failed[0].pretty_print()
else:
    print("No failed question-answering tests to display.")

##############################################################################################
Test java_30 FAILED. Audio file available: False

####################################### Queries ##############################################
Original:    What is the default value assigned to an uninitialized integer variable declared as a class field in Java?
Transcribed: What is the default value assigned to an uninitialized integer variable declared as a class field in Java?


####################################### Response #############################################
Question Answer:
The default value assigned to an uninitialized integer variable declared as a class field in Java is zero.


####################################### History ##############################################
None
     -------------------------------------------------------------------------      
<src.backend.modules.ai_assistant.action_states.StateQuestion object at 0x7e036e437ef0>
     -------------------

In [None]:
# Collect the question-answering results flagged as crashed and show how many there are.
RES_CRASHED = [result for result in Q_RES if result.crashed]
len(RES_CRASHED)

In [None]:
# Print the report of one crashed question-answering test (position 3 in RES_CRASHED).
# Bounds-checked: the list is often shorter than four entries (or empty), in which
# case the unguarded index would raise IndexError and abort a Run-All.
crashed_example_index = 3
if crashed_example_index < len(RES_CRASHED):
    RES_CRASHED[crashed_example_index].pretty_print()
else:
    print(
        f"Only {len(RES_CRASHED)} crashed test(s); "
        f"nothing to display at index {crashed_example_index}."
    )

# Interaction

In [10]:
import pandas as pd

# Draw one interaction test reproducibly (fixed random_state), run it through the
# pipeline, and display the resulting list of results.
interaction_tests_all = pd.Series(tests.interaction)
interaction_tests_sample = interaction_tests_all.sample(n=1, random_state=2308421)
interaction_test_list = interaction_tests_sample.to_list()
I_RES = eval_pipeline.evaluate_individual_tests(interaction_test_list)

I_RES

Total test 0 out of 1 (0.00%): <class 'src.backend.modules.evaluation.load_test_data.load_test_data.InteractionTest'> create_deck


INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"


Current state: <src.backend.modules.ai_assistant.action_states.StateAction object at 0x7f7b5d6cc890>
Current state: <src.backend.modules.ai_assistant.action_states.StateTask object at 0x7f7b5d705ee0>
Current state: <src.backend.modules.ai_assistant.action_states.StateTaskNoSearch object at 0x7f7b5c85bc20>


INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"


Current state: <src.backend.modules.ai_assistant.action_states.StateFinishedTask object at 0x7f7f79b063c0>


[TestEvalResult(passed=False, crashed=True, name='create_deck', audio_files_available=True, original_queries=['Go make new deck name Computer Vision.'], transcribed_queries=[' Go make a new deck named Computer Vision.'], question_answer=None, task_finish_message=None, state_history=['None', '<src.backend.modules.ai_assistant.action_states.StateTask object at 0x7f7b5d705ee0>', '<src.backend.modules.ai_assistant.action_states.StateTaskNoSearch object at 0x7f7b5c85bc20>', '<src.backend.modules.ai_assistant.action_states.StateFinishedTask object at 0x7f7f79b063c0>'], error_messages=['Exception raised: Can\'t instantiate abstract class CloudLectureTranslatorASR without an implementation for abstract method \'transcribe_wav_file\'.\n\nStack trace:\nTraceback (most recent call last):\n  File "/home/fab/Dokumente/Schule/Universität/12. Semester inkl Ilias/Info LLM Praktikum/the-curator/src/backend/modules/evaluation/run_tests/EvaluationPipeline.py", line 174, in _evaluate_test\n    comparator 

In [11]:
# Count how many interaction results have `crashed` set vs. unset.
interaction_crashed_flags = pd.Series([result.crashed for result in I_RES])
interaction_crashed_flags.value_counts()

True    1
Name: count, dtype: int64

In [12]:
# Print the full report for the first interaction result.
first_interaction_result = I_RES[0]
first_interaction_result.pretty_print()

##############################################################################################
Test create_deck CRASHED. Audio file available: True

####################################### Queries ##############################################
Original:    Go make new deck name Computer Vision.
Transcribed:  Go make a new deck named Computer Vision.


####################################### Response #############################################


####################################### History ##############################################
None
     -------------------------------------------------------------------------      
<src.backend.modules.ai_assistant.action_states.StateTask object at 0x7f7b5d705ee0>
     -------------------------------------------------------------------------      
<src.backend.modules.ai_assistant.action_states.StateTaskNoSearch object at 0x7f7b5c85bc20>
     -------------------------------------------------------------------------      
<src.backend.modul