In [1]:
from torch.utils.jit.log_extract import run_test

from src.backend.modules.ai_assistant.llm_interactor.llm_interactor_test import LLMInteractorTest
from src.backend.modules.ai_assistant.task_executor import TaskExecutor
from src.backend.modules.evaluation.load_test_data.load_test_data import load_test_data
from src.backend.modules.evaluation.run_tests.run_tests import InteractionTestEvaluator
from src.backend.modules.llm.llm_communicator import LLMCommunicator
from src.backend.modules.llm.lm_studio_llm import LMStudioLLM
from src.backend.modules.srs.testsrs.testsrs import TestCard, TestFlashcardManager

In [2]:
# Relative path: it is resolved against the kernel's current working directory.
test_data_path = "../tests/data/tests.json"
# Load the test fixtures; later cells read `test_flashcard_manager` and `interaction` from the result.
test_data = load_test_data(test_data_path)

In [3]:
# Work with the flashcard manager that comes with the loaded test data.
fcm = test_data.test_flashcard_manager

# Names of all decks known to that manager.
[deck.name for deck in fcm.get_all_decks()]

['Empty Deck',
 'Latin Literature',
 'Java Programming',
 'Java Programming 10',
 'python']

In [4]:
# Display the repr of the 'Latin Literature' deck, including its cards.
fcm.get_deck_by_name("Latin Literature")

TestDeck(id=DeckID(numeric_id=1681806090), name='Latin Literature', cards=[TestCard(id=CardID(numeric_id=1210109363), deck=..., question='Who was Virgil?', answer="A Roman poet known for the 'Aeneid'", flag=<Flag.NONE: 'none'>, cardState=<CardState.NEW: 'new'>, fuzzymatch_question=False, fuzzymatch_answer=False), TestCard(id=CardID(numeric_id=4089429325), deck=..., question="What is the 'Aeneid'?", answer="An epic poem about Aeneas's journey", flag=<Flag.NONE: 'none'>, cardState=<CardState.NEW: 'new'>, fuzzymatch_question=False, fuzzymatch_answer=False), TestCard(id=CardID(numeric_id=767443365), deck=..., question='Who was Horace?', answer='A leading Roman lyric poet', flag=<Flag.NONE: 'none'>, cardState=<CardState.NEW: 'new'>, fuzzymatch_question=False, fuzzymatch_answer=False), TestCard(id=CardID(numeric_id=385728564), deck=..., question="What type of work is 'Satires' by Horace?", answer='A collection of satirical poems', flag=<Flag.NONE: 'none'>, cardState=<CardState.NEW: 'new'>, f

In [5]:
# Print the str() form of the fourth card (index 3) of the 'Latin Literature' deck.
print(fcm.get_deck_by_name("Latin Literature").cards[3])

Card card_16fdc034 from the deck Deck 'Latin Literature' (id: deck_643e530a) containing 30 cards
Question:
What type of work is 'Satires' by Horace?

Answer:
A collection of satirical poems

Flag: none
Card State: new


In [6]:
# Alternative model configuration (currently disabled):
# fuzzy_search_llm = LMStudioLLM("meta-llama-3.1-8b-instruct", 0.0, 10)
# task_llm = LMStudioLLM("meta-llama-3.1-8b-instruct", 0.8, 1024)

# Two LLM handles on the same "qwen3-8b" model: one for fuzzy search, one for task execution.
# NOTE(review): the positional arguments are presumably (model name, temperature, max tokens) —
# confirm against LMStudioLLM.
fuzzy_search_llm = LMStudioLLM("qwen3-8b", 0.0, 15, add_no_think=True)
task_llm = LMStudioLLM("qwen3-8b", 0.8, 1024)

# Create two TaskExecutors: one for a blank, mutable fcm, and one for the tests.
# NOTE(review): this rebinds `fcm`, which an earlier cell set to
# `test_data.test_flashcard_manager`. Re-running the earlier deck-inspection cells after this
# cell will therefore use the blank manager instead of the test-data one.
fcm = TestFlashcardManager()
llmi = LLMInteractorTest(fcm, fuzzy_search_llm)
llmc = LLMCommunicator(task_llm)
te = TaskExecutor(llmi, llmc, default_max_errors=0, verbose=True)

# Executor bound to the flashcard manager from the test data; it shares `llmc` with `te`.
test_llmi = LLMInteractorTest(test_data.test_flashcard_manager, fuzzy_search_llm)
test_te = TaskExecutor(test_llmi, llmc, default_max_errors=0, verbose=True)

In [7]:
# Show the system prompt produced by the executor (private helper, called here for inspection only).
print(te._get_system_prompt())

You are an assistant for a flashcard learning system.

## The flashcard system.
The flashcard system contains decks, and each deck is a collection of cards.

About Cards:
    A Card is a representation of a flashcard, containing a question and an answer. The card is uniquely identified by the id.

    Properties:
      id (str): The id uniquely identifies the card. It is represented as "card_xxxx_xxxx", with x being hexadecimal digits.
          The id is the only way to identify a card.
      question (str): The question (frontside) of the card.
      answer (str): The answer (frontside) of the card.
      flag (str): The flag of the card. **Must** be one of:
          none, red, orange, green, blue, pink, turquoise, purple
      cardState (str): The state of the card in the flashcard system. **Must** be one of:
          new, learning, review, suspended, buried
    

About Decks:
    A Deck represents a collection of flashcards.

    Properties:
       id (str): The id uniquely ident

In [8]:
# Send one free-form instruction to `te`, the executor bound to the blank flashcard manager.
# The prompt is intentionally informal and mixes casing for the flag and state names.
te.execute_prompts(
    [
        "Go make new deck name Geography and add a new card (flag: Turquoise) with question What is the capital of France? and answer Paris. The state should be 'New'."
    ],
)

## user
Go make new deck name Geography and add a new card (flag: Turquoise) with question What is the capital of France? and answer Paris. The state should be 'New'.

## assistant
<think>
Okay, let me figure out how to handle this user's request. They want to create a new deck named "Geography" and add a card with specific details.

First, I need to create the deck. The function create_deck is available for that. The name is given as Geography, so I'll call create_deck("Geography"). That should return the deck ID.

Once the deck is created, I can proceed to add the card. The user specified the question as "What is the capital of France?" and the answer as Paris. The flag is Turquoise, and the state should be 'New'. 

Looking at the functions, there's add_card_with_metadata which allows setting the state and flag. So I need to use that function. The parameters required are deck_id_str, question, answer, state, and flag. 

Wait, the user mentioned the state should be 'New', but the vali

## Do the tests!

In [9]:
from src.backend.modules.helpers.matching import match_by_equals, match_by_key

# Pair ints on the left with their string counterparts on the right via a custom equality.
print(match_by_equals([1, 3, 5], ["5", "7", "1", "9"], lambda left, right: left == int(right)))

# The same inputs passed through match_by_key, converting the right-hand items with `int`.
print(
    match_by_key(
        [1, 3, 5],
        ["5", "7", "1", "9"],
        equals=lambda left, right: str(left) == right,
        right_key=int,
    )
)

([(1, '1'), (5, '5')], [3], ['7', '9'])
([(1, '1'), (5, '5')], [3], ['7', '9'])


In [10]:
from src.backend.modules.srs.testsrs.testsrs import Flag, CardState


def foo():
    """Print whether two similar cards are fuzzy-matched by the test evaluator.

    Everything lives inside a function so that the local card variables do not
    end up in the notebook's global scope, where they triggered a flood of
    warnings (even when deleted afterwards).
    """
    manager = TestFlashcardManager()
    test_deck = manager.add_deck("Test")

    # Both cards share the question, flag and state; only the answer wording differs.
    shared_fields = dict(
        question="What is an integer?",
        flag=Flag.NONE,
        card_state=CardState.NEW,
    )

    # The reference card opts into fuzzy matching for both question and answer.
    reference_card = manager.add_full_card(
        test_deck,
        answer="A whole number.",
        fuzzymatch_answer=True,
        fuzzymatch_question=True,
        **shared_fields,
    )

    candidate_card = manager.add_full_card(
        test_deck,
        # lol prompt injection
        answer="A number without any decimals.",
        **shared_fields,
    )

    evaluator = InteractionTestEvaluator(fuzzy_search_llm)
    print(evaluator._fuzzy_match_test_cards(reference_card, candidate_card))


foo()

True


### Execute interaction tests


In [11]:
# Inspect a single interaction test (index 5) from the loaded test data.
test_data.interaction[5]

InteractionTest(name='create_deck', description='', environment=<src.backend.modules.srs.testsrs.testsrs.TestFlashcardManager object at 0x765ab86d46b0>, queries=["Create a new deck with the name 'Business Administration'."], parameters={'<name>': 'Business Administration'}, expected_result=<src.backend.modules.srs.testsrs.testsrs.TestFlashcardManager object at 0x765ab86fe630>, sound_file_names=['create_deck_0_prm_5'])

In [14]:
from typing import Iterable


def execute_interaction_tests(
    indices: Iterable[int] | None = None,
    verbose: bool = True,
    print_progress: bool = True,
    log_file_path: str | None = None,
):
    """Run the loaded interaction tests, or a subset selected by position.

    Args:
        indices: Positions into ``test_data.interaction`` to run; ``None`` runs
            every test. Selected tests keep the order of the test list.
            Duplicate positions and positions that do not exist have no effect.
        verbose: Passed to the evaluator as ``verbose_task_execution``.
        print_progress: Passed through to the evaluator unchanged.
        log_file_path: Passed through to the evaluator unchanged.

    Returns:
        The value returned by ``InteractionTestEvaluator.execute_interaction_tests``.
    """
    if indices is not None:
        wanted_positions = set(indices)
        selected_tests = [
            test for position, test in enumerate(test_data.interaction) if position in wanted_positions
        ]
    else:
        selected_tests = test_data.interaction

    evaluator = InteractionTestEvaluator(fuzzy_search_llm)
    # Each call hands the evaluator an interactor built on a new, empty flashcard manager.
    fresh_interactor = LLMInteractorTest(TestFlashcardManager(), fuzzy_search_llm)

    return evaluator.execute_interaction_tests(
        tests=selected_tests,
        llm_interactor=fresh_interactor,
        task_llm=task_llm,
        default_max_messages=10,
        default_max_errors=5,
        max_stream_messages_per_chunk=10,
        max_stream_errors_per_chunk=2,
        verbose_task_execution=verbose,
        print_progress=print_progress,
        log_file_path=log_file_path,
    )

In [15]:
# Run only the interaction test at index 5, with the default (verbose) settings.
res = execute_interaction_tests([5])

# Display the returned list of test results.
res

Test 0 out of 1 (0.00%): create_deck
## user
Create a new deck with the name 'Business Administration'.

## assistant
<think>
Okay, the user wants to create a new deck named 'Business Administration'. Let me check the available functions. There's a create_deck function that takes a name parameter. The name must be non-empty, which it is. I don't need to list decks first because the user specified the name directly. So I should call create_deck with the given name. No other steps are needed since there's no mention of adding cards or modifying existing ones. Just create the deck.
</think>

<execute>
* create_deck("Business Administration")
</execute>

## user
[Deck 'Business Administration' (id: deck_a791d400) containing 0 cards]

## assistant
<think>
Okay, so the user asked to create a new deck named 'Business Administration'. I used the create_deck function with that name. The system responded with the deck's ID, which is deck_a791d400. Now, the user hasn't mentioned anything else yet

[TestInfo(passed=True, crashed=False, name='create_deck', queries=["Create a new deck with the name 'Business Administration'."], error_messages=[], log_messages=[('user', "Create a new deck with the name 'Business Administration'."), ('assistant', '<think>\nOkay, the user wants to create a new deck named \'Business Administration\'. Let me check the available functions. There\'s a create_deck function that takes a name parameter. The name must be non-empty, which it is. I don\'t need to list decks first because the user specified the name directly. So I should call create_deck with the given name. No other steps are needed since there\'s no mention of adding cards or modifying existing ones. Just create the deck.\n</think>\n\n<execute>\n* create_deck("Business Administration")\n</execute>'), ('user', "[Deck 'Business Administration' (id: deck_a791d400) containing 0 cards]"), ('assistant', "<think>\nOkay, so the user asked to create a new deck named 'Business Administration'. I used th

In [16]:
# Print a human-readable report for the first (and only) result of the run above.
res[0].pretty_print()

##############################################################################################
Test create_deck PASSED.

####################################### Queries ##############################################
 * Create a new deck with the name 'Business Administration'.

####################################### Logs #################################################
Create a new deck with the name 'Business Administration'.


<think>
Okay, the user wants to create a new deck named 'Business Administration'. Let me check the available functions. There's a create_deck function that takes a name parameter. The name must be non-empty, which it is. I don't need to list decks first because the user specified the name directly. So I should call create_deck with the given name. No other steps are needed since there's no mention of adding cards or modifying existing ones. Just create the deck.
</think>

<execute>
* create_deck("Business Administration")
</execute>


[Deck 'Business Adminis

In [15]:
# Show how many interaction tests were loaded.
# (A former check `assert len(test_data.interaction) > 100` was disabled in this cell.)
len(test_data.interaction)

35

In [19]:
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

# Run every interaction test non-verbosely and log the results to a timestamped JSON report.
REPORTS_DIR = Path("reports")
# Create the target directory up front so a fresh checkout does not fail on a missing folder.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Filesystem-safe timestamp: ':' is not a legal filename character on Windows,
# so the time parts are separated with '-' instead.
NOW = datetime.now(ZoneInfo("Europe/Berlin")).strftime("%Y-%m-%d_%H-%M-%S%z")
RES = execute_interaction_tests(
    indices=None,
    verbose=False,
    log_file_path=str(REPORTS_DIR / f"test report {NOW}.json"),
)
RES

Test 0 out of 35 (0.00%): create_deck
Test 1 out of 35 (2.86%): create_deck
Test 2 out of 35 (5.71%): create_deck
Test 3 out of 35 (8.57%): create_deck
Test 4 out of 35 (11.43%): create_deck
Test 5 out of 35 (14.29%): create_deck
Test 6 out of 35 (17.14%): create_deck
Test 7 out of 35 (20.00%): create_deck
Test 8 out of 35 (22.86%): create_deck
Test 9 out of 35 (25.71%): add_card
Test 10 out of 35 (28.57%): add_card
Test 11 out of 35 (31.43%): add_card
Test 12 out of 35 (34.29%): create_add_card_flagged
Test 13 out of 35 (37.14%): create_add_card_flagged
Test 14 out of 35 (40.00%): create_add_card_flagged
Test 15 out of 35 (42.86%): delete_deck
Test 16 out of 35 (45.71%): edit_card_exact
Test 17 out of 35 (48.57%): edit_card_fuzzy
Test 18 out of 35 (51.43%): delete_card
Test 19 out of 35 (54.29%): delete_first_7_cards
Test 20 out of 35 (57.14%): delete_first_3_cards
Test 21 out of 35 (60.00%): create_deck_by_keyword_1
Test 22 out of 35 (62.86%): create_deck_by_keyword_2
Test 23 out of 

[TestInfo(passed=True, crashed=False, name='create_deck', queries=["Create a new deck with the name 'Natural Language Processing'."], error_messages=[], log_messages=[('user', "Create a new deck with the name 'Natural Language Processing'."), ('assistant', '<think>\nOkay, the user wants to create a new deck named \'Natural Language Processing\'. Let me check the available functions. There\'s a create_deck function that takes a name parameter. The name must be a non-empty string, which it is here. I don\'t need to list decks first because the user directly specified the name. So I should call create_deck with that name. No other steps are needed since there\'s no mention of adding cards or modifying existing decks. Just create the deck and return.\n</think>\n\n<execute>\n* create_deck("Natural Language Processing")\n</execute>'), ('user', "[Deck 'Natural Language Processing' (id: deck_3e21e300) containing 0 cards]"), ('assistant', "<think>\nOkay, the user asked to create a new deck name

In [20]:
# One (passed, crashed) tuple per entry of RES.
[(info.passed, info.crashed) for info in RES]

[(True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (True, False),
 (False, False),
 (False, False),
 (False, False),
 (True, False),
 (False, True),
 (False, False),
 (False, True),
 (False, True),
 (False, True),
 (True, False),
 (True, False),
 (False, False),
 (True, False),
 (False, False),
 (False, False),
 (False, False),
 (False, False),
 (False, False),
 (False, False),
 (False, True),
 (False, False),
 (False, False),
 (False, False)]

### Random other tests and snippets

In [17]:
# Minimal assistant reply: two leading newlines followed by an <execute> block with one call.
possible_answer = """

<execute>
* list_decks()
</execute>"""

# Check what the executor's (private) response parser extracts from that reply.
te._parse_llm_response(possible_answer)

[_ParsedLLMCommand(func_name='list_decks', args=[], kwargs={})]

In [18]:
# Case-insensitive search for "change" in both questions and answers.
# NOTE(review): "*" as deck id presumably means "all decks" — confirm against search_for_substring.
substring_search_res = test_llmi.search_for_substring(
    deck_id_str="*",
    search_substring="change",
    search_in_question=True,
    search_in_answer=True,
    case_sensitive=False,
)

# The test data is expected to contain exactly five matching cards.
assert len(substring_search_res.cards) == 5

# Print every hit, each followed by a separator line.
for c in substring_search_res.cards:
    print(c, "\n===============================\n", sep="\n")

Card card_340b0716 from the deck Deck 'Latin Literature' (id: deck_96524650) containing 30 cards
Question:
What does Apollo and Daphne\'s myth reveal about boundaries between pursuit and violation\?

Answer:
Explores unrequited desire and forced change.

Flag: none
Card State: new


Card card_422cb46c from the deck Deck 'Latin Literature' (id: deck_96524650) containing 30 cards
Question:
Why is the final transformation in Book XV of Metamorphoses significant to the poem\'s theme of change\?

Answer:
Cements the perpetual cycle of creation and renewal.

Flag: none
Card State: new


Card card_4b0da474 from the deck Deck 'Java Programming' (id: deck_ff0b969d) containing 103 cards
Question:
What does 'volatile' mean for a variable?

Answer:
Changes to the variable are always visible to other threads.

Flag: none
Card State: new


Card card_3e0b2990 from the deck Deck 'python' (id: deck_17c7bc52) containing 86 cards
Question:
When would you use a tuple over a list?

Answer:
When you need an