In [None]:
import sys
import os

# Extend the import search path with the directory two levels above the
# current working directory so the `src.backend...` import below can resolve
# (presumably the repository root — verify against the project layout).
sys.path.append(os.path.abspath("../.."))

import os
import json

import pandas as pd
from IPython.display import display, HTML, Markdown
import markdown

from src.backend.modules.evaluation.run_tests.test_eval_result import TestEvalResult


def print_markdown(test_eval_result: TestEvalResult):
    """Display a test evaluation result as styled HTML in the notebook output.

    The result's Markdown report (obtained from ``to_markdown()``) is converted
    to HTML and wrapped in a centered, width-limited ``<div>`` before being
    handed to IPython's ``display``.

    Args:
        test_eval_result: The evaluation result whose report should be shown.
    """
    # Convert the Markdown report first; the wrapper below only adds styling.
    report_html = markdown.markdown(test_eval_result.to_markdown())
    styled_fragment = f"""
    <div style="max-width: 900px; margin: auto; font-size: 14px;">
    {report_html}
    </div>
    """
    display(HTML(styled_fragment))


# Attach the helper to the class as a method so that any result instance can
# be rendered with `result.print_markdown()`.
TestEvalResult.print_markdown = print_markdown


# Interaction Tests

In [None]:

from src.backend.modules.llm.types import TokenUsage


# Directory holding the evaluation logs, relative to the notebook's working
# directory. Defined once so listing and opening always agree.
LOG_DIR = "../../data/logs"

# Only regular files are candidates: a sub-directory cannot be opened as a log.
log_file_names = [
    name
    for name in os.listdir(LOG_DIR)
    if os.path.isfile(os.path.join(LOG_DIR, name))
]
if not log_file_names:
    # Fail with a clear message instead of the opaque ValueError from max([]).
    raise FileNotFoundError(f"No evaluation log files found in '{LOG_DIR}'.")

# The lexicographically greatest file name is selected (presumably the most
# recent run if names start with a sortable timestamp — TODO confirm).
eval_file_to_read = max(log_file_names)
print(f"Reading evaluation log from '{eval_file_to_read}'.")

# Read the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding; the file is closed as soon as it is parsed.
with open(os.path.join(LOG_DIR, eval_file_to_read), "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Turn every raw record into a TestEvalResult. A dict-valued 'token_usage'
# entry is replaced (in place, inside `raw_data`) by a TokenUsage object before
# the record is passed on as keyword arguments; missing counts default to 0.
RES = []
for item in raw_data:
    if 'token_usage' in item and isinstance(item['token_usage'], dict):
        item['token_usage'] = TokenUsage(
            prompt_tokens=item['token_usage'].get('prompt_tokens', 0),
            completion_tokens=item['token_usage'].get('completion_tokens', 0)
        )
    RES.append(TestEvalResult(**item))


In [None]:
# Add up the per-test durations, then report the total next to the test count.
total_time_taken_s = sum(result.time_taken_s for result in RES)
print(f"Total time taken: {total_time_taken_s:.2f} seconds.")
print(f"Total tests run: {len(RES)}.")

In [None]:
# Label every result; a crash takes precedence over the pass/fail flag.
raw = [
    ("passed" if r.passed else "failed") if not r.crashed else "crashed"
    for r in RES
]

# One sentinel per label is appended before counting and subtracted again
# afterwards, so labels that never occur still get a row with count 0.
tmp = (
    pd.Series(raw + ["crashed", "passed", "failed"])
    .value_counts()
    .sub(1)
    .sort_index()
    .to_frame("abs")
    # Share of each outcome in percent, rounded to two decimals.
    .assign(rel=lambda frame: (frame["abs"] / frame["abs"].sum() * 100).round(2))
)
tmp

## Look at crashed test cases

In [None]:
# Failed: neither passed nor crashed. Crashed results are kept in their own list.
FAILED_RES = [result for result in RES if not (result.passed or result.crashed)]
CRASHED_RES = [result for result in RES if result.crashed]

print(f"{len(FAILED_RES)} out of {len(RES)} tests failed.")
print(f"{len(CRASHED_RES)} out of {len(RES)} tests crashed.")

What kinds of errors occurred?

In [None]:
# Count crashed tests by the last line of their first error message (after
# stripping surrounding whitespace). A crashed result without any recorded
# message gets a placeholder label instead of raising IndexError.
pd.Series(
    [
        it.error_messages[0].strip().rsplit("\n", 1)[-1]
        if it.error_messages
        else "<no error message recorded>"
        for it in CRASHED_RES
    ]
).value_counts()

Look at individual crashed tests.

In [None]:
# Cursor into CRASHED_RES. It starts one position before the first entry
# because the browsing cell increments it before displaying anything.
current = -1

In [None]:
# Run this cell multiple times!
# Every run moves the cursor one step forward and renders that crashed result.
current = current + 1

if current >= len(CRASHED_RES):
    print("No more crashed tests.")
else:
    print(current)
    CRASHED_RES[current].print_markdown()

## Look at failed test cases

In [None]:
# Cursor into FAILED_RES. It starts one position before the first entry
# because the browsing cell increments it before displaying anything.
current = -1

In [None]:
# Run this cell multiple times!
# Every run moves the cursor one step forward and renders that failed result.
current = current + 1

if current >= len(FAILED_RES):
    print("No more failed tests.")
else:
    print(current)
    FAILED_RES[current].print_markdown()