In [None]:
import itertools
import random
import string
from typing import Iterable

from pydantic import BaseModel

from intelligence_layer.core import Input, Task, TaskSpan
from intelligence_layer.evaluation import (
    Evaluation,
    Evaluator,
    InMemoryEvaluationRepository,
    SingleOutputEvaluationLogic,
)
from intelligence_layer.evaluation.aggregation.aggregator import (
    AggregationLogic,
    Aggregator,
)
from intelligence_layer.evaluation.aggregation.in_memory_aggregation_repository import (
    InMemoryAggregationRepository,
)
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (
    InMemoryDatasetRepository,
)
from intelligence_layer.evaluation.run.in_memory_run_repository import (
    InMemoryRunRepository,
)
from intelligence_layer.evaluation.run.runner import Runner


class DummyTask(Task[str, str]):
    """Toy task that fabricates a pseudo-random "story" for a given input.

    The story is the configured prompt followed by ten randomly chosen
    sentence-opener/word pairs. The random choices depend only on the model
    name and the input, so the same (model, input) pair yields the same
    choices on every run.
    """

    # Vocabulary the story is assembled from.
    _WORDS = (
        "apple",
        "banana",
        "car",
        "dog",
        "elephant",
        "fish",
        "goat",
        "hat",
        "igloo",
        "jacket",
    )
    _SENTENCE_OPENERS = (
        "Once upon a time,",
        "In a land far, far away,",
        "Suddenly,",
        "One day,",
        "In the morning,",
    )

    def __init__(self, model: str, prompt: str) -> None:
        """Store the model name (used for seeding) and the prompt (story prefix)."""
        self.model = model
        self.prompt = prompt

    def do_run(self, input: str, task_span: TaskSpan) -> str:
        """Build the story for ``input``.

        Args:
            input: Text that, together with the model name, seeds the random choices.
            task_span: Unused by this dummy implementation.

        Returns:
            The prompt followed by ten "<sentence opener> <word>" pairs, all
            separated by single spaces and terminated by a trailing space.
        """
        # Seed a private generator with a string instead of the built-in hash():
        # str hashes are salted per interpreter process, which made the output
        # change between kernel restarts. A local generator also leaves the
        # global `random` state untouched.
        rng = random.Random(repr((self.model, input)))

        fragments = [self.prompt]
        for _ in range(10):
            # Draw the sentence opener first, then the word, for every pair.
            sentence = rng.choice(self._SENTENCE_OPENERS)
            word = rng.choice(self._WORDS)
            fragments.append(f"{sentence} {word}")
        return " ".join(fragments) + " "


class DummyEvaluation(BaseModel):
    """Per-example metrics computed from a single task output."""

    # Number of characters in the output.
    text_length: int
    # Count of uppercase characters divided by the output length.
    normalized_capital_count: float


class DummyEvaluationLogic(
    SingleOutputEvaluationLogic[str, str, None, DummyEvaluation]
):
    """Evaluation logic that measures output length and share of capital letters."""

    def do_evaluate_single_output(
        self, example: Example[str, None], output: str
    ) -> DummyEvaluation:
        """Compute length-based metrics for one output.

        Args:
            example: The dataset example the output belongs to; not used here.
            output: The text produced by the task.

        Returns:
            A ``DummyEvaluation`` holding the character count of ``output`` and
            the fraction of its characters that are uppercase (0.0 for an
            empty output).
        """
        text_length = len(output)
        capital_count = sum(1 for char in output if char.isupper())
        # An empty output has no characters to normalize by; report 0.0
        # instead of raising ZeroDivisionError.
        normalized_capital_count = capital_count / text_length if text_length else 0.0
        return DummyEvaluation(
            text_length=text_length,
            normalized_capital_count=normalized_capital_count,
        )


class DummyAggregatedEvaluation(BaseModel):
    """Averages of the per-example metrics over a set of evaluations."""

    # Mean of `text_length` over the aggregated evaluations.
    avg_length: float
    # Mean of `normalized_capital_count` over the aggregated evaluations.
    avg_normalized_capital_count: float


class DummyAggregationLogic(
    AggregationLogic[DummyEvaluation, DummyAggregatedEvaluation]
):
    """Aggregation logic that averages the metrics of ``DummyEvaluation`` objects."""

    def aggregate(
        self, evaluations: Iterable[DummyEvaluation]
    ) -> DummyAggregatedEvaluation:
        """Average text length and normalized capital count.

        Args:
            evaluations: The per-example evaluations to aggregate.

        Returns:
            A ``DummyAggregatedEvaluation`` with the arithmetic means of both
            metrics; both means are 0.0 when ``evaluations`` is empty.
        """
        # Materialize once: the iterable may be a one-shot iterator and is
        # needed for both the sums and the count.
        eval_list = list(evaluations)
        if not eval_list:
            # Nothing to average; avoid dividing by zero.
            return DummyAggregatedEvaluation(
                avg_length=0.0, avg_normalized_capital_count=0.0
            )

        count = len(eval_list)
        total_length = sum(evaluation.text_length for evaluation in eval_list)
        total_normalized_capitals = sum(
            evaluation.normalized_capital_count for evaluation in eval_list
        )
        return DummyAggregatedEvaluation(
            avg_length=total_length / count,
            avg_normalized_capital_count=total_normalized_capitals / count,
        )

In [None]:
# Name passed to the evaluator and runners, and used as the prefix of the
# aggregation descriptions.
EXPERIMENT_NAME = "compare-tasks"

# Fix the global RNG so the generated example inputs are the same on every run.
random.seed(42)

# Ten examples whose input is a random string of 1-50 ASCII letters and which
# carry no expected output.
examples = []
for _ in range(10):
    input_length = random.randint(1, 50)
    random_letters = random.choices(string.ascii_letters, k=input_length)
    examples.append(Example(input="".join(random_letters), expected_output=None))

# In-memory repositories for the dataset, runs, evaluations and aggregations.
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()
aggregation_repository = InMemoryAggregationRepository()

dataset = dataset_repository.create_dataset(
    examples=examples,
    dataset_name="my-dataset",
)

evaluation_logic = DummyEvaluationLogic()
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    EXPERIMENT_NAME,
    evaluation_logic,
)

In [None]:
model_list = ["model a", "model b", "model c"]
prompt_list = [
    "A nice story starts with:",
    "Some kind of prompt",
    "No prompt at all",
    "OPTIMIZING PROMPTS IS HARD TO DO",
]

# Run, evaluate and aggregate every (model, prompt) combination, models in the
# outer loop and prompts in the inner loop.
for model in model_list:
    for prompt in prompt_list:
        # The "|"-delimited description carries model and prompt so they can
        # be split apart again when the results are tabulated.
        description = f"|{model}|{prompt}|"

        dummy_task = DummyTask(model=model, prompt=prompt)
        runner = Runner(
            dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME
        )
        run_overview = runner.run_dataset(dataset.id, description=description)

        eval_overview = evaluator.evaluate_runs(
            run_overview.id, description=description
        )

        aggregator = Aggregator(
            evaluation_repository,
            aggregation_repository,
            f"{EXPERIMENT_NAME}:{description}",
            DummyAggregationLogic(),
        )
        aggregator.aggregate_evaluation(eval_overview.id)

In [None]:
from intelligence_layer.evaluation.infrastructure.repository_navigator import (
    aggregation_overviews_to_pandas,
)

# Keep only the aggregation overviews whose description starts with this
# experiment's name.
aggregations_of_interest = [
    overview
    for overview in aggregation_repository.aggregation_overviews(
        aggregation_type=DummyAggregatedEvaluation
    )
    if overview.description.startswith(EXPERIMENT_NAME)
]
aggregation_frame = aggregation_overviews_to_pandas(aggregations_of_interest)

aggregation_fields = list(DummyAggregatedEvaluation.model_fields.keys())

# The description has the form "<experiment>:|<model>|<prompt>|", so splitting
# on "|" puts the model at position 1 and the prompt at position 2.
description_parts = aggregation_frame["description"].str.split("|", expand=True)

# Build the result in one step with `assign` (which works on a copy) instead of
# setting columns on a column subset and dropping in place; this avoids pandas'
# SettingWithCopyWarning and in-place mutation.
formated_aggregations = aggregation_frame[aggregation_fields].assign(
    model=description_parts[1],
    prompt=description_parts[2],
)

display(
    formated_aggregations.sort_values(
        by="avg_normalized_capital_count", ascending=False
    )
)

In [None]:
# One box per prompt, showing the spread of the capital-letter ratio across models.
capital_count_by_prompt = formated_aggregations.pivot(
    index="model", columns="prompt", values="avg_normalized_capital_count"
)
capital_count_by_prompt.plot.box(rot=90, title="avg_normalized_capital_count")

# One box per model, showing the spread of the average length across prompts.
length_by_model = formated_aggregations.pivot(
    index="prompt", columns="model", values="avg_length"
)
length_by_model.plot.box(title="avg_length")

# End on a statement so the cell does not echo the last expression's repr.
pass