In [None]:
import random
from typing import Iterable

from pydantic import BaseModel

from intelligence_layer.core import Input, NoOpTracer, Task, TaskSpan
from intelligence_layer.evaluation import (
    Evaluation,
    Evaluator,
    InMemoryEvaluationRepository,
    SingleOutputEvaluationLogic,
)
from intelligence_layer.evaluation.aggregation.aggregator import (
    AggregationLogic,
    Aggregator,
)
from intelligence_layer.evaluation.aggregation.in_memory_aggregation_repository import (
    InMemoryAggregationRepository,
)
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (
    InMemoryDatasetRepository,
)
from intelligence_layer.evaluation.run.in_memory_run_repository import (
    InMemoryRunRepository,
)
from intelligence_layer.evaluation.run.runner import Runner


class DummyTask(Task[str, str]):
    """Toy task that fabricates a pseudo-random "story" for a given input.

    The story starts with the configured prompt and is followed by a fixed
    number of "<sentence opener> <word> " fragments sampled from small built-in
    vocabularies. Sampling is driven by a private random generator seeded from
    the model name and the input, so the same (model, prompt, input)
    combination yields the same story on every run, including across kernel
    restarts.

    Args:
        model: Name of the pretend model; only used as part of the seed.
        prompt: Text placed at the very start of every generated story.
    """

    # Vocabularies the story fragments are sampled from.
    _WORDS = (
        "apple",
        "banana",
        "car",
        "dog",
        "elephant",
        "fish",
        "goat",
        "hat",
        "igloo",
        "jacket",
    )
    _OPENERS = (
        "Once upon a time,",
        "In a land far, far away,",
        "Suddenly,",
        "One day,",
        "In the morning,",
    )
    # Number of "<opener> <word> " fragments appended after the prompt.
    _FRAGMENTS_PER_STORY = 10

    def __init__(self, model: str, prompt: str):
        self.model = model
        self.prompt = prompt

    def do_run(self, input: str, task_span: TaskSpan) -> str:
        """Generate the story for ``input``.

        Args:
            input: Text the story is generated for; together with the model
                name it determines the random choices.
            task_span: Tracing span handed in by the caller; not used here.

        Returns:
            The prompt, a space, and then the sampled fragments, each followed
            by a trailing space.
        """
        # Seed a private generator with a string: `random` hashes str seeds
        # deterministically, whereas the builtin hash() of a str is salted per
        # interpreter process and would change the story on every restart.
        # Using our own Random instance also leaves the global generator alone.
        rng = random.Random(f"{self.model}\x00{input}")

        # The opener is drawn before the word within each fragment.
        fragments = [
            f"{rng.choice(self._OPENERS)} {rng.choice(self._WORDS)} "
            for _ in range(self._FRAGMENTS_PER_STORY)
        ]
        return self.prompt + " " + "".join(fragments)


class DummyEvaluation(BaseModel):
    """Evaluation result for a single task output: two simple text metrics."""

    # Number of characters in the evaluated output (filled with len(output)
    # by DummyEvaluationLogic).
    text_length: int
    # Count of uppercase characters divided by the output length (as computed
    # by DummyEvaluationLogic).
    normalized_capital_count: float


class DummyEvaluationLogic(
    SingleOutputEvaluationLogic[str, str, None, DummyEvaluation]
):
    """Scores one task output by its length and its share of uppercase characters."""

    def do_evaluate_single_output(
        self, example: Example[str, None], output: str
    ) -> DummyEvaluation:
        """Compute the text metrics for a single output.

        Args:
            example: The dataset example the output was produced for; it is not
                consulted, the metrics depend on ``output`` only.
            output: The text produced by the task.

        Returns:
            A ``DummyEvaluation`` holding the character count of ``output`` and
            the fraction of its characters that are uppercase (0.0 for an
            empty output).
        """
        text_length = len(output)
        capital_count = sum(1 for char in output if char.isupper())
        return DummyEvaluation(
            text_length=text_length,
            # An empty output has nothing to normalise by; report 0.0 rather
            # than failing with ZeroDivisionError.
            normalized_capital_count=(
                capital_count / text_length if text_length else 0.0
            ),
        )


class DummyAggregatedEvaluation(BaseModel):
    """Aggregate over several DummyEvaluation results: the mean of each metric."""

    # Mean of `text_length` over the aggregated evaluations.
    avg_length: float
    # Mean of `normalized_capital_count` over the aggregated evaluations.
    avg_normalized_capital_count: float


class DummyAggregationLogic(
    AggregationLogic[DummyEvaluation, DummyAggregatedEvaluation]
):
    """Averages the metrics of individual ``DummyEvaluation`` results."""

    def aggregate(
        self, evaluations: Iterable[DummyEvaluation]
    ) -> DummyAggregatedEvaluation:
        """Compute the mean text length and mean normalized capital count.

        Args:
            evaluations: The per-output evaluations to aggregate; consumed once.

        Returns:
            A ``DummyAggregatedEvaluation`` with the arithmetic means of both
            metrics. When ``evaluations`` is empty both means are reported as
            0.0.
        """
        # Materialise once: the iterable may be a one-shot generator and is
        # needed both for the sums and for the count.
        eval_list = list(evaluations)
        if not eval_list:
            # Nothing to average; avoid dividing by zero.
            return DummyAggregatedEvaluation(
                avg_length=0.0,
                avg_normalized_capital_count=0.0,
            )

        count = len(eval_list)
        return DummyAggregatedEvaluation(
            avg_length=sum(e.text_length for e in eval_list) / count,
            avg_normalized_capital_count=sum(
                e.normalized_capital_count for e in eval_list
            )
            / count,
        )

In [None]:
# In-memory repositories used by the rest of the notebook, one per stage.
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()
aggregation_repository = InMemoryAggregationRepository()

# Two examples without an expected output.
# NOTE(review): `data` is forwarded to Example unchanged — confirm that
# Example actually declares such a field.
examples = [
    Example(input=text, expected_output=None, data=tag)
    for text, tag in (
        ("Some text", "data0"),
        ("Some other text", "data1"),
    )
]

# Store the examples as a dataset named "my-dataset".
dataset = dataset_repository.create_dataset(
    examples=examples, dataset_name="my-dataset"
)

In [None]:
# Instantiate the dummy task and hand it to a Runner together with the dataset
# and run repositories; "my_first_run" is the fourth positional argument
# (presumably the run description — confirm against the Runner signature).
dummy_task = DummyTask(model="my model", prompt="my prompt")
runner = Runner(dummy_task, dataset_repository, run_repository, "my_first_run")
# Run the task on the dataset created earlier, passing a NoOpTracer. As the
# last expression of the cell, the call's return value is displayed.
runner.run_dataset(dataset.id, NoOpTracer())

In [None]:
# Build an Evaluator from the dataset, run and evaluation repositories, the
# string "my_evaluation" (presumably a description — confirm against the
# Evaluator signature) and the evaluation logic defined in this notebook.
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "my_evaluation",
    DummyEvaluationLogic(),
)

# Evaluate every run id the run repository reports; the ids are unpacked into
# separate positional arguments. The return value is displayed as cell output.
evaluator.evaluate_runs(*run_repository.run_overview_ids())

In [None]:
# Build an Aggregator from the evaluation and aggregation repositories, the
# string "my_first_aggregation" (presumably a description — confirm against
# the Aggregator signature) and the aggregation logic defined in this notebook.
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    "my_first_aggregation",
    DummyAggregationLogic(),
)