In [None]:
import itertools
import random
import string
from collections.abc import Iterable

from pydantic import BaseModel

from intelligence_layer.core import Input, Task, TaskSpan
from intelligence_layer.evaluation import (
    AggregationLogic,
    Aggregator,
    Evaluation,
    Evaluator,
    Example,
    ExpectedOutput,
    InMemoryAggregationRepository,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
    SingleOutputEvaluationLogic,
    aggregation_overviews_to_pandas,
)


class DummyTask(Task[str, str]):
    """Toy task that assembles a pseudo-random "story" without calling an LLM.

    The returned text starts with `prompt` and is followed by ten
    "<sentence opener> <word>" fragments. Which fragments are picked depends
    only on the `model` name and the task input, so the same configuration
    always produces the same text.

    Args:
        model: Name of the (pretend) model; only used to vary the random seed.
        prompt: Text that is prepended verbatim to every generated story.
    """

    # Vocabulary the story fragments are drawn from.
    _WORDLIST = (
        "apple",
        "banana",
        "car",
        "dog",
        "elephant",
        "fish",
        "goat",
        "hat",
        "igloo",
        "jacket",
    )
    _SENTENCES = (
        "Once upon a time,",
        "In a land far, far away,",
        "Suddenly,",
        "One day,",
        "In the morning,",
    )

    def __init__(self, model: str, prompt: str):
        self.model = model
        self.prompt = prompt

    def do_run(self, input: str, task_span: TaskSpan) -> str:
        """Return the prompt followed by ten pseudo-random story fragments.

        Args:
            input: Input text; together with the model name it determines the seed.
            task_span: Tracing span supplied by the framework (unused here).

        Returns:
            The generated story, ending with a trailing space.
        """
        # Seed from the input and the model name (not the prompt). A string
        # seed is used instead of `hash(...)` because `hash()` of a `str` is
        # salted per interpreter process and therefore not reproducible.
        # A private generator also avoids reseeding the module-global `random`
        # state, which other code (or concurrently running tasks) may share.
        rng = random.Random(f"{self.model!r}:{input!r}")

        fragments = (
            f"{rng.choice(self._SENTENCES)} {rng.choice(self._WORDLIST)} "
            for _ in range(10)
        )
        return f"{self.prompt} " + "".join(fragments)


class DummyEvaluation(BaseModel):
    """Per-example metrics computed from a single task output."""

    # Number of characters in the output.
    text_length: int
    # Number of uppercase characters divided by the output length.
    normalized_capital_count: float


class DummyEvaluationLogic(
    SingleOutputEvaluationLogic[str, str, None, DummyEvaluation]
):
    """Scores a single output by its length and its share of uppercase characters."""

    def do_evaluate_single_output(
        self, example: Example[str, None], output: str
    ) -> DummyEvaluation:
        """Compute the metrics for one output.

        Args:
            example: The dataset example the output was produced for (unused;
                the metrics depend on the output alone).
            output: The text produced by the task.

        Returns:
            A `DummyEvaluation` holding the output length and the fraction of
            uppercase characters. The fraction is 0.0 for an empty output.
        """
        text_length = len(output)
        capital_count = sum(1 for char in output if char.isupper())
        # Guard against an empty output, which would otherwise divide by zero.
        normalized_capital_count = capital_count / text_length if text_length else 0.0
        return DummyEvaluation(
            text_length=text_length,
            normalized_capital_count=normalized_capital_count,
        )


class DummyAggregatedEvaluation(BaseModel):
    """Metrics averaged over all evaluations of one run."""

    # Mean of `DummyEvaluation.text_length`.
    avg_length: float
    # Mean of `DummyEvaluation.normalized_capital_count`.
    avg_normalized_capital_count: float


class DummyAggregationLogic(
    AggregationLogic[DummyEvaluation, DummyAggregatedEvaluation]
):
    """Averages the per-example metrics of `DummyEvaluation`."""

    def aggregate(
        self, evaluations: Iterable[DummyEvaluation]
    ) -> DummyAggregatedEvaluation:
        """Return the arithmetic means of all evaluation metrics.

        Args:
            evaluations: The individual evaluations to average.

        Returns:
            A `DummyAggregatedEvaluation` with the mean text length and the mean
            normalized capital count. Both are 0.0 if there are no evaluations.
        """
        # Materialize once: the iterable is traversed twice and needs a length.
        eval_list = list(evaluations)
        if not eval_list:
            # Nothing to average; report zeros instead of dividing by zero.
            return DummyAggregatedEvaluation(
                avg_length=0.0, avg_normalized_capital_count=0.0
            )

        count = len(eval_list)
        return DummyAggregatedEvaluation(
            avg_length=sum(e.text_length for e in eval_list) / count,
            avg_normalized_capital_count=sum(
                e.normalized_capital_count for e in eval_list
            )
            / count,
        )


# Build a small dataset of random-letter inputs.
def _random_input() -> str:
    """Return a string of 1 to 50 random ASCII letters."""
    length = random.randint(1, 50)
    return "".join(random.choices(string.ascii_letters, k=length))


# A fixed seed keeps the generated inputs identical between notebook runs.
random.seed(42)
examples = [Example(input=_random_input(), expected_output=None) for _ in range(10)]

# Store the examples in an in-memory repository; `dataset.id` is used for the runs below.
dataset_repository = InMemoryDatasetRepository()
dataset = dataset_repository.create_dataset(
    dataset_name="my-dataset",
    examples=examples,
)

# Optimizing Tasks by Comparing Aggregations

In this tutorial, we demonstrate how to optimize a given `Task` that depends on a `model` and a `prompt` parameter. This is done by evaluating each combination of parameters separately and then comparing the resulting aggregations.

In this scenario, our task does not depend on an LLM, for the sake of execution speed. However, the demonstrated principles generalize to other use cases.

## Setup

We assume the dataset, the `Task`, and the `DatasetRepository` are already given, so we can start by instantiating the remaining repositories, our `Evaluator`, and our `Aggregator`. The `EXPERIMENT_NAME` will later be used to identify the aggregations of interest. Therefore, we pass it to the `Evaluator` and `Aggregator` here, and later to the `Runner`.

In [None]:
# Shared name passed to the `Evaluator` and `Aggregator` below and to every
# `Runner` created in later cells.
# NOTE(review): presumably stored as the description of the produced overviews — confirm against the library.
EXPERIMENT_NAME = "compare-tasks"

# The dataset repository already exists as `dataset_repository`; only the
# repositories for runs, evaluations and aggregations still need to be created.
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()
aggregation_repository = InMemoryAggregationRepository()

# Evaluator wired to the dataset, run and evaluation repositories; it applies
# `DummyEvaluationLogic` to the outputs.
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    EXPERIMENT_NAME,
    DummyEvaluationLogic(),
)

# Aggregator wired to the evaluation and aggregation repositories; it applies
# `DummyAggregationLogic` to the evaluations.
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    EXPERIMENT_NAME,
    DummyAggregationLogic(),
)

## Running Experiments for Different Configurations

In [None]:
# Parameter grid: every model is combined with every prompt.
model_list = ["model a", "model b", "model c"]
prompt_list = [
    "A nice story starts with:",
    "Some kind of prompt",
    "No prompt at all",
    "OPTIMIZING PROMPTS IS HARD TO DO",
]

# Every run, evaluation and aggregation is tagged with this label so the
# results of this experiment can be filtered out later.
label = "dummy_label"
labels = {label}

# Run, evaluate and aggregate the `Task` once per parameter combination.
# Note that this can be **very** expensive for large sets of parameters.
for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):
    # Model and prompt go into the metadata to identify the configuration of
    # the current experiment; the same keyword arguments are used for all steps.
    overview_kwargs = {
        "metadata": {"model": model, "prompt": prompt},
        "description": f"Evaluate dummy task {i}",
        "labels": labels,
    }

    runner = Runner(
        DummyTask(model=model, prompt=prompt),
        dataset_repository,
        run_repository,
        EXPERIMENT_NAME,
    )
    run_overview = runner.run_dataset(dataset.id, **overview_kwargs)
    eval_overview = evaluator.evaluate_runs(run_overview.id, **overview_kwargs)
    aggregator.aggregate_evaluation(eval_overview.id, **overview_kwargs)

## Comparison of Different Configurations



In [None]:
# Fetch all stored aggregation overviews of our aggregation type ...
all_overviews = aggregation_repository.aggregation_overviews(
    aggregation_type=DummyAggregatedEvaluation
)
# ... and keep only those carrying the label `dummy_label`.
# Filtering can also be done on description and/or metadata.
aggregations_of_interest = [
    overview for overview in all_overviews if label in overview.labels
]

# Turn the selected aggregations into a pandas dataframe.
formated_aggregations = aggregation_overviews_to_pandas(aggregations_of_interest)

# Show the available columns to pick the ones of interest.
formated_aggregations.columns

The following steps are very specific to the experiment setup and mostly rely on standard pandas knowledge. They are just one example of how one might analyze the configurations.

In [None]:
# Names of the aggregated metrics, taken from the fields of the aggregation model.
aggregation_fields = list(DummyAggregatedEvaluation.model_fields)

# Keep only the configuration columns and the metric columns.
columns_of_interest = ["model", "prompt", *aggregation_fields]
formated_aggregations = formated_aggregations[columns_of_interest]

# Show the configurations ordered by descending capital-letter share.
ranked_aggregations = formated_aggregations.sort_values(
    by="avg_normalized_capital_count", ascending=False
)
display(ranked_aggregations)

In [None]:
# One box per prompt, showing how the capital-letter share spreads across models.
capital_count_by_prompt = formated_aggregations.pivot(
    index="model", columns="prompt", values="avg_normalized_capital_count"
)
capital_count_by_prompt.plot(kind="box", rot=90, title="avg_normalized_capital_count")

# One box per model, showing how the average length spreads across prompts.
length_by_model = formated_aggregations.pivot(
    index="prompt", columns="model", values="avg_length"
)
length_by_model.plot(kind="box", title="avg_length")

# End the cell with a statement so the notebook does not echo the plot object.
pass

With these results, it's easy to see which prompt is best for optimizing our score! The model, on the other hand, does not seem to have a big impact on our metrics.

## Adding Different Run Configurations

In [None]:
# Extend the run configuration by one more model.
model_list.append("model d")

# `run_is_already_computed` lets us run only the new model `model d` with the existing prompts.
# Earlier runs/evaluations/aggregations are skipped and keep their old results, unless the metadata of the run has changed.
for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):
    # Model and prompt are stored in the metadata to specify the configuration of the current experiment
    metadata = {"model": model, "prompt": prompt}
    runner = Runner(
        DummyTask(model=model, prompt=prompt),
        dataset_repository,
        run_repository,
        EXPERIMENT_NAME,
    )
    if runner.run_is_already_computed(metadata):
        # This configuration was handled before; nothing to do.
        continue

    overview_kwargs = {
        "metadata": metadata,
        "description": f"Evaluate dummy task {i}",
        "labels": labels,
    }
    run_overview = runner.run_dataset(dataset.id, **overview_kwargs)
    eval_overview = evaluator.evaluate_runs(run_overview.id, **overview_kwargs)
    aggregator.aggregate_evaluation(eval_overview.id, **overview_kwargs)

# Print the number of run/evaluation/aggregation overviews.
# The first 12 configurations (3 models times 4 prompts) were not recomputed, so we expect 16 of each in total.
# Without the `run_is_already_computed` check we would get 28 runs/evaluations/aggregations!
run_count = len(list(run_repository.run_overviews()))
print(f"Number of Runs: {run_count}")

evaluation_count = len(list(evaluation_repository.evaluation_overviews()))
print(f"Number of Evaluations: {evaluation_count}")

aggregation_count = len(
    list(aggregation_repository.aggregation_overviews(DummyAggregatedEvaluation))
)
print(f"Number of Aggregations: {aggregation_count}")