In [None]:
from documentation.how_tos.example_data import (
    DummyAggregationLogic,
    DummyEvaluation,
    DummyExample,
    DummyTask,
)
from intelligence_layer.evaluation import (
    Aggregator,
    IncrementalEvaluator,
    InMemoryAggregationRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (
    InMemoryDatasetRepository,
)
from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
    IncrementalEvaluationLogic,
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput

# How to implement complete incremental evaluation workflows from running (multiple) tasks to aggregation
This notebook outlines how to:
 - run multiple tasks and configurations on the same dataset
 - perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation
 - run aggregation on these evaluations
    
## Step-by-Step Guide
1. Setup:
- Initialize all necessary repositories: 
  - dataset
  - run
  - evaluation
  - aggregation
- Create dataset from example(s)
- Initialize task(s)
- Initialize `Runner` for each task 
2. Run task(s) for the dataset (see [here](./how_to_run_a_task_on_a_dataset.ipynb))
3. Compose a list of IDs of runs you want to evaluate.
4. Define and initialize an `IncrementalEvaluationLogic`. This is similar to a normal `EvaluationLogic` (see [here](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)), but you also have to implement your own `do_incremental_evaluate` method.
5. Initialize an `IncrementalEvaluator` with the repositories and your custom `IncrementalEvaluationLogic`
6. Call the `evaluate_runs` method of the `IncrementalEvaluator` to evaluate the run(s) and create a single `EvaluationOverview`
7. Aggregate your evaluation of the run(s) using the [standard aggregation](./how_to_aggregate_evaluations.ipynb) or using a [custom aggregation logic](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)

#### Steps for addition of new runs 
8. Define and run some new task(s)
9. Define a list of IDs of the previous evaluations whose runs should not be re-evaluated
10. Call the `evaluate_additional_runs` method of the `IncrementalEvaluator`:
 - `run_ids`: Runs to be included in the evaluation results, including those that have been evaluated before
 - `previous_evaluation_ids`: IDs of previous evaluations whose runs are **not** to be re-evaluated, depending on the specific implementation of the `do_incremental_evaluate` method
11. Aggregate all your `EvaluationOverview`s in your `EvaluationRepository`

In [None]:
# Preparation: a single toy example is enough to demonstrate the workflow
examples = [
    DummyExample(input="input1", expected_output="expected_output1", data="data1"),
]

# Step 1: one repository per stage of the workflow
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()
aggregation_repository = InMemoryAggregationRepository()

my_dataset = dataset_repository.create_dataset(examples, "MyDataset")

first_task = DummyTask()
first_runner = Runner(
    first_task,
    dataset_repository,
    run_repository,
    "MyFirstRun",
)

# Step 2: run the first task on the dataset
first_run_overview = first_runner.run_dataset(my_dataset.id)
print(f"ID of first run: {first_run_overview.id}")

# Step 3: collect the IDs of the runs to evaluate.
# The `if` clause is the filter selecting the runs you want to include.
run_overview_ids_for_first_evaluation = [
    overview.id
    for overview in run_repository.run_overviews()
    if overview.description == "MyFirstRun"
]


# Step 4
class DummyIncrementalEvaluationLogic(
    IncrementalEvaluationLogic[str, str, str, DummyEvaluation]
):
    """Toy incremental evaluation logic used to demonstrate the workflow.

    No real scoring happens here: the arguments are only rendered into the
    ``eval`` string of a ``DummyEvaluation`` so the data flow stays visible.
    """

    def do_incremental_evaluate(
        self,
        example: Example[str, str],
        outputs: list[SuccessfulExampleOutput[str]],
        already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
    ) -> DummyEvaluation:
        """Render the example and its outputs into a ``DummyEvaluation``.

        Args:
            example: Example whose ``input`` and ``expected_output`` are
                written into the evaluation string.
            outputs: Outputs whose ``output`` values are joined with ", "
                and wrapped in parentheses.
            already_evaluated_outputs: Formatted verbatim into the evaluation
                string; presumably the outputs covered by previous
                evaluations (verify against the framework).

        Returns:
            A ``DummyEvaluation`` whose ``eval`` field holds the rendered string.
        """
        joined_outputs = ", ".join(single.output for single in outputs)
        output_str = f"({joined_outputs})"
        rendered = f"{example.input}, {example.expected_output}, {output_str}, {already_evaluated_outputs} -> evaluation"
        return DummyEvaluation(eval=rendered)


# Instantiate the custom evaluation logic defined above
incremental_evaluation_logic = DummyIncrementalEvaluationLogic()

# Step 5: wire the repositories and the custom logic into the evaluator
incremental_evaluator = IncrementalEvaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "My incremental evaluation",
    incremental_evaluation_logic,
)

# Step 6: evaluate the run(s) selected in Step 3
evaluation_overview_first_task = incremental_evaluator.evaluate_runs(
    *run_overview_ids_for_first_evaluation
)

# Step 7: aggregate every evaluation overview currently in the repository
aggregation_logic = DummyAggregationLogic()
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    "MyAggregator",
    aggregation_logic,
)
evaluation_ids_to_aggregate = evaluation_repository.evaluation_overview_ids()
first_aggregation_overview = aggregator.aggregate_evaluation(
    *evaluation_ids_to_aggregate
)
print(f"First aggregation: {first_aggregation_overview}")

In [None]:
## Addition of new task/run
# Step 8: run a second task on the same dataset
second_task = DummyTask()
second_runner = Runner(
    second_task,
    dataset_repository,
    run_repository,
    "MySecondRun",
)
second_run_overview = second_runner.run_dataset(my_dataset.id)
print(f"ID of second run: {second_run_overview.id}")

# Step 9: IDs of the evaluations performed so far
# (these are evaluation overview IDs, not run IDs)
previous_evaluation_ids = evaluation_repository.evaluation_overview_ids()

# Step 10: pass every run in the repository together with the previous evaluations
all_run_ids = run_repository.run_overview_ids()
incremental_evaluator.evaluate_additional_runs(
    *all_run_ids,
    previous_evaluation_ids=previous_evaluation_ids,
)

# Step 11: aggregate every evaluation overview now in the repository
evaluation_ids_to_aggregate = evaluation_repository.evaluation_overview_ids()
second_aggregation_overview = aggregator.aggregate_evaluation(
    *evaluation_ids_to_aggregate
)
print(second_aggregation_overview)