In [None]:
from dotenv import load_dotenv
from pydantic import BaseModel
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.evaluation.evaluator import (
    SingleOutputEvaluationLogic,
)
from typing import Iterable
from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
import numpy as np

# Load environment variables from a local `.env` file, if one is present (python-dotenv).
load_dotenv()

# How to implement logic for the evaluation

1. Check the dataset you are using for the `Example` data types "`Input`" and "`ExpectedOutput`" and the `Task` for the `Output` data type.
2. Create an `Evaluation` type that contains the domain-specific evaluation result for a single `Example`.
3. Decide if you want to use a single `Output` per `Example`, or multiple, during your evaluation to generate your evaluation results.
   - For single, we recommend implementing a `SingleOutputEvaluationLogic`.
   - For multiple, implement an `EvaluationLogic`.
4. Implement the evaluation logic in the `do_evaluate_single_output` or `do_evaluate` methods for `SingleOutputEvaluationLogic` or `EvaluationLogic` respectively.

### Example
In the following example, we want to evaluate a story-generating task that generates a story about a given topic with a targeted word count.

In [None]:
# Step 1 - This is only redefined here for completeness. Normally these would be imported.
# Note that we do not have an ExpectedOutput here
class StoryTaskInput(BaseModel):
    """Input of the story-generating task."""

    # Topic the story should be about.
    topic: str
    # Number of words the generated story is supposed to have.
    targeted_word_count: int


class StoryTaskOutput(BaseModel):
    """Output of the story-generating task."""

    # The generated story text.
    story: str


# Step 2 - We want to analyze if the word count is accurate
class StoryEvaluation(BaseModel):
    """Evaluation result for a single example of the story task."""

    # Signed deviation of the story's word count from the targeted word count.
    word_count_off_by: int


class StoryEvaluationLogic(
    # Step 3 - We only need a single output to analyze the word count
    SingleOutputEvaluationLogic[StoryTaskInput, StoryTaskOutput, None, StoryEvaluation]
):
    """Measures how far a generated story's length is from the targeted word count."""

    def do_evaluate_single_output(
        self, example: Example[StoryTaskInput, None], output: StoryTaskOutput
    ) -> StoryEvaluation:
        """Compare the story's word count with the target given in the example input.

        Args:
            example: Example whose input carries the targeted word count.
            output: Task output containing the generated story.

        Returns:
            A `StoryEvaluation` holding the actual word count minus the targeted
            word count (positive when the story is too long, negative when too short).
        """
        # Step 4 - Implement the domain-specific logic
        # Words are the whitespace-separated tokens of the story.
        words = output.story.split()
        deviation = len(words) - example.input.targeted_word_count
        return StoryEvaluation(word_count_off_by=deviation)

# How to implement logic for the aggregation

0. Implement the evaluation logic for your use-case.
1. Create an `AggregatedEvaluation` type that contains the domain-specific aggregated data for the evaluation.
2. Implement an `AggregationLogic` for your data types.
3. Implement the domain-specific logic in the `aggregate` method.

### Example
In the following example, we calculate basic statistics on how far off the word count is, based on the example above.

In [None]:
# Step 0 - See the example above


# Step 1
class StoryAggregation(BaseModel):
    """Aggregated statistics of the word-count deviation across evaluated examples."""

    # Mean of the per-example word-count deviations.
    wc_off_mean: float
    # Median of the per-example deviations. This must be a float: the median of an
    # even number of integers can be fractional (e.g. 1.5), which pydantic v2
    # rejects for an `int` field and pydantic v1 would silently truncate.
    wc_off_median: float
    # Standard deviation of the per-example deviations.
    wc_off_std: float


# Step 2
class StoryAggregationLogic(AggregationLogic[StoryEvaluation, StoryAggregation]):
    """Summarizes per-example word-count deviations into basic statistics."""

    def aggregate(self, evaluations: Iterable[StoryEvaluation]) -> StoryAggregation:
        """Compute mean, median and standard deviation of the word-count deviations.

        Args:
            evaluations: Per-example evaluation results; consumed once.

        Returns:
            A `StoryAggregation` built from `np.mean`, `np.median` and `np.std`
            of the `word_count_off_by` values.
        """
        # Step 3
        # Materialize the iterable once so all three statistics see the same data.
        deviations = np.array([item.word_count_off_by for item in evaluations])
        return StoryAggregation(
            wc_off_mean=np.mean(deviations),
            wc_off_median=np.median(deviations),
            wc_off_std=np.std(deviations),
        )