In [None]:
from collections.abc import Iterable

import numpy as np
from dotenv import load_dotenv
from pydantic import BaseModel

from intelligence_layer.evaluation import (
    AggregationLogic,
    Example,
    SingleOutputEvaluationLogic,
)

# Load environment variables from a local `.env` file, if one exists.
# NOTE(review): nothing in the cells below reads these variables; presumably
# they are kept for notebooks that need credentials - confirm before removing.
load_dotenv()

# How to implement logic for the evaluation

1. Determine the data types you need for the evaluation:
   - An `Example` of the dataset you are using defines the `Input` and `ExpectedOutput` data types.
   - The task you are using defines the `Output` data type.
2. Create an `Evaluation` type that will contain the domain-specific evaluation result for a single `Example`.
3. Decide if you want to use a single `Output` per `Example`, or multiple outputs per example, during your evaluation to generate your evaluation results.
   - For a single output, we recommend implementing a `SingleOutputEvaluationLogic`.
   - For multiple outputs, implement an `EvaluationLogic`.
4. Implement the evaluation logic in the `do_evaluate_single_output` method for `SingleOutputEvaluationLogic` or in the `do_evaluate` method for `EvaluationLogic`.

### Example
In the following example, we want to evaluate a story-generating task that writes a story about a given topic with a targeted word count.

In [None]:
# Step 1 - This is only redefined here for completeness. Normally these would be imported.
# Note that we do not have an ExpectedOutput here.


class StoryTaskInput(BaseModel):
    """Input of the story-generating task that is evaluated in this example."""

    topic: str  # Topic the story is generated for.
    targeted_word_count: int  # Word count the generated story is aiming for.


class StoryTaskOutput(BaseModel):
    """Output of the story-generating task that is evaluated in this example."""

    story: str  # The generated story text.


# Step 2 - We want to analyze if the word count is accurate
class StoryEvaluation(BaseModel):
    """Domain-specific evaluation result for a single `Example`."""

    # Signed difference "actual word count - targeted word count":
    # positive if the story is too long, negative if it is too short.
    word_count_off_by: int


class StoryEvaluationLogic(
    # Step 3 - One output per example is enough to check the word count
    SingleOutputEvaluationLogic[
        StoryTaskInput, StoryTaskOutput, None, StoryEvaluation
    ]  # `None` fills the ExpectedOutput slot because this example has none
):
    """Evaluate how far a generated story misses its targeted word count."""

    def do_evaluate_single_output(
        self, example: Example[StoryTaskInput, None], output: StoryTaskOutput
    ) -> StoryEvaluation:
        """Compare the story's word count with the target given in the example.

        Args:
            example: The example whose input carries the targeted word count.
            output: The task output containing the generated story.

        Returns:
            A `StoryEvaluation` holding the signed difference
            "actual word count - targeted word count".
        """
        # Step 4 - Domain-specific logic: words are the whitespace-separated
        # chunks of the story.
        words = output.story.split()
        target = example.input.targeted_word_count
        return StoryEvaluation(word_count_off_by=len(words) - target)

# How to implement a logic for an aggregation

0. Implement the evaluation logic for your use-case. (see [above](#how-to-implement-logic-for-the-evaluation))
1. Create an `AggregatedEvaluation` type that will contain the domain-specific data aggregated from evaluations.
2. Implement an `AggregationLogic` for your data types.
   1. Implement the domain-specific logic in the `aggregate` method.

### Example
In the following example, we calculate basic statistics on the word count differences of the previous evaluation example.

In [None]:
# Step 0 - See the example above


# Step 1
class StoryAggregation(BaseModel):
    """Basic statistics over the `word_count_off_by` values of many evaluations."""

    wc_off_mean: float  # Arithmetic mean of the word count differences.
    # The median of an even number of integers can be fractional (e.g. the
    # median of 1 and 2 is 1.5), so this has to be a float. Declared as `int`,
    # such a value is rejected by pydantic v2 and silently truncated by v1.
    wc_off_median: float
    wc_off_std: float  # Standard deviation of the word count differences.


# Step 2
class StoryAggregationLogic(AggregationLogic[StoryEvaluation, StoryAggregation]):
    """Aggregate per-example word count differences into summary statistics."""

    def aggregate(self, evaluations: Iterable[StoryEvaluation]) -> StoryAggregation:
        """Compute mean, median and standard deviation of the word count differences.

        Args:
            evaluations: The per-example evaluation results to summarize.

        Returns:
            A `StoryAggregation` built from the three statistics.
        """
        # Step 2.1 - Collect all differences into one array so that NumPy can
        # compute the statistics.
        offsets = np.asarray([item.word_count_off_by for item in evaluations])
        return StoryAggregation(
            wc_off_mean=offsets.mean(),
            wc_off_median=np.median(offsets),
            wc_off_std=offsets.std(),  # NumPy default ddof=0: population std
        )