In [None]:
from intelligence_layer.connectors import (
    DefaultArgillaClient,
    Field,
    Question,
    RecordData,
    ArgillaEvaluation,
)
from intelligence_layer.evaluation import (
    ArgillaEvaluationRepository,
    InMemoryEvaluationRepository,
    ArgillaEvaluationLogic,
    SuccessfulExampleOutput,
    ArgillaAggregator,
    Example,
    RecordDataSequence,
    AggregationLogic,
    InMemoryAggregationRepository,
)
from pydantic import BaseModel

from typing import Iterable

from dotenv import load_dotenv

load_dotenv()

# How to evaluate with human evaluation via Argilla
1. Initialize an Argilla client with the correct settings for your setup
   - By default, the URL and API key are read from the environment variables `ARGILLA_API_URL` and `ARGILLA_API_KEY`.
2. Create `Question`s and `Field`s to structure the data that will be displayed in Argilla
3. Choose an Argilla workspace and get its ID
4. Create an `ArgillaEvaluationRepository`
5. Implement an `ArgillaEvaluationLogic`
6. Submit tasks to the Argilla instance by running the `ArgillaEvaluator`
   - Make sure to save the `EvaluationOverview.id`, as it is needed to retrieve the results later
7. **Use the Argilla web platform to evaluate** 

### Example

In [None]:
# Step 1
# Create the Argilla client. No arguments are passed here, so the connection
# settings have to come from the environment (see the commented-out options).
client = DefaultArgillaClient(
    # api_url="your url here",     # not necessary if ARGILLA_API_URL is set in environment
    # api_key="your api key here", # not necessary if ARGILLA_API_KEY is set in environment
)

# Step 2
# Questions describe what the human annotators are asked in Argilla.
# The question `name` ("rating") is the key under which the aggregation logic
# further down in this notebook looks up the answer.
questions = [
    Question(
        name="rating",
        title="Funniness",
        description="How funny do you think is the joke? Rate it from 1-5.",
        options=range(1, 6),  # the integers 1 to 5 (the upper bound of `range` is exclusive)
    )
]
# Fields describe the pieces of data shown to the annotators.
# The field `name`s ("input", "output") are the keys of `RecordData.content`
# built in `CustomArgillaEvaluationLogic._to_record`.
fields = [
    Field(name="input", title="Topic"),
    Field(name="output", title="Joke"),
]

# Step 3
# Look up the ID of the workspace by name
# (presumably the workspace is created if it does not exist yet — confirm against the client).
workspace_id = client.ensure_workspace_exists("my-workspace-name")

# Step 4
# The Argilla repository wraps another evaluation repository (here an in-memory one)
# and is additionally given the client, the workspace and the fields/questions from Step 2.
data_storage = (
    InMemoryEvaluationRepository()
)  # Use FileEvaluationRepository for persistent results
evaluation_repository = ArgillaEvaluationRepository(
    data_storage, client, workspace_id, fields, questions
)


# Step 5
class StoryTaskInput(BaseModel):  # Should already be implemented in your task
    """Input of the example story task whose runs are evaluated in this notebook."""

    # Topic of the story; shown to annotators as the "input" field.
    topic: str
    # Not read by the evaluation logic in this notebook.
    targeted_word_count: int


class StoryTaskOutput(BaseModel):  # Should already be implemented in your task
    """Output of the example story task whose runs are evaluated in this notebook."""

    # Generated text; shown to annotators as the "output" field.
    story: str


class CustomArgillaEvaluationLogic(
    ArgillaEvaluationLogic[StoryTaskInput, StoryTaskOutput, None]
):
    """Converts story-task examples and their outputs into Argilla records.

    The third type argument is ``None`` because the examples have no expected output.
    """

    def _to_record(
        self,
        example: Example[StoryTaskInput, None],
        *output: SuccessfulExampleOutput[StoryTaskOutput],
    ) -> RecordDataSequence:
        """Build one record per successful output of ``example``.

        Args:
            example: The example whose topic is shown as the "input" field.
            *output: The successful outputs produced for this example;
                each story is shown as the "output" field of its own record.

        Returns:
            A `RecordDataSequence` with one `RecordData` per element of ``output``.
        """
        records = []
        for successful_output in output:
            # The content keys have to match the `name` of the `Field`s.
            content = {
                "input": example.input.topic,
                "output": successful_output.output.story,
            }
            records.append(RecordData(content=content, example_id=example.id))
        return RecordDataSequence(records=records)


evaluation_logic = CustomArgillaEvaluationLogic()

In [None]:
%%script false --no-raise-error
# we skip this as we do not have a dataset or run in this example

# Step 6
# NOTE(review): `ArgillaEvaluator` is not among the names imported at the top of this
# notebook; it has to be imported before this cell can actually be run.
runs_to_evaluate = ["your_run_id_of_interest", "other_run_id_of_interest"]

# The leading `...` is a placeholder for the arguments this example does not set up
# (presumably the dataset and run repositories — confirm against the `ArgillaEvaluator` signature).
evaluator = ArgillaEvaluator(
    ..., evaluation_repository, description="My evaluation description", evaluation_logic=evaluation_logic
)
# Evaluate the selected runs; the resulting overview carries the ID printed below.
evaluation_overview = evaluator.evaluate_runs(*runs_to_evaluate)
# Keep this ID: it is what the aggregation step needs to find this evaluation again.
print("ID to retrieve results later: ", evaluation_overview.id)

# Step 7

####################################
# Evaluate via the Argilla UI here #
####################################


```python
```

# How to aggregate an Argilla evaluation
0. Submit tasks to Argilla and perform an evaluation (see [here](#how-to-evaluate-with-human-evaluation-via-argilla)).
1. Implement an `AggregationLogic` that takes `ArgillaEvaluation`s as input.
2. Note the ID of the evaluation that you want to aggregate and the name of the Argilla workspace it was submitted to.
3. Initialize the `ArgillaEvaluationRepository` and an aggregation repository.
4. Aggregate the results with an `ArgillaAggregator`.

In [None]:
# Step 1
class CustomArgillaAggregation(BaseModel):
    """Result of aggregating the human ratings of one Argilla evaluation."""

    # Arithmetic mean of all "rating" values of the aggregated evaluations.
    avg_funniness: float


class CustomArgillaAggregationLogic(
    AggregationLogic[ArgillaEvaluation, CustomArgillaAggregation]
):
    """Averages the "rating" values of the evaluations retrieved from Argilla."""

    def aggregate(
        self, evaluations: Iterable[ArgillaEvaluation]
    ) -> CustomArgillaAggregation:
        """Compute the mean rating over all given evaluations.

        Args:
            evaluations: The human evaluations to aggregate. May be any iterable
                (including a one-shot iterator); it is consumed exactly once.

        Returns:
            A `CustomArgillaAggregation` holding the average rating, or an
            average of ``0.0`` if there are no evaluations at all (e.g. nothing
            has been rated in Argilla yet), instead of failing with a
            `ZeroDivisionError`.
        """
        # Materialize the ratings once so that both the sum and the count are available.
        # NOTE(review): confirm that the answers are exposed via `metadata` and not via
        # another attribute of `ArgillaEvaluation` (e.g. `responses`).
        ratings = [
            evaluation.metadata[
                "rating"
            ]  # This name is defined by the `Question`s given to the Argilla repository during submission
            for evaluation in evaluations
        ]
        if not ratings:
            # Nothing to average: avoid dividing by zero.
            return CustomArgillaAggregation(avg_funniness=0.0)
        return CustomArgillaAggregation(avg_funniness=sum(ratings) / len(ratings))


aggregation_logic = CustomArgillaAggregationLogic()

# Step 2 - See the first example for more info
# Placeholder: replace with the ID of the evaluation that was submitted to Argilla.
eval_id = "my-previous-eval-id"
# A fresh client and workspace lookup, so this example does not rely on the
# objects created in the submission example above.
client = DefaultArgillaClient()
workspace_id = client.ensure_workspace_exists("my-workspace-name")

# Step 3
# Unlike in the submission example, no fields/questions are passed here.
evaluation_repository = ArgillaEvaluationRepository(
    InMemoryEvaluationRepository(), client, workspace_id
)
# Use a different aggregation repository if the results should outlive the kernel
# (in-memory storage is lost on restart).
aggregation_repository = InMemoryAggregationRepository()

# Step 4
# Wire the repositories, a free-text description and the custom logic together.
aggregator = ArgillaAggregator(
    evaluation_repository,
    aggregation_repository,
    "My aggregation description",
    aggregation_logic,
)

In [None]:
%%script false --no-raise-error
# we skip this as we do not have a dataset or run in this example

# Aggregate the evaluation identified by `eval_id` using the aggregator configured above.
aggregation = aggregator.aggregate_evaluation(eval_id)