In [None]:
import argilla
from dotenv import load_dotenv
from example_data import StoryTaskInput, StoryTaskOutput, argilla_example_data
from pydantic import BaseModel

from intelligence_layer.connectors import (
    ArgillaEvaluation,
    ArgillaWrapperClient,
    RecordData,
)
from intelligence_layer.evaluation import (
    ArgillaEvaluationLogic,
    ArgillaEvaluator,
    AsyncInMemoryEvaluationRepository,
    Example,
    RecordDataSequence,
    SuccessfulExampleOutput,
)

# Load variables from a local `.env` file into the process environment so that
# settings such as ARGILLA_API_URL / ARGILLA_API_KEY can be picked up below.
load_dotenv()

# How to evaluate with human evaluation via Argilla
1. Initialize an Argilla client with the correct settings for your setup
   - By default, the url and api key are read from the environment variables `ARGILLA_API_URL` and `ARGILLA_API_KEY`
2. Choose an Argilla workspace and get its ID
3. Create an `AsyncEvaluationRepository`
4. Define a new output type for the evaluation
5. Implement an `ArgillaEvaluationLogic`
   1. Create questions and fields to structure the data that will be displayed in Argilla
   2. Implement `to_record` to convert the example input and each task output into an Argilla record
   3. Implement `from_record` to convert the record back to an evaluation result
6. Submit tasks to the Argilla instance by running the `ArgillaEvaluator`
7. **Use the Argilla web platform to evaluate** 
8. Collect all labelled evaluations from Argilla
   - Make sure to save the `EvaluationOverview.id`, as it is needed to retrieve the results later

### Example

In [None]:
# Step 0 - Create a dataset and run a task on it
my_example_data = argilla_example_data()

# Step 1 - Create the Argilla client.
# Pass `api_url="your url here"` if ARGILLA_API_URL is not set in the environment,
# and `api_key="your api key here"` if ARGILLA_API_KEY is not set in the environment.
client = ArgillaWrapperClient()

# Step 2 - Make sure the workspace exists and keep the returned ID
workspace_id = client.ensure_workspace_exists("my-workspace-name")

# Step 3 - Results are kept in memory only.
# Use FileEvaluationRepository for persistent results.
evaluation_repository = AsyncInMemoryEvaluationRepository()


# Step 4
class FunnyOutputRating(BaseModel):
    """Evaluation result holding the funniness rating given to a single output."""

    # Value of the Argilla "rating" response; `from_record` stores 0 instead
    # when the annotator did not answer "YES" to the "on-topic" question.
    rating: int


# Step 5


class CustomArgillaEvaluationLogic(
    ArgillaEvaluationLogic[
        StoryTaskInput, StoryTaskOutput, None, FunnyOutputRating
    ]  # No expected output, therefore "None"
):
    """Human evaluation of generated stories via Argilla.

    Every run output is presented to the annotator as a "Topic"/"Joke" pair
    together with a 1-5 funniness rating question and a YES/NO on-topic
    question. The labelled record is turned into a `FunnyOutputRating`.
    """

    # Step 5.1
    def __init__(self):
        """Register the questions and fields that structure the Argilla records."""
        super().__init__(
            questions=[
                argilla.RatingQuestion(
                    name="rating",
                    title="Funniness",
                    description="How funny do you think is the joke? Rate it from 1-5.",
                    values=list(range(1, 6)),
                ),
                argilla.LabelQuestion(
                    name="on-topic",
                    title="Is the question on-topic?",
                    description="<<description on what on topic means>>",
                    labels={"YES": "displayed-yes", "NO": "displayed-no"},
                ),
            ],
            fields={
                "input": argilla.TextField(name="input", title="Topic"),
                "output": argilla.TextField(name="output", title="Joke"),
            },
        )

    # Step 5.2
    def to_record(
        self,
        example: Example[StoryTaskInput, None],
        *output: SuccessfulExampleOutput[StoryTaskOutput],
    ) -> RecordDataSequence:
        """Build one Argilla record per run output of the given example.

        Args:
            example: Example whose `input.topic` fills the "input" field.
            *output: Run outputs for the example; the `output.story` of each
                one fills the "output" field of its own record.

        Returns:
            A `RecordDataSequence` containing one `RecordData` per element of
            `output`, each carrying `example.id` as its `example_id`.
        """
        return RecordDataSequence(
            records=[
                RecordData(
                    content={
                        # labels as defined in Field.name
                        self.fields["input"].name: example.input.topic,
                        self.fields["output"].name: run_output.output.story,
                    },
                    example_id=example.id,
                )
                for run_output in output
            ]
        )

    # Step 5.3
    def from_record(self, argilla_evaluation: ArgillaEvaluation) -> FunnyOutputRating:
        """Convert a labelled Argilla evaluation into a `FunnyOutputRating`.

        Args:
            argilla_evaluation: Evaluation whose `responses` contain the
                answers to the "rating" and "on-topic" questions.

        Returns:
            The annotator's rating if the "on-topic" response is "YES",
            otherwise a rating of 0.
        """
        responses = argilla_evaluation.responses
        # Off-topic outputs are scored 0 regardless of the rating given.
        rating = responses["rating"] if responses["on-topic"] == "YES" else 0
        return FunnyOutputRating(rating=rating)


# Instantiate the evaluation logic; it is handed to the `ArgillaEvaluator` in Step 6.
evaluation_logic = CustomArgillaEvaluationLogic()

In [None]:
# we skip this as we do not have a dataset or run in this example
# NOTE(review): Step 0 creates `my_example_data`, which supplies the dataset
# repository, run repository and run IDs used below — confirm whether this cell
# is still meant to be skipped and update the line above accordingly.

# Step 6
# Submit the outputs of the given runs to Argilla for labelling.

evaluator = ArgillaEvaluator(
    my_example_data.dataset_repository,
    my_example_data.run_repository,
    evaluation_repository,
    description="My evaluation description",
    evaluation_logic=evaluation_logic,
    argilla_client=client,
    workspace_id=workspace_id,
)
partial_evaluation_overview = evaluator.submit(*my_example_data.run_ids)
# This ID is what `evaluator.retrieve` is called with in Step 8.
print("ID to retrieve results later: ", partial_evaluation_overview.id)

# Step 7

####################################
# Evaluate via the Argilla UI here #
####################################

# Step 8
# Presumably this should only run once labelling in the Argilla UI is finished — TODO confirm.
evaluation_overview = evaluator.retrieve(partial_evaluation_overview.id)
print("ID to retrieve the evaluation later: ", evaluation_overview.id)

In [None]:
# cleanup
# NOTE(review): `_delete_dataset` is a private method of the client; presumably it
# removes the Argilla dataset created for this evaluation — confirm, and prefer
# a public API if one exists.
client._delete_dataset(partial_evaluation_overview.id)