In [None]:
import shutil
from pathlib import Path
from typing import Iterable

from datasets import load_dataset
from dotenv import load_dotenv
from pydantic import BaseModel

from intelligence_layer.connectors import (
    ArgillaRatingEvaluation,
    DefaultArgillaClient,
    Field,
    LimitedConcurrencyClient,
    Question,
    RecordData,
)
from intelligence_layer.core import (
    CompleteOutput,
    Instruct,
    InstructInput,
    LuminousControlModel,
)
from intelligence_layer.evaluation import (
    AggregationLogic,
    Aggregator,
    ArgillaEvaluationLogic,
    ArgillaEvaluator,
    AsyncFileEvaluationRepository,
    Example,
    FileAggregationRepository,
    FileDatasetRepository,
    FileRunRepository,
    RecordDataSequence,
    Runner,
    SuccessfulExampleOutput,
)

# Load the variables from the .env file; the notebook expects the Aleph Alpha
# token and the Argilla URL/key to be defined there.
load_dotenv()

# Client built from the environment (via `from_env`); it is handed to the model further below.
client = LimitedConcurrencyClient.from_env()


# Root folder of all file-based repositories used in this notebook.
# The cleanup cell at the end removes it again.
REPOSITORY_ROOT_PATH = Path("human-eval-data")

# Human Evaluation using the Intelligence Layer

Although there are a variety of ways to automate the evaluation of LLM-based tasks, sometimes it is still necessary to get a human opinion.
To make this as painless as possible, we have integrated an [Argilla](https://argilla.io/)-Evaluator into the intelligence layer.
This notebook serves as a quick start guide.

## Environment setup
This notebook expects that you have added your Aleph Alpha token to your .env file.
Additionally, you need to add the `ARGILLA_API_URL` and `ARGILLA_API_KEY` from env.sample to your .env file.
Next, run

```bash
docker-compose up -d
``` 

from the intelligence layer base directory.

Once you go to `localhost:6900` and you are prompted to enter a username and password, use:
- username: `argilla`
- password: `1234`

<div class="alert alert-warning">

This notebook is designed such that the creation of the dataset, the submission to Argilla and the aggregation of the Argilla evaluations do not have to be done in a single session.

As a result, the data repositories are redefined for each step and we use file-based repositories that persist the data. If you run all steps in a single session, you can use InMemory-based repositories and reuse the same repository object for multiple steps.

Running this notebook creates a `human-eval-data` folder, which will be deleted if you run the whole notebook to completion. It also creates the `test-human-eval` Argilla workspace, which will also be deleted afterwards.
</div>

## Dataset Repository definition
First we need to define our dataset. Here we use an [Instruction Dataset](https://huggingface.co/datasets/HuggingFaceH4/instruction-dataset?row=0) from Huggingface. Before we can use it for human eval, we need to make an intelligence layer dataset repository.

In [None]:
# Load the instruction dataset and keep only its "test" split.
dataset = load_dataset("HuggingfaceH4/instruction-dataset")["test"]

Let us explore the dataset a bit. It consists of prompts, example completions and metadata for 327 examples. Since we are doing human eval, for now we only need the prompt and corresponding id.

In [None]:
# Print the dataset summary, then the keys found in the metadata of the first entry.
print(dataset)
print(dataset["meta"][0].keys())

We could now build a single `Example` like this:

In [None]:
# Build one example from the first dataset entry: the prompt becomes the
# instruction and the metadata id (as a string) becomes the example id.
# No expected output is set.
example = Example(
    input=InstructInput(instruction=dataset["prompt"][0], input=None),
    expected_output=None,
    id=str(dataset["meta"][0]["id"]),
)

For our dataset repository, we can either use a `FileDatasetRepository` or an `InMemoryDatasetRepository`.

In [None]:
# Number of dataset entries that are turned into examples.
num_examples = 5
if num_examples > len(dataset):
    raise ValueError(
        f"num_examples ({num_examples}) exceeds the dataset size ({len(dataset)})"
    )

# Read each column once up front instead of once per example inside the
# comprehension below.
prompts = dataset["prompt"]
metas = dataset["meta"]

dataset_repository = FileDatasetRepository(REPOSITORY_ROOT_PATH)
# NOTE(review): `create_dataset` appears to return a dataset object rather than
# a bare id (its `.name` is read later in the notebook) - confirm.
dataset_id = dataset_repository.create_dataset(
    examples=[
        Example(
            input=InstructInput(instruction=prompts[i], input=None),
            expected_output=None,
            id=str(metas[i]["id"]),
        )
        for i in range(num_examples)
    ],
    dataset_name="human-evaluation-dataset",
)

In [None]:
# Display the name of the dataset that was just created.
dataset_id.name

## Task Setup

We use an `Instruct` task to run the examples in our dataset.
In addition, we define a `Runner` to generate the completions from the model for our dataset
and a `RunRepository` to save the results.

In [None]:
# The task whose completions will be rated: an `Instruct` task backed by the
# "luminous-base-control" model.
model = LuminousControlModel(name="luminous-base-control", client=client)
task = Instruct(model=model)

# The repository is re-created so that this step also works in a fresh session.
dataset_repository = FileDatasetRepository(REPOSITORY_ROOT_PATH)
# Either remember the id from the dataset-creation step or look it up by name as below.
dataset_id = [
    stored_dataset.id
    for stored_dataset in dataset_repository.datasets()
    if stored_dataset.name == "human-evaluation-dataset"
][0]

run_repository = FileRunRepository(REPOSITORY_ROOT_PATH)
# "instruct-run" is the run description; later steps use it to find this run again.
runner = Runner(task, dataset_repository, run_repository, "instruct-run")

# Generate the model completions for every example of the dataset.
run_overview = runner.run_dataset(dataset_id)

## Evaluator Definition


At the end of our evaluation we want a float score $s \in [1,5]$ describing the model performance.
We define this as an `InstructAggregatedEvaluation`, which will be used in our aggregation later.

We also define the `InstructEvaluation`, which represents an evaluation of a single entry, which we will aggregate later.

In [None]:
# Aggregated result over all human-evaluated examples.
# Both rating fields are `None` when there was nothing to aggregate.
class InstructAggregatedEvaluation(BaseModel):
    general_rating: float | None  # mean of the "general_rating" answers
    fluency: float | None  # mean of the "fluency" answers
    evaluated_examples: int  # number of evaluations that were aggregated


# Human evaluation of a single example, built from the answers given in Argilla.
class InstructEvaluation(BaseModel):
    general_rating: float  # answer to the "general_rating" question
    fluency: float  # answer to the "fluency" question

We can now start to define our human evaluation. This is done with `Questions` and `Fields`.  
`Fields` define what a user has to evaluate. In our example, this will be the model input (Instruction) and output (Model Completion).  
`Questions` are what a user has to answer in order to evaluate the `Fields`. The `name` property will later be used to access the human ratings.  
Both of these are passed to the `ArgillaEvaluationLogic` to create `RecordData` to convert data back and forth from Argilla. 

In [None]:
# Questions the annotator answers for every record; both offer the options 1 to 5.
questions = [
    Question(
        name="general_rating",  # name of the field in program, used for retrieval later
        title="Rating",  # name shown to the user
        description="Rate this completion on a scale from 1 to 5",
        options=range(1, 6),
    ),
    Question(
        name="fluency",
        title="Fluency",
        description="How fluent is the completion?",
        options=range(1, 6),
    ),
]

# Fields shown to the annotator, keyed by the names the evaluation logic uses
# to look them up ("input" and "output").
fields = {
    "input": Field(name="input", title="Instruction"),
    "output": Field(name="output", title="Model Completion"),
}

Our defined fields and questions will look like this:
![Argilla Interface](../../assets/argilla_interface.png)

We can now define our `InstructArgillaEvaluationLogic` to translate our data to specific Argilla formats.
The logic has to implement the two abstract methods `to_record` and `from_record`.
Let's look at the documentation:

In [None]:
# Print the documentation of both methods, separated by a line of dashes.
help(ArgillaEvaluationLogic.to_record)
print("-" * 100)
help(ArgillaEvaluationLogic.from_record)

Instead of performing the evaluation, the `ArgillaEvaluationLogic` is responsible for converting the evaluation data to a format that is accepted by Argilla. During the evaluation, these records will simply be submitted to Argilla and retrieved later.
We will now create everything we need to submit these evaluations to our Argilla instance.

In [None]:
class InstructArgillaEvaluationLogic(
    ArgillaEvaluationLogic[
        InstructInput,
        CompleteOutput,
        None,
        InstructEvaluation,
    ]
):
    """Translate instruct examples into Argilla records and ratings back into evaluations."""

    def to_record(
        self,
        example: Example[InstructInput, None],
        *example_outputs: SuccessfulExampleOutput[CompleteOutput],
    ) -> RecordDataSequence:
        """Build the Argilla record for one example.

        Args:
            example: The example whose instruction is shown to the annotator.
            *example_outputs: Outputs produced for the example; only the
                first one is put into the record.

        Returns:
            A sequence holding exactly one record with the instruction and
            the completion, tagged with the id of the example.
        """
        first_output = example_outputs[0]
        record_content = {
            self.fields["input"].name: example.input.instruction,
            self.fields["output"].name: first_output.output.completion,
        }
        record = RecordData(content=record_content, example_id=example.id)
        return RecordDataSequence(records=[record])

    def from_record(
        self, argilla_evaluation: ArgillaRatingEvaluation
    ) -> InstructEvaluation:
        """Turn the answers of one Argilla evaluation into an `InstructEvaluation`.

        Args:
            argilla_evaluation: The human evaluation; its responses are read
                under the question names "general_rating" and "fluency".

        Returns:
            The evaluation carrying both ratings.
        """
        responses = argilla_evaluation.responses
        return InstructEvaluation(
            general_rating=responses["general_rating"],
            fluency=responses["fluency"],
        )


# Client for the Argilla instance.
# NOTE(review): presumably configured through ARGILLA_API_URL / ARGILLA_API_KEY
# from the environment - confirm.
argilla_client = DefaultArgillaClient()
# Workspace used for this notebook; the cleanup cell deletes it again.
workspace_id = argilla_client.ensure_workspace_exists("test-human-eval")

# The repositories are re-created so that this step can run in a separate session.
dataset_repository = FileDatasetRepository(REPOSITORY_ROOT_PATH)
run_repository = FileRunRepository(REPOSITORY_ROOT_PATH)
evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)

eval_logic = InstructArgillaEvaluationLogic(fields, questions)
# "instruct-evaluation" is the description used later to look this evaluation up again.
evaluator = ArgillaEvaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "instruct-evaluation",
    eval_logic,
    argilla_client=argilla_client,
    workspace_id=workspace_id,
)

After setting up the `ArgillaEvaluator`, the `submit` method posts the records to the Argilla instance.

In [None]:
# either remember the id from before (run_overview.id) or retrieve as below
# (the run is found through the description that was passed to the `Runner`)
run_id = [
    overview.id
    for overview in run_repository.run_overviews()
    if overview.description == "instruct-run"
][0]


# Submit the records of this run to Argilla and show the resulting partial overview.
partial_eval_overview = evaluator.submit(run_id)
print(partial_eval_overview)

If we try to perform an aggregation right now, it will have no evaluations, as none of the submitted records were evaluated by humans through Argilla yet.  
The next step fetches only results that have already been evaluated.

---

**Note:** Sometimes it is best to split up the human evaluation effort into multiple people. To best facilitate this, it is possible to split up the dataset by giving them labels.
Our Argilla client offers an easy way to do this:

In [None]:
# Split the submitted records into three labelled parts, so that the rating
# work can be distributed over several people.
eval_id = partial_eval_overview.id
argilla_client.split_dataset(eval_id, n_splits=3)

The records can then be filtered by these splits, as shown below.  
<img src="../../assets/argilla_splits.png" alt="drawing" width="300"/>

To finish the evaluation, we can retrieve the evaluated examples as follows:

In [None]:
# The repository is re-created so that this step also works in a new session.
evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)

# either remember the id from before (partial_eval_overview.id) or retrieve as below
eval_id = [
    overview.id
    for overview in evaluation_repository.partial_evaluation_overviews()
    if overview.description == "instruct-evaluation"
][0]

# Fetch the evaluations for the submitted records from Argilla.
evaluation_overview = evaluator.retrieve(eval_id)


Note that all examples that are not yet evaluated in Argilla are noted as `failed_examples` and not passed to the next step.

---

For the Aggregation, we first need to define our `AggregationLogic` that takes our previously defined types as input and output. Here, we use `InstructEvaluation` and `InstructAggregatedEvaluation`.

In [None]:
class InstructArgillaAggregationLogic(
    AggregationLogic[InstructEvaluation, InstructAggregatedEvaluation]
):
    """Average the human ratings over all evaluated examples."""

    def aggregate(
        self,
        evaluations: Iterable[InstructEvaluation],
    ) -> InstructAggregatedEvaluation:
        """Compute the mean general rating and the mean fluency.

        Args:
            evaluations: The individual human evaluations; consumed once.

        Returns:
            The averaged ratings together with the number of evaluations.
            When there are no evaluations, both averages are `None` and
            the count is 0.
        """
        collected = list(evaluations)
        count = len(collected)

        # Nothing has been rated yet, so there is nothing to average.
        if not collected:
            return InstructAggregatedEvaluation(
                general_rating=None,
                fluency=None,
                evaluated_examples=0,
            )

        rating_total = sum(item.general_rating for item in collected)
        fluency_total = sum(item.fluency for item in collected)

        return InstructAggregatedEvaluation(
            general_rating=rating_total / count,
            fluency=fluency_total / count,
            evaluated_examples=count,
        )


aggregation_logic = InstructArgillaAggregationLogic()

With this, we can define our `Aggregator` and aggregate all evaluations. This step is the same as non-human evaluation.

In [None]:
# Aggregations are stored under the same root folder as the other repositories.
aggregation_repository = FileAggregationRepository(REPOSITORY_ROOT_PATH)

# NOTE(review): "instruct-aggregation" is presumably the description of this
# aggregation, analogous to the run and evaluation descriptions - confirm.
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    "instruct-aggregation",
    aggregation_logic,
)

# Aggregate the evaluation identified by `eval_id` and print the resulting statistics.
output = aggregator.aggregate_evaluation(eval_id)
print(output.statistics)

### Cleanup

In [None]:
# ! WARNING ! This deletes the "test-human-eval" argilla workspace and the "human-eval-data" folder.
argilla_client.delete_workspace(workspace_id)

# Remove the folder that holds all file-based repositories created by this notebook.
shutil.rmtree(REPOSITORY_ROOT_PATH)