In [1]:
from pathlib import Path
from fsspec.implementations.local import LocalFileSystem

from example_data import DummyEvaluationLogic, example_data, DummyEvaluation

from intelligence_layer.evaluation import Evaluator, StudioEvaluationRepository
from intelligence_layer.evaluation.dataset.studio_dataset_repository import StudioDatasetRepository
from intelligence_layer.connectors.data.data import DataClient

# How to evaluate runs
0. Run your tasks on the datasets you want to evaluate them on (see [here](./how_to_run_a_task_on_a_dataset.ipynb))
   - When evaluating multiple runs, all of them need the same data types 
1. Initialize all necessary repositories for the `Evaluator`, and an `EvaluationLogic`.
2. Run the evaluator to evaluate all examples and create a single `EvaluationOverview`.
3. (Optional) Save the evaluation id for later use.

### Example

In [2]:
# Step 0
# The example data exposes two run overviews; their ids select the runs to evaluate.
my_example_data = example_data()
run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]


# Step 1
# Replace the "<your_...>" placeholders and the URL below with your own values.
studio_dataset_repository = StudioDatasetRepository(
    repository_id="<your_repository_id>",
    data_client=DataClient(
        token="<your_token>",
        base_data_platform_url="http://localhost:8080",
    ),
)
# Datasets and runs come from the same example data that produced `run_ids`.
dataset_repository = my_example_data.dataset_repository
run_repository = my_example_data.run_repository
evaluation_repository = StudioEvaluationRepository(
    # auto_mkdir=True lets the local file system create missing parent
    # directories when writing (fsspec option).
    file_system=LocalFileSystem(auto_mkdir=True),
    root_directory=Path("evals"),
    studio_dataset_repository=studio_dataset_repository,
    evaluation_type=DummyEvaluation,
)
evaluation_logic = DummyEvaluationLogic()


Evaluating: 2it [00:00, 31300.78it/s]


Evaluating: 2it [00:00, 28532.68it/s]


In [None]:

# Step 2
# Wire the repositories and the evaluation logic into a single evaluator.
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "My dummy evaluation",
    evaluation_logic,
)

# Evaluate all selected runs in one call; the labels and metadata passed here
# are printed back from the resulting overview below.
evaluation_overview = evaluator.evaluate_runs(
    *run_ids, labels={"label"}, metadata={"key": "value"}
)

# Step 3
# Keep the id around if you want to refer to this evaluation later.
print(evaluation_overview.id)
print(evaluation_overview.metadata)
print(evaluation_overview.labels)