In [1]:
from collections.abc import Sequence

from pydantic import BaseModel

from intelligence_layer.evaluation import Example
from intelligence_layer.evaluation.dataset.studio_dataset_repository import StudioDatasetRepository
from intelligence_layer.connectors.data import DataClient

# How to create a dataset

0. Collect data for examples.
1. Convert the data to `Example`s.
2. Create a `DatasetRepository`.
3. Store the `Example`s in the `DatasetRepository`.
4. Remember the dataset id.

### Example

In [3]:
# Input model of the example story task; used as `input` of each `Example` below.
class StoryTaskInput(BaseModel):  # Should already be implemented in your task
    topic: str  # e.g. "rain"
    targeted_word_count: int  # e.g. 42


# Expected-output model of the example story task; used as `expected_output` of each `Example` below.
class StoryTaskExpectedOutput(BaseModel):  # Should already be implemented in your task
    keywords: Sequence[str]  # e.g. ["wet"]


# Step 1 - Convert the collected data to `Example`s.
# A comprehension is used instead of `[Example(...)] * 10`: list multiplication
# only repeats references to ONE instance, so the dataset would contain the very
# same example object ten times (presumably also sharing a single example id —
# verify against `Example`). Here every entry is constructed separately.
examples = [
    Example(
        input=StoryTaskInput(topic="rain", targeted_word_count=42),
        expected_output=StoryTaskExpectedOutput(keywords=["wet"]),
        metadata={
            "author": "Shakespeare"
        },  # the metadata is optional and can contain custom information
    )
    for _ in range(10)
]
# In a real dataset each example would carry its own input and expected output.

# Step 2 - Create the repository. This example uses `StudioDatasetRepository`,
# which talks to the data platform through a `DataClient`; other repositories
# such as FileDatasetRepository or HuggingFaceDatasetRepository can be used for
# persistence as well. The id, token and URL below are placeholders.
data_client = DataClient(
    token="your_token",
    base_data_platform_url="http://localhost:8080",
)
dataset_repository = StudioDatasetRepository(
    repository_id="<repository_id>",
    data_client=data_client,
)

# Step 3 - Store the examples as a named dataset; labels and metadata are
# passed along with it.
dataset = dataset_repository.create_dataset(
    examples=examples,
    dataset_name="StoryDataset",
    labels={"label1", "label2"},
    metadata={"key_a": ["a", "b"], "key_b": "value"},
)

# Step 4 - Show the dataset id (the value to remember), followed by the
# labels and metadata, one per line.
print(dataset.id, dataset.labels, dataset.metadata, sep="\n")

65421249-cdea-4a98-a5c8-0ed9280869d5
{'label2', 'label1'}
{'key_a': ['a', 'b'], 'key_b': 'value'}
