In [1]:
import json
import random
import string
from pathlib import Path

from dotenv import load_dotenv

from intelligence_layer.connectors.studio.studio import StudioClient
from intelligence_layer.core import TextChunk
from intelligence_layer.core.model import Llama3InstructModel
from intelligence_layer.evaluation.benchmark.studio_benchmark import (
    StudioBenchmarkRepository,
)
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.dataset.studio_dataset_repository import (
    StudioDatasetRepository,
)
from intelligence_layer.examples import (
    ClassifyInput,
    PromptBasedClassify,
)
from intelligence_layer.examples.classify.classify import (
    SingleLabelClassifyAggregationLogic,
    SingleLabelClassifyEvaluationLogic,
)
from intelligence_layer.examples.classify.prompt_based_classify_with_definitions import (
    LabelWithDefinition,
    PromptBasedClassifyWithDefinitions,
)

# Load variables from a local `.env` file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the credentials/URLs the Studio client reads — confirm.
load_dotenv()

True

# Evaluate with Studio

This notebook shows how you can evaluate the performance of `Task`s using Studio. This notebook focuses on the `PromptBasedClassify` for demonstration purposes.

First, we need to instantiate the `StudioClient`. We can either pass an existing project or let the `StudioClient` create it by setting the `create_project` flag to `True`.

In [2]:
# Name of the Studio project used throughout this notebook.
project_name = "Classify with Studio"

# `create_project=True` lets the client create the project itself.
studio_client = StudioClient(project=project_name, create_project=True)

# Both repositories are backed by the same client.
studio_dataset_repository = StudioDatasetRepository(studio_client)
studio_benchmark_repository = StudioBenchmarkRepository(studio_client)

Next, we will create our evaluation dataset from some pre-defined dataset.

In [3]:
# Read the pre-defined classification examples from disk.
# The encoding is passed explicitly so the JSON file is decoded the same way
# on every platform instead of depending on the locale's default encoding.
with Path("data/classify_examples.json").open(encoding="utf-8") as json_data:
    data = json.load(json_data)

We need to transform our dataset into the required format. 
Therefore, let's check out what it looks like.

In [4]:
# Peek at the first raw record to see which fields it carries.
data[0]

{'label': 'Finance',
 'message': 'I just traveled to Paris for a conference, where can I get the train ride refunded?'}

This isn't quite the format we need yet, so we translate it into the interface of our `Example`.

This is the target structure:

``` python
class Example(BaseModel, Generic[Input, ExpectedOutput]):
    input: Input
    expected_output: ExpectedOutput
    id: Optional[str] = Field(default_factory=lambda: str(uuid4()))
    metadata: Optional[SerializableDict]
```

We want the `input` in each `Example` to contain the input of an actual task.
The `expected_output` shall correspond to anything we wish to compare our generated output to (i.e., the expected label in our case).

In [5]:
# Every distinct label found in the raw data is offered as a candidate class.
all_labels = list({record["label"] for record in data})

# One `Example` per raw record: the message becomes the task input and
# the annotated label becomes the expected output.
examples = [
    Example(
        input=ClassifyInput(chunk=TextChunk(record["message"]), labels=all_labels),
        expected_output=record["label"],
    )
    for record in data
]

dataset = studio_dataset_repository.create_dataset(
    examples=examples,
    dataset_name="Single Label Classify Dataset",
)
print(f"Dataset ID: {dataset.id}")

Dataset ID: 5e3ce59d-f87c-448d-b811-c02ecae1588d


This also automatically uploads the created dataset to your **Studio** instance.
We can inspect the dataset and the individual examples in **Studio** under **Evaluate/Datasets**. Do not forget to select the correct project!

After we have checked our `Dataset`, we can create our first `Benchmark`. To this end, we need the `EvaluationLogic` and the `AggregationLogic` of our Classify use-case. After creating the `Benchmark`, make sure to copy the ID of the `Benchmark` into the `get_benchmark` method, so you don't have to create the `Benchmark` again every time you run the evaluation.

In [6]:
# Evaluation and aggregation logic of the single-label classify use case.
evaluation_logic = SingleLabelClassifyEvaluationLogic()
aggregation_logic = SingleLabelClassifyAggregationLogic()

# Benchmark names need to be unique, therefore we append a random
# 16-character alphanumeric suffix to the name.
rand_str = "".join(random.choices(string.ascii_letters + string.digits, k=16))

benchmark = studio_benchmark_repository.create_benchmark(
    dataset.id,
    evaluation_logic,
    aggregation_logic,
    f"Single Label Classify Benchmark {rand_str}",
)
print(f"Benchmark ID: {benchmark.id}")

Benchmark ID: ec40c09a-0472-4c70-bd48-7427e4abc87f


With this, we are ready to `execute` our first `Benchmark`. We pass it a meaningful name and execute it. After about two minutes we can take a look at the results in **Studio** in the **Evaluate/Benchmarks** section.

In [7]:
# Baseline run: the classify task constructed with its default arguments.
baseline_task = PromptBasedClassify()
benchmark.execute(baseline_task, "Classify v0.0 with Luminous")

Running Task: 100%|██████████| 24/24 [00:56<00:00,  2.37s/it]
Evaluating: 24it [00:00, 92691.80it/s]
Submitting traces to Studio: 100%|██████████| 24/24 [00:01<00:00, 21.70it/s]


AttributeError: 'list' object has no attribute 'model_dump_json'

Let's try to improve our results and run this again using a `Llama` model.

In [None]:
# Same task as before, this time backed by a Llama 3.1 instruct model.
llama_model = Llama3InstructModel("llama-3.1-8b-instruct")
llama_task = PromptBasedClassify(model=llama_model)
benchmark.execute(llama_task, "Classify v0.1 with Llama")

NameError: name 'benchmark' is not defined

For further comparisons we also `execute` the `PromptBasedClassifyWithDefinitions` task on the same `Benchmark`. This is possible because both `Task`s have the exact same input and output format and can thus be compared to each other.

In [None]:
# Label name -> one-sentence definition handed to the classifier.
# Insertion order is preserved, so the resulting list keeps this order.
label_definitions = {
    "Finance": "Handles reimbursements, salary payments, and financial planning.",
    "Sales": "Manages client inquiries, builds relationships, and drives revenue.",
    "Communications": "Oversees media inquiries, partnerships, and public documentation.",
    "Research": "Collaborates on innovative projects and explores market applications.",
    "IT Support": "Provides technical assistance for devices and platform access issues.",
    "Human Resources": "Manages onboarding, leave requests, and career development.",
    "Product": "Addresses customer issues, ensures compliance, and demonstrates product use.",
    "Customer": "Schedules meetings and ensures customer needs are effectively met.",
    "Security": "Maintains physical and digital safety, including badge and certificate issues.",
    "Marketing": "Manages strategic initiatives and promotes the company's offerings.",
    "CEO Office": "Handles executive engagements and key stakeholder follow-ups.",
}

labels_with_definitions = [
    LabelWithDefinition(name=label_name, definition=label_definition)
    for label_name, label_definition in label_definitions.items()
]

# Run the definition-aware task on the same benchmark so its results can be
# compared with the earlier runs.
classify_with_definitions = PromptBasedClassifyWithDefinitions(
    labels_with_definitions=labels_with_definitions,
    model=Llama3InstructModel("llama-3.1-8b-instruct"),
)
benchmark.execute(classify_with_definitions, "Classify v1.0 with definitions and Llama")