In [None]:
from pathlib import Path
from typing import Any, Optional

from sqlalchemy import and_, func, select
from sqlalchemy.sql.elements import BinaryExpression, ColumnElement
from sqlalchemy.types import Unicode

from phoenix.db import models
from phoenix.db.engines import create_engine

In [None]:
# Experiment ids passed to every print_experiment_runs call below.
# NOTE(review): these literal ids presumably refer to rows in the author's local
# database — adjust them for your own data.
baseline_experiment_id: int = 218  # compared against models.Experiment.id to find the baseline
compare_experiment_ids: list[int] = [217]  # element [0] is used by the "experiments[1]" filter

In [None]:
# Locate the database under the current user's home directory rather than
# hardcoding one developer's absolute path, so the notebook runs on other machines.
# With an absolute path the URL takes the form sqlite:////abs/path
# (three slashes from the scheme plus the leading slash of the path).
PHOENIX_DB_PATH = Path.home() / ".phoenix" / "phoenix.db"
engine = create_engine(f"sqlite:///{PHOENIX_DB_PATH}")


async def print_experiment_runs(
    baseline_experiment_id: int,
    compare_experiment_ids: list[int],
    filter_condition: Optional[ColumnElement[bool]] = None,
) -> None:
    """Print the SQL for, and the ids returned by, a dataset-example query.

    The query selects dataset examples joined to their revisions, to experiment
    runs, and (via an outer join) to experiment run annotations. Only the
    highest-id revision of each example at or before the baseline experiment's
    dataset version is considered, and revisions whose kind is "DELETE" are
    excluded. Rows are ordered by example id, descending.

    Because the query joins to runs and annotations, the same example id may be
    printed more than once.

    Args:
        baseline_experiment_id: Id of the experiment whose dataset id and
            dataset version id scope the query.
        compare_experiment_ids: Accepted for signature parity with the calling
            cells; not used by the query built here.
        filter_condition: Optional SQL boolean expression (for example the
            result of ``and_(...)``) added as an extra WHERE clause.

    Raises:
        AssertionError: If no experiment with ``baseline_experiment_id`` exists.
    """
    async with engine.connect() as conn:
        # A single round-trip is enough: fetch the baseline experiment row once.
        baseline_experiment = (
            await conn.execute(
                select(models.Experiment).where(models.Experiment.id == baseline_experiment_id)
            )
        ).first()
        assert (
            baseline_experiment is not None
        ), f"no experiment found with id {baseline_experiment_id}"
        dataset_id = baseline_experiment.dataset_id
        version_id = baseline_experiment.dataset_version_id

        # For each example in the dataset, the highest revision id whose
        # dataset version is at or before the baseline experiment's version.
        revision_ids = (
            select(func.max(models.DatasetExampleRevision.id))
            .join(
                models.DatasetExample,
                models.DatasetExample.id == models.DatasetExampleRevision.dataset_example_id,
            )
            .where(
                and_(
                    models.DatasetExampleRevision.dataset_version_id <= version_id,
                    models.DatasetExample.dataset_id == dataset_id,
                )
            )
            .group_by(models.DatasetExampleRevision.dataset_example_id)
            .scalar_subquery()
        )
        examples = (
            select(models.DatasetExample)
            .join(
                models.DatasetExampleRevision,
                models.DatasetExample.id == models.DatasetExampleRevision.dataset_example_id,
            )
            .join(
                models.ExperimentRun,
                onclause=models.ExperimentRun.dataset_example_id == models.DatasetExample.id,
            )
            # Outer join so that runs without any annotation are still returned.
            .join(
                models.ExperimentRunAnnotation,
                onclause=models.ExperimentRunAnnotation.experiment_run_id
                == models.ExperimentRun.id,
                isouter=True,
            )
            .where(
                and_(
                    models.DatasetExampleRevision.id.in_(revision_ids),
                    models.DatasetExampleRevision.revision_kind != "DELETE",
                )
            )
            .order_by(models.DatasetExampleRevision.dataset_example_id.desc())
        )
        if filter_condition is not None:
            examples = examples.where(filter_condition)
        # Show the statement with parameter values inlined, for inspection.
        print(examples.compile(compile_kwargs={"literal_binds": True}))
        results = await conn.execute(examples)
        for result in results:
            print(result.id)

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
)

`experiments[0].error is None`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.ExperimentRun.error.is_(None),
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

`experiments[1].error is None`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.ExperimentRun.error.is_(None),
        models.ExperimentRun.experiment_id == compare_experiment_ids[0],
    ),
)

`experiments[0].latency_ms < 5000`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.ExperimentRun.latency_ms < 5000,
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

`experiments[0].evals["judged_correct"] == "incorrect"`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.ExperimentRunAnnotation.name == "judged_correct",
        models.ExperimentRunAnnotation.label == "incorrect",
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

`experiments[0].evals["matches_expected"] <= 0.5`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.ExperimentRunAnnotation.name == "matches_expected",
        models.ExperimentRunAnnotation.score <= 0.5,
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

`"specifications" in experiments[0].input["question"]`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.DatasetExampleRevision.input["question"].cast(Unicode).contains("specifications"),
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

`experiments[0].input["question"] == "Can you give me the specifications of the Samsung Galaxy S21? Also, what are similar options?"`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.DatasetExampleRevision.input["question"].cast(Unicode)
        == "Can you give me the specifications of the Samsung Galaxy S21? Also, what are similar options?",
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

`"product_details" in experiments[0].reference_output["expected_tool_calls"]`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.DatasetExampleRevision.output["expected_tool_calls"]
        .cast(Unicode)
        .contains("product_details"),
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

`"Certainly" in experiments[0].output["messages"][0]["content"]`

In [None]:
await print_experiment_runs(
    baseline_experiment_id,
    compare_experiment_ids,
    and_(
        models.ExperimentRun.output["task_output"]["messages"][0]["content"]
        .cast(Unicode)
        .contains("Certainly"),
        models.ExperimentRun.experiment_id == baseline_experiment_id,
    ),
)

In [None]:
async with engine.connect() as conn:
    results = await conn.execute(
        select(models.ExperimentRun.output["task_output"]["messages"][0]["content"].label("value")),
    )
    for result in results:
        if result.value:
            print(result.value)