<center>
    <p style="text-align:center">
        <img alt="phoenix logo" src="https://raw.githubusercontent.com/Arize-ai/phoenix-assets/9e6101d95936f4bd4d390efc9ce646dc6937fb2d/images/socal/github-large-banner-phoenix.jpg" width="1000"/>
        <br>
        <br>
        <a href="https://docs.arize.com/phoenix/">Docs</a>
        |
        <a href="https://github.com/Arize-ai/phoenix">GitHub</a>
        |
        <a href="https://arize-ai.slack.com/join/shared_invite/zt-2w57bhem8-hq24MB6u7yE_ZF_ilOYSBw#/shared-invite/email">Community</a>
    </p>
</center>
<h1 align="center">Guideline Eval</h1>
<h5 align="center">👉 See Llama-Index <a href="https://github.com/run-llama/llama_index/blob/80cee5a511360eedd7837f20d283bf0a9bd05603/docs/docs/examples/evaluation/guideline_eval.ipynb">notebook</a> for more info 👈</h5>


<a href="https://colab.research.google.com/github/arize-ai/phoenix/blob/main/tutorials/experiments/llama-index/guideline_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [None]:
# Install/upgrade the packages this notebook imports: arize-phoenix (with its
# llama-index extra, version 4.6 or newer), datasets, and nest_asyncio.
%pip install -Uqqq "arize-phoenix[llama-index]>=4.6" datasets nest_asyncio

# Enter OpenAI API Key

In [None]:
import os
from getpass import getpass

# Prompt for the key only when it is not already set in the environment, so the
# secret is never hard-coded in the notebook; getpass avoids echoing it.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("🔑 Enter your OpenAI API key: ")

# Import Modules

In [None]:
import json
from functools import partial
from textwrap import shorten
from time import time_ns
from typing import Tuple

import nest_asyncio
from datasets import load_dataset
from llama_index.core.evaluation import GuidelineEvaluator
from llama_index.llms.openai import OpenAI
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor

import phoenix as px
from phoenix.experiments import evaluate_experiment, run_experiment
from phoenix.experiments.types import Explanation, Score
from phoenix.otel import register

nest_asyncio.apply()

# Launch Phoenix

In [None]:
# Launch the Phoenix app and call .view() on the returned object; as the last
# expression of the cell, whatever .view() returns is shown as the cell output.
px.launch_app().view()

# Instrument Llama-Index

In [None]:
# Register a tracer provider that targets the local endpoint on port 4317
# (presumably the collector of the Phoenix app launched earlier — TODO confirm).
tracer_provider = register(endpoint="http://127.0.0.1:4317")
# Instrument LlamaIndex with that provider so its calls are traced;
# skip_dep_check=True is passed to bypass the instrumentor's dependency check.
LlamaIndexInstrumentor().instrument(skip_dep_check=True, tracer_provider=tracer_provider)

# Upload Dataset to Phoenix

In [None]:
# Tunables for the demo dataset: how many rows to upload and where they come from.
sample_size = 7
path = "nvidia/ChatQA-Training-Data"
name = "synthetic_convqa"

# Load the train split into pandas and keep only the two columns used downstream.
df = load_dataset(path, name, split="train").to_pandas()[["messages", "document"]]

# Upload a fixed-seed sample; the nanosecond timestamp suffix gives every
# upload a distinct dataset name.
dataset = px.Client().upload_dataset(
    dataset_name=f"{name}_{time_ns()}",
    dataframe=df.sample(n=sample_size, random_state=42),
)

# Dataset Can be Viewed as Dataframe

In [None]:
# Show the uploaded dataset as a dataframe (displayed as the cell output).
dataset.as_dataframe()

# Take a Look at the Data Structure of an Example

In [None]:
# Display the first example to see which fields an example carries.
dataset[0]

# Define Task Function on Examples

The task function can be either sync or async.

In [None]:
# Dedicated name for the task model. `task` looks this global up at call time,
# so sharing the name `llm` with a later cell that rebinds it would silently
# switch the model used by every subsequent experiment run.
task_llm = OpenAI(model="gpt-3.5-turbo")


def task(input):
    """Answer the final message of an example's conversation using its document.

    Args:
        input: The example input; must contain ``document`` (a string) and
            ``messages`` (a list of dicts, each with a ``content`` key).

    Returns:
        The ``text`` attribute of the LLM completion.
    """
    # Prompt = the document, a blank line, then the last message's content.
    prompt = input["document"] + "\n\n" + input["messages"][-1]["content"]
    return task_llm.complete(prompt).text

# Check that Task Can Run Successfully

In [None]:
# Smoke-test `task` on the first dataset example before running an experiment.
example = dataset[0]
task_output = task(example.input)
# json.dumps + shorten keep the printed preview to at most 80 characters.
print(shorten(json.dumps(task_output), width=80))

# Dry-Run Experiment

On 3 randomly selected examples

In [None]:
# Trial run of the task; dry_run=3 restricts it to 3 examples
# (randomly selected, per the notebook text for this section).
experiment = run_experiment(dataset, task, dry_run=3)

# Experiment Results Can be Viewed as Dataframe

In [None]:
# Show the experiment results as a dataframe (displayed as the cell output).
experiment.as_dataframe()

# Take a Look at the Data Structure of an Experiment Run

In [None]:
# Display the first experiment run to see which fields a run carries.
experiment[0]

# Define Evaluators For Each Experiment Run

Evaluators can be sync or async.

Function arguments `output` and `input` refer to the attributes of the same name in the `ExperimentRun` data structure shown above.

In [None]:
# LLM handed to the guideline evaluators (gpt-4o, temperature 0).
llm = OpenAI(temperature=0, model="gpt-4o")
# One guideline per evaluator: each key names an evaluator and each value is
# the guideline text that evaluator checks the response against.
guidelines = {
    "answer_fully": "The response should fully answer the query.",
    "unambiguous": "The response should avoid being vague or ambiguous.",
    "use_numbers": "The response should be specific and use statistics or numbers when possible.",
}


async def adapt(fn, output, input) -> Tuple[Score, Explanation]:
    """Adapt an async ``evaluate``-style callable to a (score, explanation) evaluator.

    Args:
        fn: Awaitable callable accepting ``query``, ``response`` and
            ``contexts`` keyword arguments and returning an object with
            ``passing`` and ``feedback`` attributes.
        output: The task output of the run being evaluated.
        input: The example input; must contain ``messages`` (a list of dicts,
            each with a ``content`` key) and ``document``.

    Returns:
        The ``(passing, feedback)`` pair taken from the evaluation result.
    """
    ans = await fn(
        # Judge against the last message: that is the turn `task` sends to the
        # LLM, so it is the query the response actually answers. Using the
        # first message would mis-grade multi-turn conversations.
        query=input["messages"][-1]["content"],
        response=output,
        contexts=[input["document"]],
    )
    return ans.passing, ans.feedback


# Build one evaluator per guideline, keyed by the guideline's label. Each is
# `adapt` with its first argument pre-bound to that guideline's `aevaluate`.
# A comprehension is used so the loop variables do not leak into the notebook
# namespace.
evaluators = {
    label: partial(
        adapt,
        GuidelineEvaluator(llm=llm, guidelines=guideline_text).aevaluate,
    )
    for label, guideline_text in guidelines.items()
}

# Check that Evals Can Run Successfully

In [None]:
run = experiment[0]
example = dataset.examples[run.dataset_example_id]
for name, fn in evaluators.items():
    _ = await fn(run.output, example.input)
    print(name)
    print(shorten(json.dumps(_), width=80))

# Run Evaluations

In [None]:
# Apply the evaluators to the experiment. The name `experiment` is rebound to
# the returned object, so re-running this cell evaluates the already-evaluated
# experiment rather than the original dry run.
experiment = evaluate_experiment(experiment, evaluators)

# Evaluation Results Can be Viewed as Dataframe

In [None]:
# Show the evaluation results attached to the experiment (displayed as the cell output).
experiment.get_evaluations()

# Run Task and Evals Together

In [None]:
# Run the task and the evaluators in a single call (no dry_run argument here).
# Assigning to `_` discards the return value so it is not displayed as cell output.
_ = run_experiment(dataset, task, evaluators)