<center>
    <p style="text-align:center">
        <img alt="phoenix logo" src="https://raw.githubusercontent.com/Arize-ai/phoenix-assets/9e6101d95936f4bd4d390efc9ce646dc6937fb2d/images/socal/github-large-banner-phoenix.jpg" width="1000"/>
        <br>
        <br>
        <a href="https://docs.arize.com/phoenix/">Docs</a>
        |
        <a href="https://github.com/Arize-ai/phoenix">GitHub</a>
        |
        <a href="https://join.slack.com/t/arize-ai/shared_invite/zt-1px8dcmlf-fmThhDFD_V_48oU7ALan4Q">Community</a>
    </p>
</center>
<h1 align="center">Answer Relevancy and Context Relevancy Evaluations - LlamaTrace</h1>
<h5 align="center">👉 See Llama-Index <a href="https://github.com/run-llama/llama_index/blob/80cee5a511360eedd7837f20d283bf0a9bd05603/docs/docs/examples/evaluation/answer_and_context_relevancy.ipynb">notebook</a> for more info 👈</h5>


In [1]:
# Quietly install/upgrade Phoenix (>= 4.6, with its llama-index extra) and nest_asyncio
# into the kernel's environment.
%pip install -Uqqq "arize-phoenix[llama-index]>=4.6" nest_asyncio

# Enter OpenAI and Phoenix API Keys

In [2]:
import os
from getpass import getpass

# Prompt for the OpenAI key only when it is not already in the environment.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("🔑 Enter your OpenAI API key: ")

# The tracing setup later in this notebook reads the key from "PHOENIX-API-KEY".
# Hyphenated names cannot be exported from a POSIX shell, so a key supplied as
# "PHOENIX_API_KEY" is accepted first; the interactive prompt is the fallback.
if not os.getenv("PHOENIX-API-KEY"):
    os.environ["PHOENIX-API-KEY"] = os.getenv("PHOENIX_API_KEY") or getpass(
        "🔑 Enter your Phoenix API key: "
    )

🔑 Enter your OpenAI API key: ··········
🔑 Enter your Phoenix API key: ··········


# Import Modules

In [3]:
import json
import tempfile
from textwrap import shorten
from time import time_ns
from typing import Tuple

import nest_asyncio
import phoenix as px
from llama_index.core import VectorStoreIndex
from llama_index.core.evaluation import AnswerRelevancyEvaluator, ContextRelevancyEvaluator
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.llms.openai import OpenAI
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from phoenix.experiments import evaluate_experiment, run_experiment
from phoenix.experiments.types import Explanation, Score

# NOTE(review): presumably needed so the async experiment helpers can run inside the
# notebook's already-running event loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

# Connect to Phoenix

In [5]:
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
    OTLPSpanExporter as HTTPSpanExporter,
)
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor


# Publish the Phoenix API key through the header variables (named for the OTLP
# exporter and the Phoenix client) and record the hosted collector endpoint.
PHOENIX_API_KEY = os.environ["PHOENIX-API-KEY"]
os.environ.update(
    {
        "OTEL_EXPORTER_OTLP_HEADERS": f"api_key={PHOENIX_API_KEY}",
        "PHOENIX_CLIENT_HEADERS": f"api_key={PHOENIX_API_KEY}",
        "PHOENIX_COLLECTOR_ENDPOINT": "https://app.phoenix.arize.com",
    }
)

# Tracer provider whose spans are handed to an HTTP exporter aimed at Phoenix.
# The exporter is created only after the header variables above are in place.
tracer_provider = trace_sdk.TracerProvider()
span_phoenix_processor = SimpleSpanProcessor(
    HTTPSpanExporter(endpoint="https://app.phoenix.arize.com/v1/traces")
)
tracer_provider.add_span_processor(span_processor=span_phoenix_processor)

# Instrument LlamaIndex so its spans go through that provider.
LlamaIndexInstrumentor().instrument(skip_dep_check=True, tracer_provider=tracer_provider)

# Upload Dataset to Phoenix

In [7]:
dataset_name = "EvaluatingLlmSurveyPaperDataset"
sample_size = 7

# Download into a throwaway directory; only the loaded objects outlive it.
with tempfile.TemporaryDirectory() as dir_name:
    rag_dataset, documents = download_llama_dataset(dataset_name, dir_name)

# Upload a fixed-seed sample of the examples. The nanosecond timestamp suffix
# gives every upload a distinct dataset name.
dataset = px.Client().upload_dataset(
    dataframe=rag_dataset.to_pandas().sample(n=sample_size, random_state=42),
    dataset_name=f"{dataset_name}_{time_ns()}",
)

📤 Uploading dataset...
💾 Examples uploaded: https://app.phoenix.arize.com/datasets/RGF0YXNldDo0MA==/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246NDI=




# Dataset Can be Viewed as Dataframe

In [8]:
# Show the uploaded dataset as a DataFrame: one row per example, indexed by example_id.
dataset.as_dataframe()

Unnamed: 0_level_0,input,output,metadata
example_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RGF0YXNldEV4YW1wbGU6MTQ2Mw==,{'reference_contexts': ['1 to 5. The JEEBench ...,{'reference_answer': 'Some of the different ty...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_..."
RGF0YXNldEV4YW1wbGU6MTQ2NA==,"{'reference_contexts': ['CoRR, abs/2308.14508,...",{'reference_answer': 'The purpose of benchmark...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_..."
RGF0YXNldEV4YW1wbGU6MTQ2NQ==,"{'reference_contexts': ['Potsawee Manakul, Adi...",{'reference_answer': 'The title of the paper w...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_..."
RGF0YXNldEV4YW1wbGU6MTQ2Ng==,{'reference_contexts': ['and interactivity. Co...,{'reference_answer': 'The provided context inf...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_..."
RGF0YXNldEV4YW1wbGU6MTQ2Nw==,"{'reference_contexts': ['Paul Roit, Johan Ferr...",{'reference_answer': 'The title of the paper p...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_..."
RGF0YXNldEV4YW1wbGU6MTQ2OA==,{'reference_contexts': ['Association for Compu...,{'reference_answer': 'The main focus of the pa...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_..."
RGF0YXNldEV4YW1wbGU6MTQ2OQ==,"{'reference_contexts': ['Gelei Deng, Yi Liu, Y...",{'reference_answer': 'The main focus of the pa...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_..."


# Take a Look at the Data Structure of an Example

In [9]:
# Display the first example to see its id / input / output / metadata layout.
dataset[0]

Example(
    id="RGF0YXNldEV4YW1wbGU6MTQ2Mw==",
    [1m[94minput[0m={
        "query": "In the context of evaluating language models'...",
        "reference_contexts": [
            "1 to 5. The JEEBench (Arora et al., 2023) is..."
        ]
    },
    [1m[94moutput[0m={
        "reference_answer": "Some of the different types of prompting..."
    },
    [1m[94mmetadata[0m={
        "query_by": "ai (gpt-3.5-turbo)",
        "reference_answer_by": "ai (gpt-3.5-turbo)"
    },
)

# Define Task Function on Examples

The task function can be either sync or async.

In [10]:
# Index the documents downloaded with the dataset, then obtain a query engine over
# that index; the task function queries through `query_engine`.
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()


async def task(input):
    """Answer one dataset example with the module-level query engine.

    Args:
        input: The example's input mapping; only its "query" entry is read.

    Returns:
        A dict with "contexts" (the text of every source node attached to the
        answer) and "response" (the answer's `response` attribute).
    """
    result = await query_engine.aquery(input["query"])
    contexts = []
    for source_node in result.source_nodes:
        contexts.append(source_node.text)
    return {"contexts": contexts, "response": result.response}

# Check that Task Can Run Successfully

In [11]:
# Run the task once on the first example and print a JSON preview of its
# output, truncated to 80 characters.
example = dataset[0]
task_output = await task(example.input)
print(shorten(json.dumps(task_output), width=80))

{"contexts": ["of current prompting methods and libraries when confronted [...]


# Dry-Run Experiment

Run the task on 3 randomly selected examples.

In [12]:
# dry_run=3 limits the run to three examples; the resulting run ids carry a DRY_RUN_ prefix.
experiment = run_experiment(dataset, task, dry_run=3)

🧪 Experiment started.
🌵️ This is a dry-run for these example IDs:
RGF0YXNldEV4YW1wbGU6MTQ2Mw==
RGF0YXNldEV4YW1wbGU6MTQ2NA==
RGF0YXNldEV4YW1wbGU6MTQ2OA==


running tasks |          | 0/3 (0.0%) | ⏳ 00:00<? | ?it/s

✅ Task runs completed.

Tasks Summary (08/12/24 09:30 PM +0000)
---------------------------------------
|   n_examples |   n_runs |   n_errors |
|-------------:|---------:|-----------:|
|            3 |        3 |          0 |


# Experiment Results Can be Viewed as Dataframe

In [13]:
# Tabulate the dry-run results: one row per run, indexed by run_id.
experiment.as_dataframe()

Unnamed: 0_level_0,output,input,expected,metadata,example_id
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DRY_RUN_1b1952,{'contexts': ['of current prompting methods an...,{'reference_contexts': ['1 to 5. The JEEBench ...,{'reference_answer': 'Some of the different ty...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2Mw==
DRY_RUN_0a5485,"{'contexts': ['CoRR, abs/2308.14508, 2023a. do...","{'reference_contexts': ['CoRR, abs/2308.14508,...",{'reference_answer': 'The purpose of benchmark...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2NA==
DRY_RUN_aaf7d2,{'contexts': ['Association for Computational L...,{'reference_contexts': ['Association for Compu...,{'reference_answer': 'The main focus of the pa...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2OA==


# Take a Look at the Data Structure of an Experiment Run

In [14]:
# Display the first run; the evaluators below receive its `output` and `input` attributes.
experiment[0]

ExperimentRun(
    id="DRY_RUN_1b1952",
    example_id="RGF0YXNldEV4YW1wbGU6MTQ2Mw==",
    [1m[94moutput[0m={
        "contexts": [
            "of current prompting methods and libraries when...",
            "1 to 5. The JEEBench (Arora et al., 2023) is..."
        ],
        "response": "Some of the different types of prompting..."
    }
    [1m[94mexpected[0m={ # alias for the example.[1m[94moutput[0m dict
        "reference_answer": "Some of the different types of prompting..."
    },
    [1m[94mreference[0m={ # alias for the example.[1m[94moutput[0m dict
        "reference_answer": "Some of the different types of prompting..."
    },
    [1m[94minput[0m={ # alias for the example.[1m[94minput[0m dict
        "query": "In the context of evaluating language models'...",
        "reference_contexts": [
            "1 to 5. The JEEBench (Arora et al., 2023) is..."
        ]
    },
    [1m[94mmetadata[0m={ # alias for the example.[1m[94mmetadata[0m dict
     

# Define Evaluators For Each Experiment Run

Evaluators can be sync or async.

The function arguments `output` and `input` refer to the attributes of the same name in the `ExperimentRun` data structure shown above.

In [15]:
async def answer_relevancy(output, input) -> Tuple[Score, Explanation]:
    """Grade the task's response against the query with AnswerRelevancyEvaluator.

    Args:
        output: The run output; its "response" entry is evaluated.
        input: The example input; its "query" entry is the question asked.

    Returns:
        The evaluation's score and its feedback, in that order.
    """
    judge = OpenAI(temperature=0, model="gpt-4o")
    evaluator = AnswerRelevancyEvaluator(llm=judge)
    verdict = await evaluator.aevaluate(input["query"], response=output["response"])
    return verdict.score, verdict.feedback


async def context_relevancy(output, input) -> Tuple[Score, Explanation]:
    """Grade the retrieved contexts against the query with ContextRelevancyEvaluator.

    Args:
        output: The run output; its "contexts" entry is evaluated.
        input: The example input; its "query" entry is the question asked.

    Returns:
        The evaluation's score and its feedback, in that order.
    """
    judge = OpenAI(temperature=0, model="gpt-4o")
    evaluator = ContextRelevancyEvaluator(llm=judge)
    verdict = await evaluator.aevaluate(input["query"], contexts=output["contexts"])
    return verdict.score, verdict.feedback


# Evaluators handed to evaluate_experiment / run_experiment.
evaluators = [answer_relevancy, context_relevancy]

# Check that Evals Can Run Successfully

In [16]:
run = experiment[0]
example = dataset.examples[run.dataset_example_id]
for fn in (answer_relevancy, context_relevancy):
    _ = await fn(run.output, example.input)
    print(fn.__qualname__)
    print(shorten(json.dumps(_), width=80))

answer_relevancy
[1.0, "1. Does the provided response match the subject matter of the [...]
context_relevancy
[1.0, "### Feedback\n\n#### Question 1: Does the retrieved context match [...]


# Run Evaluations

In [17]:
# Run both evaluators over the dry-run experiment. Note that `experiment` is rebound
# to the evaluated result, so re-running this cell evaluates the already-evaluated object.
experiment = evaluate_experiment(experiment, evaluators)

🧠 Evaluation started.
🌵️ This is a dry-run for these example IDs:
RGF0YXNldEV4YW1wbGU6MTQ2Mw==
RGF0YXNldEV4YW1wbGU6MTQ2NA==
RGF0YXNldEV4YW1wbGU6MTQ2OA==


running experiment evaluations |          | 0/6 (0.0%) | ⏳ 00:00<? | ?it/s


Experiment Summary (08/12/24 09:31 PM +0000)
--------------------------------------------
| evaluator         |   n |   n_scores |   avg_score |
|:------------------|----:|-----------:|------------:|
| answer_relevancy  |   3 |          3 |    1        |
| context_relevancy |   3 |          3 |    0.541667 |

Tasks Summary (08/12/24 09:30 PM +0000)
---------------------------------------
|   n_examples |   n_runs |   n_errors |
|-------------:|---------:|-----------:|
|            3 |        3 |          0 |


# Evaluation Results Can be Viewed as Dataframe

In [18]:
# Tabulate the evaluations: one row per (run, evaluator) pair with score and explanation.
experiment.get_evaluations()

Unnamed: 0_level_0,name,score,explanation,output,input,expected,metadata,example_id
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DRY_RUN_0a5485,answer_relevancy,1.0,1. Does the provided response match the subjec...,"{'contexts': ['CoRR, abs/2308.14508, 2023a. do...","{'reference_contexts': ['CoRR, abs/2308.14508,...",{'reference_answer': 'The purpose of benchmark...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2NA==
DRY_RUN_0a5485,context_relevancy,0.375,### Feedback\n\n#### Question 1: Does the retr...,"{'contexts': ['CoRR, abs/2308.14508, 2023a. do...","{'reference_contexts': ['CoRR, abs/2308.14508,...",{'reference_answer': 'The purpose of benchmark...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2NA==
DRY_RUN_1b1952,answer_relevancy,1.0,1. Does the provided response match the subjec...,{'contexts': ['of current prompting methods an...,{'reference_contexts': ['1 to 5. The JEEBench ...,{'reference_answer': 'Some of the different ty...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2Mw==
DRY_RUN_1b1952,context_relevancy,1.0,### Feedback\n\n#### Question 1: Does the retr...,{'contexts': ['of current prompting methods an...,{'reference_contexts': ['1 to 5. The JEEBench ...,{'reference_answer': 'Some of the different ty...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2Mw==
DRY_RUN_aaf7d2,answer_relevancy,1.0,1. Does the provided response match the subjec...,{'contexts': ['Association for Computational L...,{'reference_contexts': ['Association for Compu...,{'reference_answer': 'The main focus of the pa...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2OA==
DRY_RUN_aaf7d2,context_relevancy,0.25,### Feedback\n\n#### Question 1: Does the retr...,{'contexts': ['Association for Computational L...,{'reference_contexts': ['Association for Compu...,{'reference_answer': 'The main focus of the pa...,"{'query_by': 'ai (gpt-3.5-turbo)', 'reference_...",RGF0YXNldEV4YW1wbGU6MTQ2OA==


# Run Task and Evals Together

In [19]:
# Full run (no dry_run): the task runs on every example, then each evaluator runs on every result.
# NOTE(review): the result is bound to `_`, presumably just to discard it — confirm nothing reads it.
_ = run_experiment(dataset, task, evaluators)

🧪 Experiment started.
📺 View dataset experiments: https://app.phoenix.arize.com/datasets/RGF0YXNldDo0MA==/experiments
🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDo0MA==/compare?experimentId=RXhwZXJpbWVudDozNw==




running tasks |          | 0/7 (0.0%) | ⏳ 00:00<? | ?it/s

✅ Task runs completed.
🧠 Evaluation started.


running experiment evaluations |          | 0/14 (0.0%) | ⏳ 00:00<? | ?it/s




🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDo0MA==/compare?experimentId=RXhwZXJpbWVudDozNw==

Experiment Summary (08/12/24 09:31 PM +0000)
--------------------------------------------
| evaluator         |   n |   n_scores |   avg_score |
|:------------------|----:|-----------:|------------:|
| answer_relevancy  |   7 |          7 |    1        |
| context_relevancy |   7 |          7 |    0.517857 |

Tasks Summary (08/12/24 09:31 PM +0000)
---------------------------------------
|   n_examples |   n_runs |   n_errors |
|-------------:|---------:|-----------:|
|            7 |        7 |          0 |
