In [1]:
import sys
from pathlib import Path

project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root / "packages" / "phoenix-evals" / "src"))

In [None]:
from phoenix.evals import BedrockModel, llm_classify

In [3]:
import os
from getpass import getpass

In [4]:
if not (aws_access_key := os.getenv("AWS_ACCESS_KEY_ID")):
    aws_access_key = getpass("🔑 Enter your AWS Access Key ID: ")
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key

In [5]:
if not (aws_secret_key := os.getenv("AWS_SECRET_ACCESS_KEY")):
    aws_secret_key = getpass("🔑 Enter your AWS Secret Access Key: ")
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_key

In [6]:
if not (aws_session_token := os.getenv("AWS_SESSION_TOKEN")):
    aws_session_token = getpass("🔑 Enter your AWS Session Token: ")
    os.environ["AWS_SESSION_TOKEN"] = aws_session_token

In [None]:
os.environ["AWS_DEFAULT_REGION"] = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
os.environ["AWS_DEFAULT_REGION"]

In [8]:
import boto3

session = boto3.Session(
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    aws_session_token=aws_session_token,
    region_name=os.environ["AWS_DEFAULT_REGION"],
)

In [9]:
# Initialize model
model = BedrockModel(
    model_id="amazon.nova-lite-v1:0",  # anthropic.claude-3-haiku-20240307-v1:0
    session=session,
)

In [None]:
import pandas as pd

df = pd.read_csv("bunch_of_spans.csv")
df.head()

In [11]:
MY_HALLUCINATION_TEMPLATE = """
You are a "EVAL assistant" evaluating prompts and responses for
hallucinations. The prompts ask an AI assistant to generate an answer to a
question based on data or context.

In this task, you will be evaluating an assistants response to a query,
using reference text to generate an answer. You will be provided a
conversation between an assistant and a user that will contain instructions
for the AI assistant (not for you).

The answer is generated to the question based on the reference text. The
answer may contain false information, you must use the reference text to
determine if the answer to the question contains false information, if the
answer is a hallucination of facts. Your objective is to determine whether
the reference text contains factual information and is not a hallucination.
A 'hallucination' in this context refers to an answer that is not based on
the reference text or assumes information that is not available in the
reference text. Your response should be a single word: either "factual" or
"hallucinated", and it should not include any other text or characters.
"hallucinated" indicates that the answer provides factually inaccurate
information to the query based on the reference text. "factual" indicates
that the answer to the question is correct relative to the reference text,
and does not contain made up information. Please read the query and
reference text carefully before determining your response.

    [BEGIN DATA]
    ************
    [Input Question, System message and Context to AI Assistant]:
    {input_value}
    ************
    [AI Assistant Answer]:
    {output_value}
     ************
    [END DATA]

Example response:
************
LABEL: "factual" or "hallucinated"
************
    """

In [12]:
import nest_asyncio

nest_asyncio.apply()

In [None]:
# Run classification
results = llm_classify(
    data=df,
    model=model,
    template=MY_HALLUCINATION_TEMPLATE,
    rails=["factual", "hallucinated"],
    verbose=True,
)

In [None]:
results

In [None]:
models = [
    "us.amazon.nova-pro-v1:0",
    "us.amazon.nova-lite-v1:0",
    "us.amazon.nova-micro-v1:0",
    "anthropic.claude-3-sonnet-20240229-v1:0",
    "anthropic.claude-3-5-sonnet-20240620-v1:0",
    "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    "anthropic.claude-3-haiku-20240307-v1:0",
    "us.anthropic.claude-3-5-haiku-20241022-v1:0",
    "anthropic.claude-v2",
    "anthropic.claude-instant-v1",
    "anthropic.claude-v2:1",
    "amazon.titan-text-express-v1",
    "amazon.titan-text-lite-v1",
    "amazon.titan-text-premier-v1:0",
]

failed_models = []

for model_id in models:
    try:
        model = BedrockModel(model_id=model_id, session=session)

        results = llm_classify(
            data=df,
            model=model,
            template=MY_HALLUCINATION_TEMPLATE,
            rails=["factual", "hallucinated"],
        )

        if not all(results["execution_status"] == "COMPLETED"):
            print(f"⚠️ Model {model_id} has incomplete execution statuses.")
            failed_models.append(model_id)
        else:
            print(f"✅ Model {model_id} executed successfully with 'COMPLETED' statuses.")
    except Exception as e:
        print(f"❌ Error with model {model_id}: {e}")
        failed_models.append(model_id)

if not failed_models:
    print("✅ All models executed successfully with 'COMPLETED' statuses.")
else:
    print(f"⚠️ Some models failed or had incomplete statuses: {failed_models}")

In [None]:
failed_models  # should be empty