# Demo: end-to-end enterprise agent evaluation (Azure AI Agent Service)

This evaluation sample evaluates the enterprise multi-tool agent in https://github.com/Azure-Samples/azure-ai-agent-service-enterprise-demo end to end.

### Getting Started

This sample demonstrates how to evaluate an Azure AI Agent.
Before running the sample, install the required packages:
```bash
pip install azure-ai-projects azure-identity azure-ai-evaluation yfinance python-dotenv
```

### Initializing Project Client

In [None]:
import os
from datetime import datetime as pydatetime
from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient

from dotenv import load_dotenv

# Run python-dotenv first so the os.environ lookups below can see any values it loads.
load_dotenv()

# Build the project client from the connection string held in the environment.
credential = DefaultAzureCredential()
connection_string = os.environ["PROJECT_CONNECTION_STRING"]
project_client = AIProjectClient.from_connection_string(credential=credential, conn_str=connection_string)

# Model deployment and agent name used by the later cells.
MODEL_NAME = os.environ["MODEL_DEPLOYMENT_NAME"]
AGENT_NAME = "my-enterprise-agent-v1"

# System instructions for the agent; the current local date/time is baked in
# at the moment this cell runs.
current_timestamp = pydatetime.now().strftime("%A, %b %d, %Y, %I:%M %p")
INSTRUCTION = "".join(
    [
        "You are a helpful enterprise assistant for Contoso. ",
        f"Today's date is {current_timestamp}. ",
        "You have access to HR documents in file_search, product catalog in ai_search, the grounding engine from bing and ",
        "custom python functions like fetch_weather, fetch_stock_price, etc. ",
        "Provide well-structured, concise, and professional answers.",
    ]
)

# Set up the enterprise toolset (Bing / file search / Azure AI Search / yfinance / custom functions).
# The helper lives in the local set_up_enterprise_toolset module.
from set_up_enterprise_toolset import set_up_enterprise_toolset

enterprise_toolset = set_up_enterprise_toolset(project_client)

### Create Agent or find existing agent 

In [None]:
# Reuse an agent that already carries AGENT_NAME so re-running the notebook
# does not create duplicates; otherwise create a new one.
print("Listing existing agents...")
agents_list = project_client.agents.list_agents().data
print(f"Found {len(agents_list)} existing agents")

# NOTE(review): only the agents returned by this single list_agents() call are
# searched; if the service pages its results, a match on a later page would be
# missed -- confirm against the SDK's paging behaviour.
found_agent = next((candidate for candidate in agents_list if candidate.name == AGENT_NAME), None)

if found_agent:
    print(f"Found existing agent: {found_agent.name} (id: {found_agent.id})")
    print("Updating existing agent...")
    print("found_agent.id = ", found_agent.id)
    try:
        agent = project_client.agents.update_agent(
            agent_id=found_agent.id,
            model=MODEL_NAME,
            # Send the freshly built INSTRUCTION rather than echoing back the
            # agent's stored instructions: INSTRUCTION embeds the current
            # date/time, which would otherwise stay frozen at creation time.
            instructions=INSTRUCTION,
            toolset=enterprise_toolset,
        )
        print("Agent updated successfully")
    except Exception as e:
        # Surface the failure in the cell output, then let it propagate.
        print(f"Error updating agent: {e!s}")
        raise
else:
    print("Creating new agent...")
    try:
        agent = project_client.agents.create_agent(
            model=MODEL_NAME, name=AGENT_NAME, instructions=INSTRUCTION, toolset=enterprise_toolset
        )
        print(f"New agent created with id: {agent.id}")
    except Exception as e:
        # Surface the failure in the cell output, then let it propagate.
        print(f"Error creating agent: {e!s}")
        raise

### Create Thread

In [None]:
# Leave thread_id empty to start a new conversation thread; set it to an
# existing thread ID to skip creation and keep using that thread.
thread_id = ""
if not thread_id:
    thread = project_client.agents.create_thread()
    print(f"Created thread, ID: {thread.id}")
    thread_id = thread.id

## Conversation with Agent
Use the cells below to have a conversation with the agent:
- `Create Message[1]`
- `Execute[2]`

### Create Message[1]

In [None]:
# Post a single user message to the thread; edit MESSAGE to ask something else.
MESSAGE = "The best tent available"

message = project_client.agents.create_message(thread_id=thread_id, role="user", content=MESSAGE)
print(f"Created message, ID: {message.id}")

### Execute[2]

In [None]:
# Run the agent against the thread, then report the outcome.
run = project_client.agents.create_and_process_run(
    thread_id=thread_id,
    agent_id=agent.id,
)
print(f"Run finished with status: {run.status}")

# A failed run is only reported, not raised; the run ID is printed either way.
if run.status == "failed":
    print(f"Run failed: {run.last_error}")
print(f"Run ID: {run.id}")

### (Optional) Batch Execute on a list of user queries

In [None]:
# Imported once at the top of the cell instead of on every loop iteration.
import time

# Sample user queries, grouped by the agent capability each one is meant to exercise.
questions = [
    # HR Policy & Document Search (RAG) Questions
    "How many vacation days do employees get per year?",
    # Weather-Related Questions (Function Calling)
    "Will it rain in New York this weekend?",
    # Stock Market & Financial Questions
    "How is Microsoft's stock performing today?",
    # Bing Search Integration Questions
    "What are the latest developments in AI technology?",
    # Edge Cases and Error Handling
    "Can you access classified company documents?",
    # Combined Capability Questions
    "Based on the weather forecast and stock market performance, should we have our team meeting outside?",
]

# Send each query on the same thread and run the agent once per query.
for query in questions:
    message = project_client.agents.create_message(
        thread_id=thread_id,
        role="user",
        content=query,
    )
    print(f"Created message, ID: {message.id}")

    run = project_client.agents.create_and_process_run(thread_id=thread_id, agent_id=agent.id)

    print(f"Run finished with status: {run.status}")

    # A failed run is reported but does not stop the batch.
    if run.status == "failed":
        print(f"Run failed: {run.last_error}")

    print(f"Run ID: {run.id}")

    # Brief pause before submitting the next query.
    time.sleep(0.1)

### List Agent Messages

In [None]:
# Print every message on the thread in ascending order, as returned by list_messages().
for message in project_client.agents.list_messages(thread_id, order="asc").data:
    print(f"Role: {message.role}")
    # Indexing content[0].text directly raises when a message has no content
    # parts or when its first part is not text, so pick the first part that
    # actually carries text and fall back to a placeholder otherwise.
    text_parts = [part.text.value for part in message.content or [] if getattr(part, "text", None) is not None]
    content_text = text_parts[0] if text_parts else "<no text content>"
    print(f"Content: {content_text}")
    print("-" * 40)

# Evaluate the agent thread data

### Setting up evaluator

- Intent Resolution: Measures how well the agent identifies the user’s request, including how well it scopes the user’s intent, asks clarifying questions, and reminds end users of its scope of capabilities.
- Tool Call Accuracy: Evaluates the agent’s ability to select the appropriate tools, and process correct parameters from previous steps.
- Task Adherence: Measures how well the agent’s response adheres to its assigned tasks, according to its system message and prior steps.


In [None]:
from pprint import pprint

from azure.ai.evaluation import IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator
from azure.ai.projects.models import ConnectionType

# Derive the evaluators' model configuration from the project's default
# Azure OpenAI connection (credentials included).
default_connection = project_client.connections.get_default(
    connection_type=ConnectionType.AZURE_OPEN_AI,
    include_credentials=True,
)
model_config = default_connection.to_evaluator_model_config(
    deployment_name=os.environ["MODEL_DEPLOYMENT_NAME"],
    api_version=os.environ["MODEL_DEPLOYMENT_API_VERSION"],
    include_credentials=True,
)

# All three evaluators share the same model configuration:
# - intent_resolution: how well the agent identifies the user's request, including how well it scopes
#   the user's intent, asks clarifying questions, and reminds end users of its scope of capabilities.
# - tool_call_accuracy: the agent's ability to select the appropriate tools and process correct
#   parameters from previous steps.
# - task_adherence: how well the agent's response adheres to its assigned tasks, according to its
#   system message and prior steps.
intent_resolution, tool_call_accuracy, task_adherence = (
    evaluator_cls(model_config=model_config)
    for evaluator_cls in (IntentResolutionEvaluator, ToolCallAccuracyEvaluator, TaskAdherenceEvaluator)
)

### Run Batch Evaluation on agent thread data (multiple agent runs)

In [None]:
from azure.ai.evaluation import AIAgentConverter

# File in the current working directory that receives the agent output,
# which is the input data for the evaluation step.
filename = os.path.join(os.getcwd(), "evaluation_input_data.jsonl")

# The converter is backed by the project client; hand it the thread to
# convert and the output file path.
converter = AIAgentConverter(project_client)
evaluation_data = converter.prepare_evaluation_data(
    thread_ids=thread_id,
    filename=filename,
)

print(f"Evaluation data saved to {filename}")

In [None]:
from azure.ai.evaluation import evaluate

# Run the three evaluators over the saved evaluation input file. The project
# coordinates passed as azure_ai_project are read from the environment.
response = evaluate(
    data=filename,
    evaluation_name="agent eval demo - batch run",
    evaluators={
        "intent_resolution": intent_resolution,
        "tool_call_accuracy": tool_call_accuracy,
        "task_adherence": task_adherence,
    },
    azure_ai_project={
        "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"],
        "project_name": os.environ["PROJECT_NAME"],
        "resource_group_name": os.environ["RESOURCE_GROUP_NAME"],
    },
)
# Use print rather than pprint: pprint on a str emits its quoted repr, and
# print does not rely on an import made in another cell. The label typo
# ("Foundary") is corrected as well.
print(f'AI Foundry URL: {response.get("studio_url")}')

### Inspect batch run results in memory

In [None]:
# Display the full result object returned by evaluate() as the cell output.
response

In [None]:
# Overall average evaluation metrics: the "metrics" entry of the evaluate() result,
# displayed as the cell output.
response["metrics"]