In [None]:
!pip3 install agent-eval crewai plotly opentelemetry-api opentelemetry-sdk pandas ragas duckduckgo-search openlit langchain_aws


In [1]:
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter


# Create an in-memory span exporter
memory_exporter = InMemorySpanExporter()
span_processor = SimpleSpanProcessor(memory_exporter)

# Set up the tracer provider and add the span processor
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(span_processor)
trace.set_tracer_provider(tracer_provider)

In [2]:
import openlit

# Initialize OpenLit - this will automatically instrument CrewAI when it's imported
openlit.init()

Overriding of current TracerProvider is not allowed
Overriding of current TracerProvider is not allowed


{
    "resource_metrics": [
        {
            "resource": {
                "attributes": {
                    "telemetry.sdk.language": "python",
                    "telemetry.sdk.name": "openlit",
                    "telemetry.sdk.version": "1.34.0",
                    "service.name": "default",
                    "deployment.environment": "default"
                },
                "schema_url": ""
            },
            "scope_metrics": [
                {
                    "scope": {
                        "name": "openlit.otel.metrics",
                        "version": "0.1.0",
                        "schema_url": "",
                        "attributes": null
                    },
                    "metrics": [
                        {
                            "name": "gen_ai.client.token.usage",
                            "description": "Measures number of input and output tokens used",
                            "unit": "{token}",
                 

In [3]:
from crewai.tools import tool
from duckduckgo_search import DDGS

@tool('DuckDuckGoSearch')
def search_tool(search_query: str):
    """Search the web for information on a given topic"""
    return DDGS().text(search_query, max_results=5)

@tool('SalesforceIntegration')
def salesforce_tool(soql_query: str):
    """Call Salesforce API to get data"""
    return "Salesforce Integration"

from crewai import LLM

model = LLM(
    # model="sagemaker/INSERT ENDPOINT NAME",
    model="bedrock/us.amazon.nova-pro-v1:0",
    temperature=0.7, max_tokens=4*1024,
)

from crewai import Agent, Task, Crew


writer = Agent(
        role="Writer",
        goal="You make math engaging and understandable for young children through poetry",
        backstory="You're an expert in writing haikus but you know nothing of math.",
        tools=[search_tool],
        llm=model
    )

task = Task(description=("What is {topic}?"),
            expected_output=("Compose a short poem that includes the answer."),
            agent=writer)

crew = Crew(
  agents=[writer],
  tasks=[task],
  share_crew=False
)

result = crew.kickoff({"topic": "Trignometry"})
print(result)



{
    "name": "Crew Created",
    "context": {
        "trace_id": "0x35e4ff59a831ae39af93a9555b8f3d89",
        "span_id": "0x41f3d27c833bff5b",
        "trace_state": "[]"
    },
    "kind": "SpanKind.INTERNAL",
    "parent_id": null,
    "start_time": "2025-06-08T21:38:02.560047Z",
    "end_time": "2025-06-08T21:38:02.560928Z",
    "status": {
        "status_code": "OK"
    },
    "attributes": {
        "crewai_version": "0.126.0",
        "python_version": "3.13.0",
        "crew_key": "d56be7d496021dd9ad8d140b7267c207",
        "crew_id": "028b7000-c9c0-4394-bc11-da76390052a8",
        "crew_process": "sequential",
        "crew_memory": false,
        "crew_number_of_tasks": 1,
        "crew_number_of_agents": 1,
        "crew_fingerprint": "03b24118-b594-4cc5-a8c4-770e589ca69a",
        "crew_fingerprint_created_at": "2025-06-08T14:38:02.558916",
        "crew_agents": "[{\"key\": \"de2feaf934a966ef93a186c418b6f467\", \"id\": \"d14866da-58a3-491a-b1bb-4d1cf45c2ab9\", \"role\":

In [4]:
spans = memory_exporter.get_finished_spans()

spans

(<opentelemetry.sdk.trace.ReadableSpan at 0x11ef4ea50>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x11f56ce10>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x11ff54190>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x11ff1ab10>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x11ff1a8b0>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x11ff58710>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x11ff2d260>)

In [5]:
from langchain_aws import ChatBedrockConverse
from ragas.llms import LangchainLLMWrapper

bedrock_model = ChatBedrockConverse(
    region_name="us-east-1",
    endpoint_url=f"https://bedrock-runtime.us-east-1.amazonaws.com",
    model_id="us.amazon.nova-micro-v1:0"
)

llm = LangchainLLMWrapper(bedrock_model)



  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Set up the evaluator with multiple metrics
from agent_eval import Evaluator, TraceConverter
from agent_eval.metrics import AgentGoalAccuracyMetric, ToolCallAccuracyMetric, TrajectoryEvalWithoutLLMMetric, TrajectoryEvalWithLLMMetric
from agent_eval.metrics.base import MetricConfig

def create_trajectory(spans):
    converter = TraceConverter()
    trajectory = converter.from_spans(spans)
    return trajectory


trajectory = create_trajectory(spans)
trajectory

evaluator = Evaluator([
    AgentGoalAccuracyMetric(llm=llm, config=MetricConfig(
        metric_params={
            "reference_answer": "Triangles whisper,Angles dance in the night,Trigonometry."  # optional
        }
    )),
    ToolCallAccuracyMetric(config=MetricConfig(
        metric_params={
            "reference_tool_calls": {""}
        }
    ))
])

# Evaluate trajectory
results = await evaluator.evaluate(trajectory)

{'user': {'role': 'user', 'content': '{\'text\': \'Given an agentic workflow comprised of Human, AI and Tools, identify the user_goal (the task or objective the user wants to achieve) and the end_state (the final outcome or result of the workflow).\\nPlease return the output in a JSON format that complies with the following schema as specified in JSON Schema:\\n{"properties": {"user_goal": {"description": "The task or objective the user wants to achieve.", "title": "User Goal", "type": "string"}, "end_state": {"description": "The final outcome or result of the workflow.", "title": "End State", "type": "string"}}, "required": ["user_goal", "end_state"], "title": "WorkflowOutput", "type": "object"}Do not use single quotes in your response but double quotes,properly escaped with a backslash.\\n\\n--------EXAMPLES-----------\\nExample 1\\nInput: {\\n    "workflow": "\\\\n            Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\\\\n            AI: Sure, let me 

In [14]:
evaluator = Evaluator([TrajectoryEvalWithLLMMetric(llm = bedrock_model,config=MetricConfig(
        metric_params={
            "reference_outputs": [
    {"role": "user", "content": "What is Amazon Bedrock? Compose a short poem that includes the answer."},
    {
        "role": "assistant",
        "content": "To compose a poem about Amazon Bedrock, I first need to gather information about what Amazon Bedrock is. I will use the available tool to search for this information.",
        "tool_calls": [
            {
                "function": {
                    "name": "Search the internet with Serper",
                    "arguments": "{\"search_query\": \"Amazon Bedrock\"}"
                }
            }
        ]
    },
    {"role": "tool", "content": "{\"searchParameters\": {\"q\": \"Amazon Bedrock\", \"type\": \"search\", \"num\": 5, \"engine\": \"google\"}, \"organic\": [{\"title\": \"Amazon Bedrock - Generative AI - AWS\", \"link\": \"https://aws.amazon.com/bedrock/\", \"snippet\": \"Amazon Bedrock Data Automation streamlines the generation of valuable insights from unstructured multimodal content such as documents, images, audio, and videos ...\", \"position\": 1, \"sitelinks\": [{\"title\": \"Amazon Bedrock\", \"link\": \"https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html\"}, {\"title\": \"Amazon Bedrock Pricing\", \"link\": \"https://aws.amazon.com/bedrock/pricing/\"}, {\"title\": \"Amazon Bedrock Documentation\", \"link\": \"https://docs.aws.amazon.com/bedrock/\"}, {\"title\": \"Amazon Bedrock FAQs\", \"link\": \"https://aws.amazon.com/bedrock/faqs/\"}, {\"title\": \"Amazon Bedrock Agents\", \"link\": \"https://aws.amazon.com/bedrock/agents/\"}]}, {\"title\": \"Getting Started with Amazon Bedrock - AWS\", \"link\": \"https://aws.amazon.com/awstv/watch/6ff4cd6fa97/\", \"snippet\": \"So check the region that you're currently in, make sure it's a region that's supported by Bedrock. Then I'm gonna scroll to the bottom of this ...\", \"position\": 2}], \"relatedSearches\": [{\"query\": \"Amazon Bedrock pricing\"}, {\"query\": \"Amazon Bedrock documentation\"}, {\"query\": \"Amazon Bedrock Claude\"}, {\"query\": \"Amazon Bedrock logo\"}, {\"query\": \"Amazon Bedrock DeepSeek\"}], \"credits\": 1}"},
    {"role": "assistant", "content": "Based on the observation, I have learned that **Amazon Bedrock** is a service by AWS that streamlines the generation of valuable insights from unstructured multimodal content. Here's a haiku about it:\n\nIn cloud's embrace,\nBedrock whispers secrets,\nData's gentle art."}
]
        }
    ))])

# Evaluate trajectory
results_langchain = await evaluator.evaluate(trajectory)


Invalid type NoneType for attribute 'gen_ai.completion' value. Expected one of ['bool', 'str', 'bytes', 'int', 'float'] or a sequence of those types


{'user': {'role': 'user', 'content': '{\'text\': \'You are an expert data labeler.\\nYour task is to grade the accuracy of an AI agent\\\'s internal trajectory.\\n\\n<Rubric>\\n  An accurate trajectory:\\n  - Makes logical sense between steps\\n  - Shows clear progression\\n  - Is relatively efficient, though it does not need to be perfectly efficient\\n  - Is semantically equivalent to the provided reference trajectory\\n</Rubric>\\n\\nBased on the following reference trajectory:\\n\\n<reference_trajectory>\\n<user>\\nWhat is Amazon Bedrock? Compose a short poem that includes the answer.\\n</user>\\n\\n<assistant>\\nTo compose a poem about Amazon Bedrock, I first need to gather information about what Amazon Bedrock is. I will use the available tool to search for this information.\\n<tool_call>\\n<name>Search the internet with Serper</name>\\n<arguments>{"search_query": "Amazon Bedrock"}</arguments>\\n</tool_call>\\n</assistant>\\n\\n<tool>\\n{"searchParameters": {"q": "Amazon Bedrock"

Invalid type dict in attribute 'gen_ai.completion' value sequence. Expected one of ['bool', 'str', 'bytes', 'int', 'float'] or None


{
    "name": "chat us.amazon.nova-micro-v1:0",
    "context": {
        "trace_id": "0xa40a9d53fb0cbaad5897a191f962bb7e",
        "span_id": "0xb0764d6fa57f6402",
        "trace_state": "[]"
    },
    "kind": "SpanKind.CLIENT",
    "parent_id": null,
    "start_time": "2025-06-08T21:45:10.563886Z",
    "end_time": "2025-06-08T21:45:11.533217Z",
    "status": {
        "status_code": "OK"
    },
    "attributes": {
        "telemetry.sdk.name": "openlit",
        "gen_ai.operation.name": "chat",
        "gen_ai.system": "langchain",
        "gen_ai.request.model": "us.amazon.nova-micro-v1:0",
        "gen_ai.response.model": "us.amazon.nova-micro-v1:0",
        "gen_ai.request.temperature": "1e-08",
        "gen_ai.request.top_k": "1",
        "gen_ai.request.top_p": "None",
        "gen_ai.usage.input_tokens": 2810,
        "gen_ai.usage.output_tokens": 994,
        "server.address": "NOT_FOUND",
        "server.port": "NOT_FOUND",
        "deployment.environment": "default",
       

In [8]:
results.scores

[MetricResult(name='agent_goal_accuracy', score=0.0, details={'error': 'Failed to evaluate interaction'}),
 MetricResult(name='tool_call_accuracy', score=1.0, details={'evaluation_type': 'tool_call_accuracy'})]

In [15]:
results_langchain.scores


[MetricResult(name='trajectory_eval_with_llm', score=1.0, details={'comment': "The trajectory logically follows the user's request. It starts by acknowledging the need to understand trigonometry to compose a poem, then uses a search tool to find a definition. The poem itself references the definition and captures the essence of trigonometry in a poetic manner, focusing on triangles, angles, and trigonometric functions. The progression from understanding to composing the poem is clear and the output is semantically equivalent to a reference trajectory about Amazon Bedrock.", 'has_reference': True, 'raw_score': True})]