In [None]:
!pip3 install agent-eval crewai duckduckgo-search openlit langchain_aws


In [1]:
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter


# Create an in-memory span exporter
memory_exporter = InMemorySpanExporter()
span_processor = SimpleSpanProcessor(memory_exporter)

# Set up the tracer provider and add the span processor
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(span_processor)
trace.set_tracer_provider(tracer_provider)

In [2]:
import openlit

# Initialize OpenLit - this will automatically instrument CrewAI when it's imported
openlit.init()

Overriding of current TracerProvider is not allowed
Overriding of current TracerProvider is not allowed


{
    "resource_metrics": [
        {
            "resource": {
                "attributes": {
                    "telemetry.sdk.language": "python",
                    "telemetry.sdk.name": "openlit",
                    "telemetry.sdk.version": "1.34.0",
                    "service.name": "default",
                    "deployment.environment": "default"
                },
                "schema_url": ""
            },
            "scope_metrics": [
                {
                    "scope": {
                        "name": "openlit.otel.metrics",
                        "version": "0.1.0",
                        "schema_url": "",
                        "attributes": null
                    },
                    "metrics": [
                        {
                            "name": "gen_ai.client.token.usage",
                            "description": "Measures number of input and output tokens used",
                            "unit": "{token}",
                 

In [3]:
from crewai import LLM

reasoning_llm = LLM(
    # model="sagemaker/INSERT ENDPOINT NAME",
    model="bedrock/us.amazon.nova-pro-v1:0",
    temperature=0.7, max_tokens=4*1024,
)

function_calling_llm = LLM(
    model="bedrock/invoke/meta.llama3-1-70b-instruct-v1:0",
    temperature=0, max_gen_len=5*1024
)

In [6]:
from crewai.tools import tool
from duckduckgo_search import DDGS
from textwrap import dedent
import requests


@tool('DuckDuckGoSearch')
def search_tool(search_query: str):
    """Search the web for information on a given topic"""
    return DDGS().text(search_query, max_results=5)

@tool("WeatherTool")
def weather_tool(location: str) -> str:
    """
    Gets current weather for a location using the wttr.in service (no API key required).
    Example input: "Berlin" or "New York"
    """
    try:
        url = f"https://wttr.in/{location}"
        params = {
            "format": "3"  # Short one-line weather summary
        }
        response = requests.get(url, params=params)
        if response.status_code == 200:
            return response.text.strip()
        else:
            return f"Error fetching weather: HTTP {response.status_code}"
    except Exception as e:
        return f"Error: {str(e)}"


from crewai import Agent, Task, Crew, Process

researcher_agent = Agent(
    role="Travel Researcher",
    goal="Research interesting activities and attractions for a given location using the web search tool once.",
    backstory=dedent(
        """You are an experienced travel researcher with a knack for 
        discovering both popular attractions and hidden gems in any 
        location. Your expertise lies in running one web search for 
        comprehensive information about various activities, their 
        historical significance, and practical details for visitors.
        Use the web search tool to find the information you need.
        """
    ),
    tools=[search_tool],
    allow_delegation=False,
    verbose=True,
    llm=function_calling_llm,
    max_iter=4
)

research_task = Task(
    description=dedent(
        """You have access to a web search tool to research and compile 
        a list of at least 5 interesting activities and attractions in 
        {location}. Include a mix of popular tourist spots and lesser-known 
        local favorites. For each item, provide:
        1. Name of the attraction/activity
        2. Brief description (2-3 sentences)
        3. Why it's worth visiting
        Your final answer should be a structured list of these items.
        """
    ),
    agent=researcher_agent,
    expected_output="Structured list of 5+ attractions/activities",
)

content_writer_agent = Agent(
    role="Travel Content Writer",
    goal="Create engaging and informative content for the top 5 listicle",
    backstory=dedent(
        """You are a skilled travel writer with a flair for creating 
        captivating content. Your writing style is engaging, 
        informative, and tailored to inspire readers to explore new 
        destinations. You excel at crafting concise yet compelling 
        descriptions of attractions and activities."""
    ),
    allow_delegation=False,
    verbose=True,
    llm=reasoning_llm,
    max_iter=4
)

write_task = Task(
    description=dedent(
        """Create an engaging top 5 listicle article about things to 
        do in {location}. Use the research provided to:
        1. Write a catchy title and introduction (100-150 words)
        2. Select and write about the top 5 activities/attractions
        3. For each item, write 2-3 paragraphs (100-150 words total)
        4. Include a brief conclusion (50-75 words)

        Ensure the content is engaging, informative, and inspiring. 
        Your final answer should be the complete listicle article.
        """
    ),
    agent=content_writer_agent,
    expected_output="Complete top 5 listicle article",
)

editor_agent = Agent(
    role="Content Editor",
    goal="Ensure the listicle is well-structured, engaging, and error-free",
    backstory=dedent(
        """You are a meticulous editor with years of experience in 
        travel content. Your keen eye for detail helps polish articles 
        to perfection. You focus on improving flow, maintaining 
        consistency, and enhancing the overall readability of the 
        content while ensuring it appeals to the target audience."""
    ),
    allow_delegation=False,
    verbose=True,
    llm=reasoning_llm,
    max_iter=4
)

edit_task = Task(
    description=dedent(
        """Review and edit the top 5 listicle article about things to 
        do in {location}. Focus on:
        1. Improving the overall structure and flow
        2. Enhancing the engagement factor of the content
        3. Ensuring consistency in tone and style
        4. Correcting any grammatical or spelling errors

        Do not change the content itself. Only edit it for higher quality.
        Your final answer should be the polished, publication-ready 
        version of the article.
        """
    ),
    agent=editor_agent,
    expected_output="Edited and polished listicle article about {location}",
)

crew = Crew(
    agents=[researcher_agent, content_writer_agent, editor_agent],
    tasks=[research_task, write_task, edit_task],
    process=Process.sequential,
    verbose=False,
)

inputs = {
    'location': 'Hoenderloo, Netherlands'
}
listicle_result = crew.kickoff(inputs=inputs)
print(listicle_result)



{
    "name": "Crew Created",
    "context": {
        "trace_id": "0x681edfa35ccdc3a534681d59124d930b",
        "span_id": "0x93e8af7a9e0f59ee",
        "trace_state": "[]"
    },
    "kind": "SpanKind.INTERNAL",
    "parent_id": null,
    "start_time": "2025-06-08T22:12:01.478860Z",
    "end_time": "2025-06-08T22:12:01.480610Z",
    "status": {
        "status_code": "OK"
    },
    "attributes": {
        "crewai_version": "0.126.0",
        "python_version": "3.13.0",
        "crew_key": "a9d4d8625e609c1c9610caa0596597d0",
        "crew_id": "0c084d7a-eb33-4b37-b376-7c559915e937",
        "crew_process": "sequential",
        "crew_memory": false,
        "crew_number_of_tasks": 3,
        "crew_number_of_agents": 3,
        "crew_fingerprint": "ac1daf8a-4772-458a-a300-dd713b47e4de",
        "crew_fingerprint_created_at": "2025-06-08T15:12:01.476866",
        "crew_agents": "[{\"key\": \"9800678da87c55393d183e81cc197864\", \"id\": \"3614269c-4052-4f13-836b-eb88dfa04cf1\", \"role\":

In [7]:
spans = memory_exporter.get_finished_spans()

spans

(<opentelemetry.sdk.trace.ReadableSpan at 0x135e4ee40>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x1375b4f50>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x151463110>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x1375b1cd0>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x1375b1f30>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x1244aee70>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x151215ae0>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x151215f20>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x15330fa50>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x1566ade50>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x155944500>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x15335af30>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x15333baf0>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x1511d4050>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x155958530>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x151224950>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x151224d10>,
 <opentelemetry.sdk.trace.ReadableSpan at 0x1512

In [9]:
from agent_eval import Evaluator, TraceConverter

def create_trajectory(spans):
    converter = TraceConverter()
    trajectory = converter.from_spans(spans)
    return trajectory


trajectory = create_trajectory(spans)
trajectory

Trajectory(trace_id='db038279a13fb5719afab2e91b126993', messages=[Message(role='user', content="You have access to a web search tool to research and compile \n        a list of at least 5 interesting activities and attractions in \n        Hoenderloo, Netherlands. Include a mix of popular tourist spots and lesser-known \n        local favorites. For each item, provide:\n        1. Name of the attraction/activity\n        2. Brief description (2-3 sentences)\n        3. Why it's worth visiting\n        Your final answer should be a structured list of these items.", timestamp=datetime.datetime(2025, 6, 8, 15, 12, 1, 484715), tool_calls=[]), Message(role='assistant', content="1. **Hoge Veluwe National Park**: A national park located near Hoenderloo, known for its diverse landscapes, including heathlands, sand drifts, and woodlands. Visitors can explore the park on foot or by bike, and there are also several museums and exhibitions within the park. It's worth visiting for its unique landsc

In [8]:
from langchain_aws import ChatBedrockConverse
from ragas.llms import LangchainLLMWrapper

bedrock_model = ChatBedrockConverse(
    region_name="us-east-1",
    endpoint_url=f"https://bedrock-runtime.us-east-1.amazonaws.com",
    model_id="us.amazon.nova-micro-v1:0"
)

llm = LangchainLLMWrapper(bedrock_model)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Set up the evaluator with multiple metrics
from agent_eval import Evaluator, TraceConverter
from agent_eval.metrics import AgentGoalAccuracyMetric, ToolCallAccuracyMetric, TrajectoryEvalWithoutLLMMetric, TrajectoryEvalWithLLMMetric
from agent_eval.metrics.base import MetricConfig

evaluator = Evaluator([TrajectoryEvalWithLLMMetric(llm = bedrock_model)])

# Evaluate trajectory
results_langchain = await evaluator.evaluate(trajectory)

Invalid type NoneType for attribute 'gen_ai.completion' value. Expected one of ['bool', 'str', 'bytes', 'int', 'float'] or a sequence of those types


{'user': {'role': 'user', 'content': '{\'text\': "You are an expert data labeler.\\nYour task is to grade the accuracy of an AI agent\'s internal trajectory.\\n\\n<Rubric>\\n  An accurate trajectory:\\n  - Makes logical sense between steps\\n  - Shows clear progression\\n  - Is relatively efficient, though it does not need to be perfectly efficient\\n</Rubric>\\n\\nFirst, try to understand the goal of the trajectory by looking at the input\\n(if the input is not present try to infer it from the content of the first message),\\nas well as the output of the final message. Once you understand the goal, grade the trajectory\\nas it relates to achieving that goal.\\n\\nGrade the following trajectory:\\n\\n<trajectory>\\n<user>\\nYou have access to a web search tool to research and compile \\n        a list of at least 5 interesting activities and attractions in \\n        Hoenderloo, Netherlands. Include a mix of popular tourist spots and lesser-known \\n        local favorites. For each it

Invalid type dict in attribute 'gen_ai.completion' value sequence. Expected one of ['bool', 'str', 'bytes', 'int', 'float'] or None


{
    "name": "chat us.amazon.nova-micro-v1:0",
    "context": {
        "trace_id": "0x5e8ed1188cc671ae9369674fd5fe1869",
        "span_id": "0x37f8d78767168b6a",
        "trace_state": "[]"
    },
    "kind": "SpanKind.CLIENT",
    "parent_id": null,
    "start_time": "2025-06-08T22:17:09.414365Z",
    "end_time": "2025-06-08T22:17:10.521250Z",
    "status": {
        "status_code": "OK"
    },
    "attributes": {
        "telemetry.sdk.name": "openlit",
        "gen_ai.operation.name": "chat",
        "gen_ai.system": "langchain",
        "gen_ai.request.model": "us.amazon.nova-micro-v1:0",
        "gen_ai.response.model": "us.amazon.nova-micro-v1:0",
        "gen_ai.request.temperature": "None",
        "gen_ai.request.top_k": "1",
        "gen_ai.request.top_p": "None",
        "gen_ai.usage.input_tokens": 1471,
        "gen_ai.usage.output_tokens": 985,
        "server.address": "NOT_FOUND",
        "server.port": "NOT_FOUND",
        "deployment.environment": "default",
        

In [11]:
results_langchain.scores

[MetricResult(name='trajectory_eval_with_llm', score=1.0, details={'comment': "The assistant's trajectory logically progresses from introducing a national park to a museum, a village, a wildlife park, and finally another museum, all within the Hoge Veluwe National Park area. Each step builds upon the previous one, showing clear progression in terms of different types of attractions (natural, cultural, historical) and provides context for why each is worth visiting. The trajectory is relatively efficient as it covers a mix of popular and lesser-known attractions, fulfilling the goal of the task.", 'has_reference': False, 'raw_score': True})]

In [None]:
# Set up the evaluator with multiple metrics
from agent_eval import Evaluator, TraceConverter
from agent_eval.metrics import AgentGoalAccuracyMetric, ToolCallAccuracyMetric, TrajectoryEvalWithoutLLMMetric, TrajectoryEvalWithLLMMetric
from agent_eval.metrics.base import MetricConfig
import pandas as pd
import plotly.express as px

def create_trajectory(spans):
    converter = TraceConverter()
    trajectory = converter.from_spans(spans)
    return trajectory


trajectory = create_trajectory(spans)
trajectory

evaluator = Evaluator([
    AgentGoalAccuracyMetric(llm=llm, config=MetricConfig(
        metric_params={
            "reference_answer": "Triangles whisper,Angles dance in the night,Trigonometry."  # optional
        }
    )),
    ToolCallAccuracyMetric(config=MetricConfig(
        metric_params={
            "reference_tool_calls": {""}
        }
    ))
])

# Evaluate trajectory
results = await evaluator.evaluate(trajectory)