In [None]:
!uv pip install pandas python-dotenv 

In [None]:
import time
import os
import uuid
from contextlib import contextmanager
from dotenv import load_dotenv
import pandas as pd

# Load environment variables
load_dotenv()

# Benchmarking functions
@contextmanager
def timer():
    """Context manager to time execution"""
    start_time = time.time()
    yield
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time:.2f} seconds")

# Configuration
NUM_TRACES = 20

# Hardcoded trace data for consistent benchmarking
HARDCODED_TRACES = [
    {
        "trace_id": f"trace_{i}",
        "user_id": f"user_{(i % 10) + 1}",
        "session_id": f"session_{(i % 5) + 1}",
        "tool_call": {
            "input_size": 100 + (i * 50),
            "duration": 0.005 + (i * 0.001)
        },
        "llm_call": {
            "provider": "openai",
            "model": "gpt-4",
            "temperature": 0.7,
            "input": f"Analyze the data for request {i}",
            "output": f"Analysis complete for request {i}: The result shows pattern {i % 3}",
            "prompt_tokens": 20 + (i % 30),
            "completion_tokens": 50 + (i % 80),
            "total_tokens": 70 + (i % 110),
            "duration": 0.1 + (i * 0.01)
        },
        "result": {
            "output_format": "json",
            "duration": 0.002 + (i * 0.0005)
        }
    }
    for i in range(NUM_TRACES) 
]

def run_benchmark(platform_name, log_function, num_traces=NUM_TRACES):
    print(f"Logging {num_traces} traces to {platform_name}...")
    
    with timer():
        for i in range(num_traces):
            trace_data = HARDCODED_TRACES[i]
            log_function(trace_data)
            
            if (i + 1) % 10 == 0:
                print(f"Logged {i + 1} traces...")
    
    print(f"Successfully logged {num_traces} traces to {platform_name}!")

# Arize

In [None]:
!uv pip install opentelemetry-exporter-otlp opentelemetry-api openinference-semantic-conventions opentelemetry-sdk arize-otel

In [2]:
from arize.otel import register
from opentelemetry import trace

# Configuration for span logging
ARIZE_SPACE_ID = os.getenv("ARIZE_SPACE_ID")
ARIZE_API_KEY = os.getenv("ARIZE_API_KEY")


# Setup OTel via our convenience function
tracer_provider = register(
    space_id = ARIZE_SPACE_ID, # in app space settings page
    api_key = ARIZE_API_KEY, # in app space settings page
    project_name = "ADB-Benchmarks", # name this to whatever you would like
)

trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

Overriding of current TracerProvider is not allowed


🔭 OpenTelemetry Tracing Details 🔭
|  Arize Project: ADB-Benchmarks
|  Span Processor: BatchSpanProcessor
|  Collector Endpoint: otlp.arize.com
|  Transport: gRPC
|  Transport Headers: {'authorization': '****', 'api_key': '****', 'arize-space-id': '****', 'space_id': '****', 'arize-interface': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [None]:
def log_trace_arize(trace_data):
    """Log a single trace using Arize"""
    with tracer.start_as_current_span(trace_data["trace_id"]) as root_span:
        # Set trace-level attributes
        root_span.set_attribute("user.id", trace_data["user_id"])
        root_span.set_attribute("session.id", trace_data["session_id"])
        
        # Tool call span
        with tracer.start_as_current_span("tool_call") as tool_span:
            tool_span.set_attribute("operation.type", "tool_call")
            tool_span.set_attribute("input.size", trace_data["tool_call"]["input_size"])
            tool_span.add_event("tool_call_started")
        
        # LLM call span
        with tracer.start_as_current_span("llm_call") as llm_span:
            llm_data = trace_data["llm_call"]
            llm_span.set_attribute("llm.provider", llm_data["provider"])
            llm_span.set_attribute("llm.model_name", llm_data["model"])
            llm_span.set_attribute("llm.temperature", llm_data["temperature"])
            llm_span.set_attribute("input.value", llm_data["input"])
            llm_span.set_attribute("output.value", llm_data["output"])
            llm_span.set_attribute("llm.token_count.prompt", llm_data["prompt_tokens"])
            llm_span.set_attribute("llm.token_count.completion", llm_data["completion_tokens"])
            llm_span.set_attribute("llm.token_count.total", llm_data["total_tokens"])
        
        # Result span
        with tracer.start_as_current_span("result") as result_span:
            result_span.set_attribute("operation.type", "result")
            result_span.set_attribute("output.format", trace_data["result"]["output_format"])

# Run the Arize benchmark
run_benchmark("Arize", log_trace_arize)

Logging 20 traces to Arize...
Logged 10 traces...
Logged 20 traces...
Execution time: 0.00 seconds
Successfully logged 20 traces to Arize!


# Langfuse

In [None]:
!uv pip install "langfuse<3.0.0"

In [None]:
from langfuse import Langfuse

# Initialize Langfuse client
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
LANGFUSE_HOST = "https://us.cloud.langfuse.com"

langfuse = Langfuse(
    public_key=LANGFUSE_PUBLIC_KEY,
    secret_key=LANGFUSE_SECRET_KEY,
    host=LANGFUSE_HOST
)

In [None]:
def log_trace_langfuse(trace_data):
    """Log a single trace to Langfuse using hardcoded data"""
    # Create trace using context manager
    with langfuse.start_as_current_span(
        name=trace_data["trace_id"],
        input={"trace_id": trace_data["trace_id"]}
    ) as trace:
        
        # Set trace-level attributes
        trace.update_trace(
            user_id=trace_data["user_id"],
            session_id=trace_data["session_id"]
        )
        
        # Tool call span
        with langfuse.start_as_current_span(
            name="tool_call",
            input={"size": trace_data["tool_call"]["input_size"]}
        ) as tool_span:
            tool_span.update(
                metadata={
                    "operation.type": "tool_call",
                    "input.size": trace_data["tool_call"]["input_size"]
                }
            )
            
            # Simulate tool call duration
            time.sleep(trace_data["tool_call"]["duration"])
        
        # LLM Call
        llm_data = trace_data["llm_call"]
        with langfuse.start_as_current_generation(
            name="llm_call",
            model=llm_data["model"],
            model_parameters={
                "temperature": llm_data["temperature"]
            },
            input=llm_data["input"]
        ) as generation:
            
            # Simulate LLM call duration
            time.sleep(llm_data["duration"])
            
            generation.update(
                output=llm_data["output"],
                usage_details={
                    "input_tokens": llm_data["prompt_tokens"],
                    "output_tokens": llm_data["completion_tokens"],
                    "total_tokens": llm_data["total_tokens"]
                },
                metadata={
                    "provider": llm_data["provider"]
                }
            )
        
        # Result span
        with langfuse.start_as_current_span(
            name="result",
            output={"format": trace_data["result"]["output_format"]}
        ) as result_span:
            result_span.update(
                metadata={
                    "operation.type": "result",
                    "output.format": trace_data["result"]["output_format"]
                }
            )
            
            # Simulate result processing duration
            time.sleep(trace_data["result"]["duration"])

def log_trace_langfuse_wrapper(trace_data):
    """Wrapper to handle Langfuse flush"""
    log_trace_langfuse(trace_data)

# Run the Langfuse benchmark
run_benchmark("Langfuse", log_trace_langfuse_wrapper)
langfuse.flush()

# Braintrust

In [None]:
!uv pip install braintrust

In [7]:
import braintrust
from braintrust import start_span

# Initialize Braintrust logger
BRAINTRUST_API_KEY = os.getenv("BRAINTRUST_API_KEY")

# Initialize logger for the project
logger = braintrust.init_logger(
    project="Testing",
    api_key=BRAINTRUST_API_KEY
)

In [8]:
@braintrust.traced
def log_trace_braintrust(trace_data):
    """Log a single trace to Braintrust using hardcoded data"""
    
    # Tool call span
    with start_span(name="tool_call") as tool_span:
        tool_span.log(
            input={"size": trace_data["tool_call"]["input_size"]},
            metadata={
                "operation.type": "tool_call",
                "input.size": trace_data["tool_call"]["input_size"]
            }
        )
        
        # Simulate tool call duration
        time.sleep(trace_data["tool_call"]["duration"])
        
        tool_span.log(
            output={"status": "completed"},
            metrics={
                "duration": trace_data["tool_call"]["duration"]
            }
        )
    
    # LLM generation span
    with start_span(name="llm_call") as llm_span:
        llm_data = trace_data["llm_call"]
        
        llm_span.log(
            input=llm_data["input"],
            metadata={
                "provider": llm_data["provider"],
                "model": llm_data["model"],
                "temperature": llm_data["temperature"]
            }
        )
        
        # Simulate LLM call duration
        time.sleep(llm_data["duration"])
        
        llm_span.log(
            output=llm_data["output"],
            metrics={
                "prompt_tokens": llm_data["prompt_tokens"],
                "completion_tokens": llm_data["completion_tokens"],
                "total_tokens": llm_data["total_tokens"],
                "duration": llm_data["duration"]
            }
        )
    
    # Result span
    with start_span(name="result") as result_span:
        result_span.log(
            input={"format_requested": trace_data["result"]["output_format"]},
            metadata={
                "operation.type": "result",
                "output.format": trace_data["result"]["output_format"]
            }
        )
        
        # Simulate result processing duration
        time.sleep(trace_data["result"]["duration"])
        
        result_span.log(
            output={"format": trace_data["result"]["output_format"]},
            metrics={
                "duration": trace_data["result"]["duration"]
            }
        )

def log_trace_braintrust_wrapper(trace_data):
    """Wrapper to add user/session context"""
    with start_span(
        name=trace_data["trace_id"],
        span_attributes={
            "user_id": trace_data["user_id"],
            "session_id": trace_data["session_id"]
        }
    ):
        log_trace_braintrust(trace_data)

# Run the Braintrust benchmark
run_benchmark("Braintrust", log_trace_braintrust_wrapper)

Logging 20 traces to Braintrust...
Logged 10 traces...
Logged 20 traces...
Execution time: 5.21 seconds
Successfully logged 20 traces to Braintrust!
