In [None]:
!uv pip install pandas dotenv

In [None]:
import time
from contextlib import contextmanager
import os
import pandas as pd
import uuid
from datetime import datetime
import uuid

from dotenv import load_dotenv
load_dotenv()

# Benchmarking functions
@contextmanager
def timer():
    """Time the body of a ``with`` block and print the elapsed seconds.

    The duration is measured with ``time.perf_counter()``, a monotonic
    clock intended for measuring short durations, so the reading is not
    distorted if the system wall clock is adjusted while the block runs.

    Yields:
        None. Use as ``with timer(): ...``.

    Note:
        If the body raises, the exception propagates and nothing is printed.
    """
    started_at = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started_at
    print(f"Execution time: {elapsed:.2f} seconds")

# Dataset
dataset_id = "ADB_Benchmark_Synthetic_Dataset_100_rows"  # Edit this to change the source dataset

# Upload name: the dataset id followed by the first 8 characters of a random UUID.
dataset_name = "_".join([dataset_id, uuid.uuid4().hex[:8]])
# The source CSV is read from the datasets/ directory, named after the dataset id.
df = pd.read_csv(f"datasets/{dataset_id}.csv")

# Arize

In [None]:
!uv pip install arize[Datasets]

In [None]:
from arize.experimental.datasets import ArizeDatasetsClient
from arize.experimental.datasets.utils.constants import GENERATIVE

# Arize credentials are read from the environment; each is None when its variable is unset.
ARIZE_API_KEY, ARIZE_SPACE_ID = (
    os.environ.get(var_name) for var_name in ("ARIZE_API_KEY", "ARIZE_SPACE_ID")
)

In [None]:
# Benchmark dataset upload
print("\n Uploading dataset to Arize...")
client = ArizeDatasetsClient(api_key=ARIZE_API_KEY)

# Gather the create_dataset arguments first so the timed block holds only the upload call.
upload_request = {
    "space_id": ARIZE_SPACE_ID,
    "dataset_name": dataset_name,
    "dataset_type": GENERATIVE,
    "data": df,
}

with timer():
    dataset_id_result = client.create_dataset(**upload_request)



# Langfuse

In [None]:
!uv pip install langfuse

In [None]:
# Upload each row as a separate dataset item.
# This is definitely not an apples-to-apples comparison, but that is how Langfuse supports it.
# https://langfuse.com/docs/evaluation/dataset-runs/datasets

# Rate limit: 100 items per minute; the upload fails after 98 rows.

from langfuse import Langfuse

# Initialize Langfuse client
# Credentials are read from the environment; each is None when its variable is unset.
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
# Default to the US cloud endpoint, but let a LANGFUSE_HOST environment variable
# override it; an unset or empty value falls back to the default.
LANGFUSE_HOST = os.getenv("LANGFUSE_HOST") or "https://us.cloud.langfuse.com"

langfuse = Langfuse(
    public_key=LANGFUSE_PUBLIC_KEY,
    secret_key=LANGFUSE_SECRET_KEY,
    host=LANGFUSE_HOST,
)

In [None]:
# Create the dataset
print(f"Creating dataset '{dataset_name}'...")

# Time dataset creation plus the per-row uploads so this cell reports an
# execution time in the same way as the other upload cells in this notebook.
with timer():
    dataset = langfuse.create_dataset(name=dataset_name)

    # Upload each row as a dataset item
    print(f"Uploading {len(df)} rows...")
    for idx, row in df.iterrows():
        # Convert row to dict so columns absent from the CSV resolve to None via .get()
        row_dict = row.to_dict()

        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=row_dict.get('input'),
            expected_output=row_dict.get('output'),
            # Every other column of interest is attached as item metadata.
            metadata={
                "row_index": idx,
                "id": row_dict.get('id'),
                "prompt_template": row_dict.get('attributes.llm.prompt_template.template'),
                "prompt_variables": row_dict.get('attributes.llm.prompt_template.variables'),
                "timestamp": row_dict.get('timestamp'),
                "model_name": row_dict.get('model_name'),
                "token_count_input": row_dict.get('token_count_input'),
                "token_count_output": row_dict.get('token_count_output'),
                "latency_ms": row_dict.get('latency_ms'),
                "cost_usd": row_dict.get('cost_usd'),
            }
        )

print(f"Successfully uploaded {len(df)} items to dataset '{dataset_name}'")


# Braintrust

In [None]:
!uv pip install braintrust

In [None]:
import braintrust

BRAINTRUST_API_KEY = os.getenv("BRAINTRUST_API_KEY")

print(f"Creating dataset '{dataset_name}' in Braintrust...")

with timer():
    # Open (or create) the dataset directly. braintrust.init() would start an
    # experiment, which this benchmark does not need, so the API key is passed
    # to init_dataset instead.
    dataset = braintrust.init_dataset(
        project="Testing",
        name=dataset_name,
        api_key=BRAINTRUST_API_KEY,
    )

    # Upload each row as a dataset item
    for idx, row in df.iterrows():
        # Convert row to dict so columns absent from the CSV resolve to None via .get()
        row_dict = row.to_dict()
        dataset.insert(
            input=row_dict.get('input'),
            expected=row_dict.get('output'),
            # Every other column of interest is attached as record metadata.
            metadata={
                "row_index": idx,
                "id": row_dict.get('id'),
                "prompt_template": row_dict.get('attributes.llm.prompt_template.template'),
                "prompt_variables": row_dict.get('attributes.llm.prompt_template.variables'),
                "timestamp": row_dict.get('timestamp'),
                "model_name": row_dict.get('model_name'),
                "token_count_input": row_dict.get('token_count_input'),
                "token_count_output": row_dict.get('token_count_output'),
                "latency_ms": row_dict.get('latency_ms'),
                "cost_usd": row_dict.get('cost_usd'),
            }
        )

    # insert() only queues records for background upload; flush inside the
    # timed block so the measurement covers sending the rows to Braintrust.
    dataset.flush()

print(f"Successfully uploaded {len(df)} items to dataset '{dataset_name}'")


# LangSmith

In [None]:
!uv pip install langsmith

In [None]:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# Doesn't support any metadata - only input and output
# We may want to create a dataset with no metadata for a fair comparison

from langsmith import Client

# Read the LangSmith key from the environment (None when unset) and build the API client.
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY")
client = Client(api_key=LANGSMITH_API_KEY)

print(f"Creating dataset '{dataset_name}' in LangSmith...")

# Send the whole DataFrame in a single upload_dataframe call; only the 'input'
# and 'output' columns are passed as input and output keys.
upload_options = dict(
    df=df,
    input_keys=["input"],
    output_keys=["output"],
    name=dataset_name,
    description="Test dataset",
    data_type="kv",
)
with timer():
    dataset = client.upload_dataframe(**upload_options)

print(f"Successfully uploaded dataset '{dataset_name}' to LangSmith")

