In [None]:
# Install the packages this notebook imports (shell escape; needs the `uv` CLI on PATH).
!uv pip install pandas python-dotenv "arize[Tracing]" numpy

In [None]:
import time
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import uuid
import random
import string
from contextlib import contextmanager
from dotenv import load_dotenv

# Load environment variables from a local .env file, if present.
# ARIZE_SPACE_ID and ARIZE_API_KEY are read from the environment later via os.getenv.
load_dotenv()

# Benchmarking functions
@contextmanager
def timer():
    """Context manager that prints how long the wrapped block took.

    The elapsed time is printed even when the wrapped block raises, so a
    failing step still reports how long it ran before the exception
    propagates to the caller.

    Yields:
        None. Use as ``with timer(): ...``.
    """
    # perf_counter is monotonic, so the measurement is not affected by
    # system clock adjustments during a long-running step.
    start_time = time.perf_counter()
    try:
        yield
    finally:
        execution_time = time.perf_counter() - start_time
        print(f"Execution time: {execution_time:.2f} seconds")
    
# Configuration
# NOTE: the preparation step generates TEXT_LENGTH characters for every one of the
# TARGET_ROWS rows in memory (10_000_000 x 25_000 characters at the values below),
# so lower TARGET_ROWS for a trial run.
TARGET_ROWS = 10_000_000  # Number of rows to generate (e.g., 1_000_000, 10_000_000, 20_000_000)
TEXT_LENGTH = 25_000  # Characters of input text generated per row (25k)
# NOTE(review): UNIQUE_KEYWORD_ROWS is not referenced by the other cells shown in this
# notebook; only the first len(UNIQUE_KEYWORDS) rows receive a keyword — confirm intent.
UNIQUE_KEYWORD_ROWS = 10000  # Number of rows with unique keywords
TIMESTAMP_SPREAD_DAYS = 90  # Timestamps are spread over this many days back from now

# Unique keywords for search testing; each is embedded at the start of one row's text
UNIQUE_KEYWORDS = [
    "BENCHMARK_UNIQUE_ALPHA",
    "BENCHMARK_UNIQUE_BETA", 
    "BENCHMARK_UNIQUE_GAMMA",
    "BENCHMARK_UNIQUE_DELTA",
    "BENCHMARK_UNIQUE_EPSILON",
    "BENCHMARK_UNIQUE_ZETA",
    "BENCHMARK_UNIQUE_ETA",
    "BENCHMARK_UNIQUE_THETA",
    "BENCHMARK_UNIQUE_IOTA",
    "BENCHMARK_UNIQUE_KAPPA"
]

print(f"Target dataset size: {TARGET_ROWS:,} rows")


## Step 1: Load Downloaded Data

First, download span data from the Arize UI and save it as a CSV file. Then set `DATA_FILE_PATH` in the cell below to the location of the downloaded file.


In [None]:
# Load the downloaded span data into df_original (the seed rows for the benchmark).
# UPDATE THIS PATH to your downloaded CSV file
DATA_FILE_PATH = "tracing_export.csv"  # Change this to your actual file path

try:
    df_original = pd.read_csv(DATA_FILE_PATH)
    print(f"Loaded {len(df_original)} rows from {DATA_FILE_PATH}")
    print(f"Columns: {list(df_original.columns)}")
except FileNotFoundError:
    # Print a hint, then re-raise so the notebook stops here instead of
    # failing later on an undefined df_original.
    print(f"ERROR: File not found at {DATA_FILE_PATH}")
    print("Please download span data from Arize UI and update the DATA_FILE_PATH variable")
    raise


In [3]:
# =============================================================================
# DATA PREPARATION FUNCTIONS
# =============================================================================

def generate_large_text(base_text, target_length, unique_keyword=None):
    """Build a string of exactly ``target_length`` characters (when positive).

    The text is assembled in this order and then truncated to ``target_length``:
    an optional ``SEARCHABLE_CONTENT: <keyword>`` header, the stringified
    ``base_text``, and lorem-ipsum filler sentences with a random word each.

    Args:
        base_text: Seed text. Falsy values, ``None`` and float NaN (a missing
            value read from a CSV) contribute nothing instead of the text "nan".
        target_length: Length of the returned string.
        unique_keyword: Optional keyword placed at the very start of the text.

    Returns:
        The generated string, cut to ``target_length`` characters.
    """
    parts = []
    if unique_keyword:
        parts.append(f"SEARCHABLE_CONTENT: {unique_keyword}\n\n")

    # NaN is truthy, so it needs an explicit check (NaN != NaN) to avoid
    # seeding the text with the literal string "nan".
    is_missing = base_text is None or (isinstance(base_text, float) and base_text != base_text)
    if base_text and not is_missing:
        parts.append(str(base_text))

    # Track the running length instead of re-summing every part per iteration.
    current_length = sum(len(part) for part in parts)

    # Fill remaining space with lorem ipsum variations
    lorem_base = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
    while current_length < target_length:
        random_word = ''.join(random.choices(string.ascii_lowercase, k=random.randint(5, 15)))
        filler = f"{lorem_base}Random_{random_word}. "
        parts.append(filler)
        current_length += len(filler)

    return ''.join(parts)[:target_length]


def duplicate_rows(df, target_rows):
    """Repeat the rows of ``df`` until there are exactly ``target_rows`` rows.

    Args:
        df: Source DataFrame.
        target_rows: Desired number of rows.

    Returns:
        ``df`` itself, unchanged (same object, no ``unique_id`` column added),
        when it already has at least ``target_rows`` rows. Otherwise a new
        DataFrame with a fresh RangeIndex, exactly ``target_rows`` rows and a
        ``unique_id`` column holding a new UUID4 string per row.

    Raises:
        ValueError: If ``df`` is empty and ``target_rows`` is positive, since
            there is nothing to duplicate.

    NOTE(review): apart from ``unique_id``, every copy carries the source rows'
    values verbatim, so any id-like columns repeat across copies — confirm the
    consumer of this data accepts repeated ids.
    """
    current_rows = len(df)
    if current_rows >= target_rows:
        return df

    if current_rows == 0:
        raise ValueError("Cannot duplicate rows: the input DataFrame is empty")

    # Ceiling division: the smallest number of copies that covers target_rows,
    # so no surplus full copy is built just to be sliced away.
    multiplier = -(-target_rows // current_rows)
    print(f"   Duplicating {current_rows} rows {multiplier}x to reach {target_rows:,}")

    # pd.concat builds a new frame, so the inputs do not need to be copied first.
    df_final = pd.concat([df] * multiplier, ignore_index=True).iloc[:target_rows].copy()
    df_final['unique_id'] = [str(uuid.uuid4()) for _ in range(len(df_final))]

    return df_final


def spread_timestamps(df, days_back=90):
    """Overwrite the timestamp column with values spread over the past N days.

    Timestamps are evenly spaced between ``now - days_back`` (inclusive) and
    ``now`` (exclusive), then shuffled so row order does not correlate with time.
    The first of ``start_time``, ``timestamp``, ``time`` found in ``df`` is
    overwritten; if none exists a ``start_time`` column is created.

    Args:
        df: DataFrame to update. It is modified in place.
        days_back: Width of the time window in days, ending at the current time.

    Returns:
        The same ``df`` object, for call chaining.
    """
    num_rows = len(df)
    end_time = datetime.now()
    start_time = end_time - timedelta(days=days_back)

    print(f"   Spreading {num_rows:,} timestamps from {start_time.strftime('%Y-%m-%d')} to {end_time.strftime('%Y-%m-%d')}")

    # Update timestamp column (prefer start_time for log_spans compatibility)
    timestamp_col = next((col for col in ['start_time', 'timestamp', 'time'] if col in df.columns), 'start_time')

    if num_rows == 0:
        # Nothing to spread; dividing the window by zero rows would raise.
        # Still leave an (empty) datetime column behind so callers can rely on it.
        df[timestamp_col] = pd.Series(index=df.index, dtype="datetime64[ns]")
        return df

    # Generate evenly spaced timestamps and shuffle them
    time_increment = timedelta(days=days_back) / num_rows
    timestamps = [start_time + (i * time_increment) for i in range(num_rows)]
    random.shuffle(timestamps)

    df[timestamp_col] = timestamps

    return df


def prepare_data_simplified(df_original, target_rows, text_length=TEXT_LENGTH):
    """Build the benchmark dataset from the downloaded span rows.

    Steps, each timed with ``timer()``:
      1. Duplicate rows up to ``target_rows`` (``duplicate_rows``).
      2. Spread timestamps over the past ``TIMESTAMP_SPREAD_DAYS`` days.
      3. Replace the input-text column with generated text of ``text_length``
         characters; the first ``len(UNIQUE_KEYWORDS)`` rows get one unique
         keyword each at the start of their text.

    Args:
        df_original: Source DataFrame. It is never modified.
        target_rows: Number of rows to expand to.
        text_length: Characters of text to generate per row.

    Returns:
        The prepared DataFrame.
    """
    print(f"\n=== Preparing {target_rows:,} rows ===")

    # Step 1: Duplicate to target size
    print("1. Duplicating rows...")
    with timer():
        df = duplicate_rows(df_original, target_rows)
        if df is df_original:
            # duplicate_rows hands back the input untouched when it is already
            # large enough; copy so the in-place edits below leave the caller's
            # frame alone and the cell can be re-run.
            df = df.copy()

    # Step 2: Spread timestamps over past 90 days
    print("2. Spreading timestamps...")
    with timer():
        df = spread_timestamps(df, TIMESTAMP_SPREAD_DAYS)

    # Step 3: Generate large text with keywords
    print(f"3. Generating {text_length:,}-char text...")
    with timer():
        input_col = next((col for col in ['attributes.input.value', 'input', 'prompt'] if col in df.columns), 'attributes.input.value')

        if input_col in df.columns:
            source = df[input_col]
            # Missing values become "" so they do not seed the text with "nan".
            base_texts = source.astype(object).where(source.notna(), "").tolist()
        else:
            # No input column in the export: start every row from empty text.
            base_texts = [""] * len(df)

        # Add one keyword to each of the first len(UNIQUE_KEYWORDS) rows for search testing
        generated = []
        for i, base_text in enumerate(base_texts):
            keyword = UNIQUE_KEYWORDS[i] if i < len(UNIQUE_KEYWORDS) else None
            generated.append(generate_large_text(base_text, text_length, keyword))

            if keyword:
                print(f"   Row {i}: '{keyword}'")

        # Assign the whole column once: avoids a per-row .iloc read/write and
        # works whether or not the column existed or held non-string data.
        df[input_col] = generated

    print(f"✅ Ready: {len(df):,} rows")
    return df


In [None]:
# Prepare dataset with target number of rows (TARGET_ROWS from the configuration cell);
# the result is kept in df_prepared for the upload step.
df_prepared = prepare_data_simplified(df_original, TARGET_ROWS)


## Step 2: Upload Spans to Arize


In [None]:
from arize.pandas.logger import Client

# Configuration for span logging (credentials come from the environment / .env file)
ARIZE_SPACE_ID = os.getenv("ARIZE_SPACE_ID")
ARIZE_API_KEY = os.getenv("ARIZE_API_KEY")

# Fail here with a clear message rather than handing None credentials to the client.
_missing_env = [
    env_name
    for env_name, env_value in (("ARIZE_SPACE_ID", ARIZE_SPACE_ID), ("ARIZE_API_KEY", ARIZE_API_KEY))
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}. "
        "Set them in your environment or in a .env file before running this cell."
    )

# Create a unique project name for this benchmark. The date and seconds are
# included so runs on different days (or within the same minute) do not end up
# sharing one project name.
PROJECT_NAME = f"TextSearchBench-{TARGET_ROWS}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

# Setup Arize client for logging spans
arize_client = Client(
    space_id=ARIZE_SPACE_ID,
    api_key=ARIZE_API_KEY,
)

print(f"Arize project name: {PROJECT_NAME}")
print("✅ Arize client setup complete!")


In [None]:
def _build_spans_dataframe(df):
    """Project ``df`` onto the columns sent to ``log_spans``, with normalized dtypes."""
    required_columns = ['context.trace_id', 'context.span_id', 'name', 'start_time']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        # Report every missing column at once, before any work is done.
        raise KeyError(f"Input DataFrame is missing required span columns: {missing_columns}")

    # Create clean spans DataFrame with only the required columns
    spans_df = pd.DataFrame()

    # Required columns - ensure proper data types
    spans_df['context.trace_id'] = df['context.trace_id'].fillna('').astype(str)
    spans_df['context.span_id'] = df['context.span_id'].fillna('').astype(str)
    spans_df['name'] = df['name'].fillna('LLM_span').astype(str)

    # Handle timestamps (unparseable values become NaT)
    spans_df['start_time'] = pd.to_datetime(df['start_time'], errors='coerce')

    # End time = start time + latency. Use latency_ms when present (non-numeric
    # or missing values fall back to 1000 ms); otherwise draw a random latency.
    if 'latency_ms' in df.columns:
        latency_ms = pd.to_numeric(df['latency_ms'], errors='coerce').fillna(1000.0)
    else:
        latency_ms = [random.uniform(100, 2000) for _ in range(len(df))]
    spans_df['end_time'] = spans_df['start_time'] + pd.to_timedelta(latency_ms, unit='ms')

    # Handle input/output attributes - ensure they are strings
    for text_col in ('attributes.input.value', 'attributes.output.value'):
        if text_col in df.columns:
            spans_df[text_col] = df[text_col].fillna('').astype(str)

    # Status code and span kind are always emitted, with a default when absent
    for column, default in (('status_code', 'OK'), ('attributes.openinference.span.kind', 'LLM')):
        if column in df.columns:
            spans_df[column] = df[column].fillna(default).astype(str)
        else:
            spans_df[column] = default

    # Handle parent_id if present
    if 'parent_id' in df.columns:
        spans_df['parent_id'] = df['parent_id'].fillna('').astype(str)
        # Replace empty strings with None for proper parent relationship
        spans_df.loc[spans_df['parent_id'] == '', 'parent_id'] = None

    # Handle token counts if present (ensure they are numeric)
    token_columns = (
        ('totalTokenCount', 'attributes.llm.token_count.total'),
        ('attributes.llm.token_count.prompt', 'attributes.llm.token_count.prompt'),
        ('attributes.llm.token_count.completion', 'attributes.llm.token_count.completion'),
    )
    for source_col, target_col in token_columns:
        if source_col in df.columns:
            spans_df[target_col] = pd.to_numeric(df[source_col], errors='coerce').fillna(0).astype(int)

    # Add unique_id for tracking (from our benchmark preparation)
    if 'unique_id' in df.columns:
        spans_df['unique_id'] = df['unique_id'].astype(str)
    else:
        spans_df['unique_id'] = [str(uuid.uuid4()) for _ in range(len(spans_df))]

    return spans_df


def upload_spans_to_arize(df):
    """Upload dataframe rows as spans to Arize using log_spans.

    Builds a spans DataFrame from ``df`` and sends it with
    ``arize_client.log_spans`` under ``PROJECT_NAME``. Preparation and upload
    are timed together by a single ``timer()``.

    Args:
        df: Prepared span rows. Must contain ``context.trace_id``,
            ``context.span_id``, ``name`` and ``start_time``; the other
            columns handled here are optional.

    Returns:
        The response object returned by ``log_spans``. A non-200 status is
        printed, not raised.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    total_rows = len(df)
    print(f"\n=== Uploading {total_rows:,} spans to Arize using log_spans ===")
    print(f"Project: {PROJECT_NAME}")

    print("\nPreparing spans DataFrame for log_spans...")
    print("Using known column structure from tracing_export.csv format")

    with timer():
        spans_df = _build_spans_dataframe(df)

        print(f"   Prepared spans DataFrame with {len(spans_df)} rows and {len(spans_df.columns)} columns")
        print(f"   Key columns: {[col for col in ['context.trace_id', 'context.span_id', 'name', 'start_time', 'end_time', 'attributes.input.value'] if col in spans_df.columns]}")

        # Upload to Arize using log_spans
        print("\n🚀 Uploading spans to Arize...")
        response = arize_client.log_spans(
            dataframe=spans_df,
            model_id=PROJECT_NAME,
            model_version="1.0",
            validate=True,
            verbose=True
        )

        # Check response
        if response.status_code == 200:
            print(f"✅ Successfully uploaded {total_rows:,} spans to Arize!")
            print(f"   Project: {PROJECT_NAME}")
            print(f"   Response: {response.status_code}")
        else:
            print(f"❌ Upload failed with status code: {response.status_code}")
            print(f"   Response text: {response.text}")

    return response


In [None]:
# Upload the prepared dataset to Arize; as the cell's last expression, the
# returned response object is also displayed as the cell output.
upload_spans_to_arize(df_prepared)
