In [None]:
# Install/upgrade (-U) the notebook's dependencies quietly (-qqq) via the %pip magic.
# NOTE(review): versions are unpinned, so a fresh run may pick up newer, incompatible releases.
%pip install -Uqqq datasets openinference-semantic-conventions openinference-instrumentation-openai faker openai-responses openai tiktoken

In [2]:
from contextlib import ExitStack, contextmanager
from random import choice, choices, randint, random, shuffle

import numpy as np
import openai
import pandas as pd
from datasets import load_dataset
from faker import Faker
from openai_responses import OpenAIMock
from openinference.instrumentation import using_session, using_user
from openinference.instrumentation.openai import OpenAIInstrumentor
from openinference.semconv.trace import OpenInferenceSpanKindValues, SpanAttributes
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import SpanLimits, StatusCode, TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from tiktoken import encoding_for_model

import phoenix as px
from phoenix.trace.span_evaluations import SpanEvaluations

# Shared Faker instance backed by several locales; used below for span names,
# session/user identifiers, timestamps and evaluation explanations.
fake = Faker(
    locale=[
        "ja_JP",
        "vi_VN",
        "ko_KR",
        "zh_CN",
        "th_TH",
        "bn_BD",
    ]
)

# Download Data


In [None]:
# Load the "train" split of the dataset as a pandas DataFrame.
df = load_dataset("GitBag/ultrainteract_multiturn_1_iter_processed_harvard")["train"].to_pandas()
# Keep only the "chosen" entries that contain exactly 10 items.
convo = df.loc[df["chosen"].map(len).eq(10), "chosen"]

# Tracer Provider


In [4]:
# Finished spans are collected in memory by the provider's only span processor.
in_memory_span_exporter = InMemorySpanExporter()
tracer_provider = TracerProvider(span_limits=SpanLimits(max_attributes=1_000_000))
tracer_provider.add_span_processor(SimpleSpanProcessor(in_memory_span_exporter))

# The OTLP exporter is not attached to the provider here; it is created on its
# own so spans can be pushed through it explicitly.
endpoint = "http://127.0.0.1:4317"
otlp_span_exporter = OTLPSpanExporter(endpoint=endpoint)

# Helpers


In [6]:
def gen_session_id():
    """Return a randomly shaped session identifier.

    Roughly 10% of the time the result is an integer below one billion,
    10% of the time it is a string of 1-5 colons, and otherwise it is a
    fake postal address produced by the shared ``fake`` instance.
    """
    roll = random()
    if roll >= 0.9:
        # random() is already non-negative, so no abs() is needed.
        return int(random() * 1_000_000_000)
    if roll >= 0.1:
        return fake.address()
    return ":" * randint(1, 5)


def gen_user_id():
    """Return a randomly shaped user identifier.

    Roughly 10% of the time the result is an integer below one billion,
    10% of the time it is a string of 1-5 colons, and otherwise it is a
    fake person name produced by the shared ``fake`` instance.
    """
    roll = random()
    if roll >= 0.9:
        # random() is already non-negative, so no abs() is needed.
        return int(random() * 1_000_000_000)
    if roll >= 0.1:
        return fake.name()
    return ":" * randint(1, 5)


def export_spans(prob_drop_root):
    """Export spans in random order for receiver testing.

    All finished spans held by ``in_memory_span_exporter`` are shuffled and
    sent one at a time through ``otlp_span_exporter``; the in-memory buffer
    is cleared afterwards.

    Args:
        prob_drop_root: Probability in [0, 1] that any given root span
            (a span without a parent) is skipped instead of exported.

    Returns:
        The full shuffled list of collected spans, including any root spans
        that were dropped and therefore never exported.
    """
    spans = list(in_memory_span_exporter.get_finished_spans())
    shuffle(spans)
    exported = []
    for span in spans:
        if span.parent is None and random() < prob_drop_root:
            continue
        otlp_span_exporter.export([span])
        exported.append(span)
    in_memory_span_exporter.clear()
    # Summarize only what was actually sent, so the report stays accurate
    # when root spans are dropped.
    session_count = len({id_ for span in exported if (id_ := span.attributes.get("session.id"))})
    trace_count = len({span.context.trace_id for span in exported})
    dropped = len(spans) - len(exported)
    summary = f"Exported {session_count} sessions, {trace_count} traces, {len(exported)} spans"
    if dropped:
        summary += f" (dropped {dropped} root spans)"
    print(summary)
    return spans


def rand_span_kind():
    """Yield one (attribute key, value) pair carrying a random span kind.

    Written as a generator so the result can be fed straight into ``dict()``.
    """
    kind = choice(list(OpenInferenceSpanKindValues))
    yield SpanAttributes.OPENINFERENCE_SPAN_KIND, kind.value


def rand_status_code():
    """Pick a span status code at random: OK 98%, ERROR 1%, UNSET 1%."""
    weighted = {
        StatusCode.OK: 0.98,
        StatusCode.ERROR: 0.01,
        StatusCode.UNSET: 0.01,
    }
    (picked,) = choices(list(weighted), weights=list(weighted.values()), k=1)
    return picked


@contextmanager
def trace_tree(tracer, n=5):
    """Context manager that wraps the caller's body in a random tree of spans.

    A span named after a fake city and tagged with a random span kind is
    started, a random number of child subtrees is created before and after
    the point where control is handed to the caller, and the caller's body
    may end up running inside one of the "before" subtrees (at any depth)
    rather than directly under this span.

    Args:
        tracer: Tracer used to start the spans.
        n: Complexity bound. Each level creates up to ``n`` children before
            and up to ``n`` after the yield, and children recurse with a
            strictly smaller bound. With ``n <= 0`` no span is created.

    Yields:
        None, exactly once.
    """
    # Base case: no span at this level, just run the caller's body.
    if n <= 0:
        yield
        return
    # Tracks whether the single allowed yield has already happened.
    has_yielded = False
    with tracer.start_as_current_span(
        fake.city(),
        attributes=dict(rand_span_kind()),
        end_on_exit=False,  # ended manually below with a custom end time
    ) as root:
        # Children created before (or around) the caller's body.
        for _ in range(randint(0, n)):
            with trace_tree(tracer, randint(0, n - 1)):
                # Coin flip: run the caller's body nested inside this child subtree.
                if not has_yielded and random() < 0.5:
                    yield
                    has_yielded = True
                else:
                    pass
        # No child hosted the body, so run it directly under this span.
        if not has_yielded:
            yield
            has_yielded = True
        # Children created after the caller's body has finished.
        for _ in range(randint(0, n)):
            with trace_tree(tracer, randint(0, n - 1)):
                pass
    # NOTE(review): these two lines are not in a `finally`, so if the caller's body
    # raises they appear to be skipped and the span is never ended — confirm intended.
    root.set_status(rand_status_code())
    # End time comes from fake.future_datetime("+5s"), converted from seconds to nanoseconds.
    root.end(int(fake.future_datetime("+5s").timestamp() * 10**9))

# Generate Sessions


In [None]:
# How many times simulate_openai() is run; drawn at random (5-10) each time this cell executes.
session_count = randint(5, 10)
tree_complexity = 4  # passed to trace_tree as `n`; set to 0 for single span under root
prob_drop_root = 0.0  # probability that a root span gets dropped


def simulate_openai():
    """Simulate one session of chat completions against a mocked OpenAI API.

    Between 1 and 10 conversations are sampled from ``convo`` and concatenated
    into a single message sequence. Every odd-indexed message is then served
    by the mock as the completion for the messages that precede it, with token
    usage derived from tiktoken counts. Each call runs inside a manually ended
    "root" span plus a random span tree from ``trace_tree``.

    The session and user ids are attached either directly as root-span
    attributes or through the ``using_session`` / ``using_user`` context
    managers, chosen by coin flip per call.
    """
    # About 10% of sessions get a blank (single space) user id.
    user_id = " " if random() >= 0.9 else gen_user_id()
    session_id = gen_session_id()
    client = openai.Client(api_key="sk-")
    model = "gpt-4o-mini"
    encoding = encoding_for_model(model)
    sampled = convo.sample(randint(1, 10))
    messages = np.concatenate(sampled.values)
    token_counts = [len(encoding.encode(msg["content"])) for msg in messages]
    mock = OpenAIMock()
    tracer = tracer_provider.get_tracer(__name__)
    with mock.router:
        for reply_idx in range(1, len(messages), 2):
            reply = messages[reply_idx]
            prompt_tokens = sum(token_counts[:reply_idx])
            completion_tokens = token_counts[reply_idx]
            # Program the mock to answer the next request with this reply.
            mock.chat.completions.create.response = {
                "choices": [{"index": 0, "finish_reason": "stop", "message": reply}],
                "usage": {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "total_tokens": prompt_tokens + completion_tokens,
                },
            }
            with ExitStack() as stack:
                attributes = {
                    "input.value": messages[reply_idx - 1]["content"],
                    "output.value": reply["content"],
                }
                if random() < 0.5:
                    # Ids set explicitly on the root span.
                    attributes.update({"session.id": session_id, "user.id": user_id})
                else:
                    # Ids supplied through the openinference context managers instead.
                    stack.enter_context(using_session(session_id))
                    stack.enter_context(using_user(user_id))
                span_cm = tracer.start_as_current_span(
                    "root",
                    attributes=attributes,
                    end_on_exit=False,
                )
                root = stack.enter_context(span_cm)
                with trace_tree(tracer, tree_complexity):
                    client.chat.completions.create(model=model, messages=messages[:reply_idx])
            # The root span is ended by hand with a random status and a faked end time.
            root.set_status(rand_status_code())
            end_time_ns = int(fake.future_datetime("+5s").timestamp() * 10**9)
            root.end(end_time_ns)


# Instrument the OpenAI client library with our tracer provider for the duration of the run.
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)
try:
    # One simulate_openai() call per session.
    for _ in range(session_count):
        simulate_openai()
finally:
    # Always remove the instrumentation, even if a simulation raises.
    OpenAIInstrumentor().uninstrument()
# Send the collected spans out (shuffled) and keep them for the annotation step below.
spans = export_spans(prob_drop_root)

# Annotate root spans with random evaluations
# Span ids are rendered as 16-character, zero-padded, lowercase hex strings.
root_span_ids = pd.Series(
    [format(span.context.span_id, "016x") for span in spans if span.parent is None]
)
for name in "ABC":
    # Each evaluation name gets its own random half of the root spans.
    span_ids = root_span_ids.sample(frac=0.5)
    df = pd.DataFrame(
        {
            "score": np.random.rand(len(span_ids)),
            "label": np.random.choice(["👍", "👎"], len(span_ids)),
            "explanation": [fake.paragraph(10) for _ in range(len(span_ids))],
        },
        index=pd.Index(span_ids, name="context.span_id"),
    )
    px.Client().log_evaluations(SpanEvaluations(name, df))