In [None]:
%pip install -Uqqq beautifulsoup

In [None]:
import urllib.request
from base64 import b64encode
from io import StringIO
from typing import Dict

import bs4 as bs
import openai
import pandas as pd
import phoenix as px
from faker import Faker
from openinference.instrumentation import TraceConfig
from openinference.instrumentation.openai import OpenAIInstrumentor
from openinference.semconv.resource import ResourceAttributes
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

In [None]:
# Address the OTLP/gRPC span exporter sends to.
endpoint = "http://127.0.0.1:4317"

# Tag every span with the project name that is queried back later in this notebook.
resource = Resource({ResourceAttributes.PROJECT_NAME: "vision-fixture"})
tracer_provider = TracerProvider(resource=resource)

span_exporter = OTLPSpanExporter(endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter))

# NOTE(review): the very large base64_image_max_length is presumably there so
# inline base64 images are kept in the spans rather than dropped — confirm.
config = TraceConfig(base64_image_max_length=1_000_000_000)
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider, config=config)

In [None]:
# Download the caption-contest index page. The context manager closes the HTTP
# response once the body has been read instead of leaving it to the garbage collector.
with urllib.request.urlopen("https://nextml.github.io/caption-contest-data/") as response:
    source = response.read()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "guessed parser" warning).
table = bs.BeautifulSoup(source, "html.parser").find_all("table")

In [None]:
# Parse the first HTML table and keep its first, third and last columns,
# ordered from most to fewest votes.
contest_table = pd.read_html(StringIO(str(table)))[0]
ranked = contest_table.iloc[:, [0, 2, -1]].sort_values("Number of votes", ascending=False)

# The first column's leading whitespace-separated token is parsed as an int
# and becomes the (unnamed) index.
contest_ids = ranked.iloc[:, 0].apply(lambda cell: int(cell.split()[0]))
df = ranked.set_index(contest_ids).rename_axis(None, axis=0).iloc[:, [1, 2]]

# Give the two remaining columns short names.
df = df.rename(columns=dict(zip(df.columns, ["caption", "votes"])))
print(df.shape[0])
df.head(5)

In [None]:
# OpenAI client shared by the request loops below. No credentials are passed
# explicitly here; presumably they come from the environment — confirm.
client = openai.OpenAI()

In [None]:
def message(idx: int, caption: str) -> Dict[str, object]:
    """Build a user chat message asking why a caption is funny for one cartoon.

    Args:
        idx: Number used to form the cartoon image URL
            ``https://nextml.github.io/caption-contest-data/cartoons/{idx}.jpg``.
        caption: Caption text embedded in the prompt.

    Returns:
        A dict with ``"role"`` set to ``"user"`` and ``"content"`` set to a
        list of two parts: a text part holding the prompt and an ``image_url``
        part pointing at the cartoon with ``"detail"`` set to ``"low"``.
        (The value under ``"content"`` is a list, so the mapping is not
        ``Dict[str, str]``.)
    """
    url = f"https://nextml.github.io/caption-contest-data/cartoons/{idx}.jpg"
    text = f"Explain like I'm five. What's funny about this caption?\n\n{caption}\n"
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": text},
            {"type": "image_url", "image_url": {"url": url, "detail": "low"}},
        ],
    }

In [None]:
# Walk the rows of `df`, sending one chat completion per row until 25 requests
# have succeeded. `errors` counts consecutive failures and is reset on every
# success; the loop gives up once it exceeds 3.
n, errors = 25, 0
for idx, caption, _ in df.itertuples():
    if n == 0 or errors > 3:
        break
    messages = [message(idx, caption)]
    try:
        client.chat.completions.create(model="gpt-4o-mini", messages=messages, max_tokens=1000)
    except Exception:
        # Best effort: a failed request is skipped and the next row is tried.
        # Catch `Exception` rather than `BaseException` so KeyboardInterrupt
        # and SystemExit still stop the cell instead of counting as failures.
        errors += 1
    else:
        errors = 0
        n -= 1

In [None]:
# Load the ChartMimic test split, keep only rows whose Difficulty is "hard",
# and order them by the length of the Instruction value, longest first.
chartmimic = pd.read_parquet("hf://datasets/ChartMimic/ChartMimic/test.parquet")
is_hard = chartmimic["Difficulty"] == "hard"
df = chartmimic.loc[is_hard].sort_values(
    by="Instruction",
    key=lambda col: col.map(len),
    ascending=False,
)
df.head()

In [None]:
# Send one chat completion per selected row, with the preview image inlined as
# a base64 data URL.
# NOTE(review): `.iloc[1:25]` skips the first row and yields 24 rows — confirm
# that skipping row 0 is intentional.
for _, instruction, input_figure in (
    df.loc[:, ["Instruction", "InputFigurePreview"]].iloc[1:25].itertuples()
):
    # Use local names that do not shadow the builtin `bytes` or rebind the
    # `message()` helper defined earlier in the notebook; rebinding it to a
    # dict would break re-running the caption loop.
    image_bytes = input_figure["bytes"]
    encoded_string = b64encode(image_bytes).decode()
    chart_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{encoded_string}", "detail": "low"},
            },
        ],
    }
    client.chat.completions.create(
        model="gpt-4o-mini", messages=[chart_message], max_tokens=1000
    )

In [None]:
# Fetch the trace dataset for the "vision-fixture" project (the project name set
# on the tracer resource above) through the Phoenix client.
# NOTE(review): units of `timeout=1000` are not shown here — presumably seconds; confirm.
td = px.Client().get_trace_dataset(timeout=1000, project_name="vision-fixture")

In [None]:
fake = Faker()
span_index = td.dataframe.index

# Capture each row's original duration before its timestamps are overwritten.
duration = td.dataframe["end_time"] - td.dataframe["start_time"]

# One random start per row from Faker's `date_time_between("-3d")`.
# NOTE(review): check whether these generated datetimes match the timezone
# handling of the original start_time/end_time columns — confirm.
start_time = pd.Series(
    [fake.date_time_between("-3d") for _ in span_index],
    index=span_index,
)
end_time = start_time + duration

# Overwrite the timestamps in place, keeping every row's duration unchanged.
td.dataframe["start_time"] = start_time
td.dataframe["end_time"] = end_time

In [None]:
# Persist the trace dataset through its `save()` method; no destination is
# given here, so the location is whatever `save()` defaults to.
td.save()