In [None]:
from typing import Any, Iterable

import altair as alt
import pandas as pd

from analysis.models.openhands import Evaluation, EvaluationOutput, SWEBenchResult
from analysis.usage import total_resource_usage, per_iteration_resource_usage

# Altair embeds the input data in every visualization, and this notebook makes no effort to
# keep the frames it passes small, so the default row limit is disabled. If you intend to
# export the visualizations and embed them on the web, consider commenting this call out
# and looking into:
# https://altair-viz.github.io/user_guide/large_datasets.html#vegafusion-data-transformer
alt.data_transformers.disable_max_rows()

# List the paths to OpenHands evaluation data here -- anything produced using the OpenHands
# SWE-bench evaluation framework should be compatible.
filepaths = []

# One Evaluation is loaded per listed path, in the order the paths are given.
data = list(map(Evaluation.from_filepath, filepaths))

In [None]:
def classify_error(output: EvaluationOutput, result: SWEBenchResult) -> str:
    """Assign a single outcome label to one evaluated instance.

    Labels, in order of precedence:

    * ``'resolved'``     -- the report marks the instance as resolved.
    * ``'iter. limit'``  -- the agent error starts with 'Agent reached maximum iteration'.
    * ``'event loop'``   -- the agent error starts with 'Agent got stuck in a loop'.
    * ``'test failure'`` -- the report flags ``error_eval``, ``failed_apply_patch`` or
      ``test_timeout``.
    * ``'empty gen.'``   -- the report flags ``empty_generation``.
    * ``'other'``        -- none of the above.

    Args:
        output: Evaluation output; only its ``error`` attribute is read.
        result: Evaluation result; only ``test_result.report`` is read.

    Returns:
        One of the label strings listed above.
    """
    report = result.test_result.report
    agent_error = output.error

    if report.resolved:
        return "resolved"

    if agent_error:
        label = None
        # Every prefix is checked (no early exit), so a later match would win.
        for prefix, name in (
            ('Agent reached maximum iteration', 'iter. limit'),
            ('Agent got stuck in a loop', 'event loop'),
        ):
            if agent_error.startswith(prefix):
                label = name
        if label is not None:
            return label

    # Report-level flags only matter when no recognized agent error was found.
    # A test failure takes precedence over an empty generation.
    if report.error_eval or report.failed_apply_patch or report.test_timeout:
        return 'test failure'
    if report.empty_generation:
        return 'empty gen.'
    return "other"

def per_instance(output: EvaluationOutput, result: SWEBenchResult) -> dict[str, Any]:
    """Build the summary row for a single evaluated instance.

    The row is assembled in this order, so on a key clash the later source wins:

    1. ``'error'`` -- the label returned by ``classify_error``.
    2. Every field of ``total_resource_usage(output).model_dump()``.
    3. ``'history_length'`` -- the number of entries in ``output.history``.
    4. ``'finish'`` -- whether the last history entry has ``action == "finish"``.
    5. Every field of ``result.test_result.report.model_dump()``.

    Args:
        output: Evaluation output providing ``history`` (a sequence of dict-like events).
        result: Evaluation result providing ``test_result.report``.

    Returns:
        A flat dictionary describing the instance.
    """
    history = output.history
    # Indexing [-1] on an empty history would raise IndexError; an instance with no
    # recorded events is reported as not finished instead.
    finished = bool(history) and history[-1].get("action", "") == "finish"

    return {
        'error': classify_error(output, result),
        **total_resource_usage(output).model_dump(),
        'history_length': len(history),
        "finish": finished,
        **result.test_result.report.model_dump(),
    }
        

# Combine the `to_dataframe(per_instance)` frame of every loaded evaluation into one frame.
df = pd.concat(
    [evaluation.to_dataframe(per_instance) for evaluation in data]
)

In [None]:
# For each experiment, calculate the ratio of instances resolved.
#
# `groupby`/`agg` build a new frame and never modify `df`, so no defensive copy is needed.
(
    df.groupby('experiment')['resolved']
    .agg(total_count='size', resolved_count='sum')
    .assign(
        resolution_ratio=lambda counts: counts['resolved_count'] / counts['total_count']
    )
)

In [None]:
# Not every instance uses all of the iterations the test harness allows. This histogram shows
# the distribution of iterations used (computed as `history_length / 2`), with one panel per
# experiment. The color indicates whether the instance was resolved or not.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X('x:Q', title='Iterations'),
        alt.Y('count()', title='# of Instances'),
        alt.Color('resolved'),
        alt.Column('experiment'),
    )
    .transform_calculate(x='datum.history_length / 2')
    .properties(width=150, height=150)
)

In [None]:
# Each instance can be resolved or unresolved, and can finish early or late. Too many
# instances that finish late (i.e., hit the max iteration limit) might indicate that
# the agent needs more iterations to finish the instance.

# Keyed by (resolved, finish). Defined once up front rather than being rebuilt for every
# row inside a row-wise `apply`.
OUTCOME_LABELS = {
    (True, True): "Resolved Early",
    (True, False): "Resolved Late",
    (False, True): "Unresolved Early",
    (False, False): "Unresolved Late",
}

# A plain list is assigned positionally, so no index alignment is involved.
df["outcome"] = [
    OUTCOME_LABELS[(resolved, finished)]
    for resolved, finished in zip(df["resolved"], df["finish"])
]

# Clicking an outcome in the legend highlights it; everything else turns light gray.
selection = alt.selection_point(fields=['outcome'])
color = (
    alt.when(selection)
    .then(alt.Color("outcome:N").legend(None))
    .otherwise(alt.value("lightgray"))
)

legend = (
    alt.Chart(df)
    .mark_point()
    .encode(alt.Y('outcome').axis(orient='right'), color=color)
    .add_params(selection)
)

arcs = (
    alt.Chart(df, title="Outcomes")
    .mark_arc()
    .encode(alt.Theta("count()"), alt.Column("experiment").title(None), color=color)
    .properties(width=150, height=150)
)

arcs | legend

In [None]:
# The main point of condensers is to combat the ever-growing number of tokens per iteration.
# We can plot the average number of tokens per iteration for each experiment.

def per_step(output: EvaluationOutput, result: SWEBenchResult) -> Iterable[dict[str, Any]]:
    """Yield one row for each entry of ``per_iteration_resource_usage(output)``.

    Each row holds the instance's ``resolved`` flag, the fields of that entry's
    ``model_dump()``, and ``iteration`` -- the entry's zero-based position divided by two
    (a float).

    Args:
        output: Evaluation output passed to ``per_iteration_resource_usage``.
        result: Evaluation result; ``test_result.report.resolved`` is copied into each row.

    Yields:
        A dictionary per usage entry, in the order the entries are produced.
    """
    usage_entries = per_iteration_resource_usage(output)
    for position, usage in enumerate(usage_entries):
        row: dict[str, Any] = {"resolved": result.test_result.report.resolved}
        row.update(usage.model_dump())
        # Set last so it overrides any same-named field coming from the usage entry.
        row["iteration"] = position / 2
        yield row

def post_process(
    df: pd.DataFrame, *, read_cost: float = 0.10, write_cost: float = 1.25
) -> pd.DataFrame:
    """Add cumulative cache counters and a cache-efficiency score to a per-step frame.

    The frame is sorted by ``iteration`` and three columns are added:

    * ``cumulative_cache_reads``  -- running sum of ``cache_reads``.
    * ``cumulative_cache_writes`` -- running sum of ``cache_writes``.
    * ``cache_efficiency`` -- ``reads - (read_cost * reads + write_cost * writes)``
      computed on the cumulative columns.

    Args:
        df: Frame with ``iteration``, ``cache_reads`` and ``cache_writes`` columns.
            It is not modified; a sorted copy is returned.
        read_cost: Relative cost charged per cached read. The default of 0.10 is the
            value this notebook previously hard-coded (its commentary attributes the
            constants to Anthropic's cache costs).
        write_cost: Relative cost charged per cache write. The default of 1.25 is the
            previously hard-coded value.

    Returns:
        A new frame, sorted by ``iteration``, with the three extra columns.
    """
    # sort_values returns a new frame, so the column assignments below never touch `df`.
    ordered = df.sort_values(by="iteration")
    ordered["cumulative_cache_reads"] = ordered["cache_reads"].cumsum()
    ordered["cumulative_cache_writes"] = ordered["cache_writes"].cumsum()
    ordered["cache_efficiency"] = ordered["cumulative_cache_reads"] - (
        read_cost * ordered["cumulative_cache_reads"]
        + write_cost * ordered["cumulative_cache_writes"]
    )
    return ordered

# Per-step rows for every evaluation; `post_process` is handed to `multi_to_dataframe` as
# its `post_callback`.
df_pi = pd.concat(
    [
        evaluation.multi_to_dataframe(per_step, post_callback=post_process)
        for evaluation in data
    ]
)

# Clicking an experiment in the legend highlights its line; the rest turn light gray.
selection = alt.selection_point(fields=['experiment'])
color = (
    alt.when(selection)
    .then(alt.Color("experiment:N", legend=None))
    .otherwise(alt.value("lightgray"))
)

legend = (
    alt.Chart(df)
    .mark_point()
    .encode(alt.Y('experiment', axis=alt.Axis(orient='right')), color=color)
    .add_params(selection)
)

line = (
    alt.Chart(df_pi, title="Avg. Token Usage per Iteration")
    .mark_line()
    .encode(
        alt.X("iteration", title="Iteration"),
        alt.Y("mean(prompt_tokens)", title="Prompt Tokens"),
        color=color,
    )
)

line | legend

In [None]:
# Keeping token counts from growing monotonically has a secondary benefit: the LLM can
# compute its responses faster. This chart plots the average response latency per iteration.

# Clicking an experiment in the legend highlights its line; the rest turn light gray.
selection = alt.selection_point(fields=['experiment'])
color = (
    alt.when(selection)
    .then(alt.Color("experiment:N", legend=None))
    .otherwise(alt.value("lightgray"))
)

legend = (
    alt.Chart(df)
    .mark_point()
    .encode(alt.Y('experiment', axis=alt.Axis(orient='right')), color=color)
    .add_params(selection)
)

line = (
    alt.Chart(df_pi, title="Avg. Response Latency per Iteration")
    .mark_line()
    .encode(
        alt.X("iteration", title="Iteration"),
        alt.Y("mean(response_latency)", title="Response Latency"),
        color=color,
    )
)

line | legend

In [None]:
# Condensers often act by manipulating the event history sent to the LLM. This has the
# downside of invalidating the LLM's cache, which can lead to more cache reads and writes.

# This graph uses Anthropic's cache costs to determine if condensers thrash the event
# stream enough to make the LLM's cache a detriment -- anything above the x-axis is able
# to utilize the cache, and the further above the line, the more efficient the cache is.

# Mean cache efficiency per iteration, one line per experiment.
line = (
    alt.Chart(df_pi)
    .mark_line()
    .encode(
        alt.X('iteration', title='Iteration'),
        alt.Y('mean(cache_efficiency):Q', title='Reads - (0.1 * Reads + 1.25 * Writes)'),
        alt.Color('experiment', title='Experiment'),
    )
)

# Confidence-interval band over the same values; this chart also carries the title.
band = (
    alt.Chart(df_pi, title='Average Cache Cost Savings per Iteration')
    .mark_errorband(extent='ci')
    .encode(
        alt.X('iteration', title='Iteration'),
        alt.Y('cache_efficiency:Q', title='Reads - (0.1 * Reads + 1.25 * Writes)'),
        alt.Color('experiment', title='Experiment'),
    )
)

band + line

In [None]:
# To better understand how condensers impact something like SWE-bench performance, we can
# treat tokens, iterations, and response latency as resources and compute how many
# instances we could solve _in parallel_ with a certain amount of each resource.

# This is a cactus plot: each line represents a different experiment, the y-axis tracks
# the number of instances resolved, and the x-axis tracks the amount of each resource,
# and the line indicates how many instances are resolved using that amount of resources
# or fewer.

def per_instance(output: EvaluationOutput, result: SWEBenchResult) -> dict[str, Any]:
    """Build the cactus-plot row for a single instance.

    The row contains ``resolved`` as an integer (1 when the report marks the instance
    resolved, otherwise 0), ``iteration`` as half the history length rounded down, and
    every field of ``total_resource_usage(output).model_dump()``. Usage fields are merged
    last, so they win on a key clash.

    Args:
        output: Evaluation output providing ``history``.
        result: Evaluation result providing ``test_result.report.resolved``.

    Returns:
        A flat dictionary describing the instance.
    """
    row: dict[str, Any] = {
        'resolved': int(bool(result.test_result.report.resolved)),
        'iteration': len(output.history) // 2,
    }
    row.update(total_resource_usage(output).model_dump())
    return row
        

def cactus_plot_post_process(df: pd.DataFrame) -> pd.DataFrame:
    """Add the cumulative "resolved by <resource>" columns used by the cactus plot.

    A ``total_tokens`` column (``prompt_tokens + completion_tokens``) is added first. Then,
    for each of ``iteration``, ``prompt_tokens``, ``completion_tokens``, ``total_tokens``
    and ``response_latency``, the frame is sorted by that field and a
    ``resolved_by_<field>`` column is set to the running sum of ``resolved`` in that order.

    Args:
        df: Frame with ``resolved``, ``iteration``, ``prompt_tokens``,
            ``completion_tokens`` and ``response_latency`` columns. It is not modified.

    Returns:
        A new frame with the extra columns, ordered by the last field sorted on
        (``response_latency``).
    """
    # assign() returns a new frame, so the caller's frame does not gain a column.
    result = df.assign(total_tokens=df['prompt_tokens'] + df['completion_tokens'])

    for field in (
        'iteration',
        'prompt_tokens',
        'completion_tokens',
        'total_tokens',
        'response_latency',
    ):
        # The running sum depends on the order of rows that tie on `field`; a stable
        # sort keeps that order reproducible from one run to the next.
        result = result.sort_values(by=field, kind='mergesort')
        result[f'resolved_by_{field}'] = result['resolved'].cumsum()

    return result

# Stored under its own name so the instance-level `df` built earlier in the notebook is not
# replaced by a frame with different columns (which would break re-running earlier cells).
df_cactus = pd.concat(
    [cactus_plot_post_process(d.to_dataframe(per_instance)) for d in data]
)

# Clicking an experiment in the legend highlights its line; the rest turn light gray.
selection = alt.selection_point(fields=['experiment'])
color = (
    alt.when(selection)
    .then(alt.Color("experiment:N").legend(None))
    .otherwise(alt.value("lightgray"))
)

legend = (
    alt.Chart(df_cactus)
    .mark_point()
    .encode(alt.Y('experiment').axis(orient='right'), color=color)
    .add_params(selection)
)

# Named `*_chart` so the first panel no longer shadows the builtin `iter`.
iteration_chart = (
    alt.Chart(df_cactus, title='Cumulative Resolved by Iteration')
    .mark_line()
    .encode(
        alt.X('iteration:Q').title('Iteration'),
        alt.Y('mean(resolved_by_iteration):Q').title('Cumulative Resolved'),
        color=color,
    )
    .properties(width=150, height=250)
)

token_chart = (
    alt.Chart(df_cactus, title='Cumulative Resolved by Token Consumption')
    .mark_line()
    .encode(
        alt.X('total_tokens:Q').title('Total Tokens'),
        alt.Y('mean(resolved_by_total_tokens):Q').title(None),
        color=color,
    )
    .properties(width=150, height=250)
)

latency_chart = (
    alt.Chart(df_cactus, title='Cumulative Resolved by Latency')
    .mark_line()
    .encode(
        alt.X('response_latency:Q').title('Response Latency'),
        alt.Y('mean(resolved_by_response_latency):Q').title(None),
        color=color,
    )
    .properties(width=150, height=250)
)

iteration_chart | token_chart | latency_chart | legend

In [None]:
# A useful spot check is to look at patterns in the event history. This graph plots each instance
# as a row of circles representing the steps taken by the agent. The color corresponds to the type
# of action taken by the agent, and the size of the circle corresponds to the size of the message
# sent to the LLM.

def per_step(output: EvaluationOutput, result: SWEBenchResult) -> Iterable[dict[str, Any]]:
    """Yield one row per event in ``output.history`` for the event-history plot.

    An event without an ``"action"`` key is treated as an ``"observation"`` and measured by
    its ``"content"``; any other event is measured by its ``"message"``. For ``"run"`` and
    ``"run_ipython"`` actions, everything up to and including the first ``":"`` of the
    message is dropped before measuring (a message without a colon measures as empty).

    Args:
        output: Evaluation output providing ``history`` (a sequence of dict-like events).
        result: Unused here; accepted so the signature matches the callback shape.

    Yields:
        ``{"action": ..., "size": ..., "step": ...}`` where ``size`` is the length of the
        measured text and ``step`` is the event's zero-based position in the history.
    """
    for position, event in enumerate(output.history):
        kind = event.get("action", "observation")

        if kind == "observation":
            text = event["content"]
        else:
            text = event["message"]
            if kind in ("run", "run_ipython"):
                # Keep only what follows the first colon.
                text = text.partition(":")[2]

        yield dict(action=kind, size=len(text), step=position)



# One circle per history event: x is the step, y is the instance, color is the action type
# and circle size is the message size. Axes and legends are hidden; hover for the values.
(
    alt.Chart(pd.concat([evaluation.multi_to_dataframe(per_step) for evaluation in data]))
    .mark_circle(opacity=1)
    .encode(
        alt.X("step:O", title=None, axis=None),
        alt.Y("instance_id", title=None, axis=None),
        alt.Color("action:N", title="Action", legend=None),
        alt.Size("size:Q", title="Message Size", scale=alt.Scale(type="sqrt"), legend=None),
        alt.Row("experiment"),
        tooltip=[alt.Tooltip("size:Q"), alt.Tooltip("action:N")],
    )
    .properties(width=1000, height=500)
)