In [67]:
import os
from typing import Iterable

import altair as alt
import pandas as pd
from pydantic import BaseModel

from evaluation.utils.shared import EvalMetadata, EvalOutput


class SWEBenchTestReport(BaseModel):
    """Boolean outcome flags attached to one evaluated instance.

    ``summarize`` reads these flags to label each instance's outcome, and
    spreads ``model_dump()`` of this model into its result rows, so the field
    order below is also the column order of those rows.

    The per-field notes are inferred from the field names only -- TODO confirm
    against the evaluation harness that writes the report.
    """

    # presumably: the agent produced no patch at all -- TODO confirm
    empty_generation: bool
    # presumably: the instance's tests passed with the patch applied -- TODO confirm
    resolved: bool
    # presumably: the patch could not be applied to the repository -- TODO confirm
    failed_apply_patch: bool
    # presumably: the evaluation itself errored -- TODO confirm
    error_eval: bool
    # presumably: the test run exceeded its time limit -- TODO confirm
    test_timeout: bool


class SWEBenchTestResult(BaseModel):
    """The ``test_result`` payload of a single evaluation record."""

    # presumably the patch text that was evaluated -- TODO confirm with the producer
    git_patch: str
    # Outcome flags for this instance (see SWEBenchTestReport).
    report: SWEBenchTestReport


class SWEBenchResult(BaseModel):
    """One record of ``output.swebench_eval.jsonl``.

    ``Data.from_filepath`` validates each line of that file into this model.
    """

    # Identifier used to pair this record with the EvalOutput of the same instance.
    instance_id: str
    # Patch and outcome flags for the instance.
    test_result: SWEBenchTestResult


# Directory that holds one sub-directory per evaluation run. Set the
# EVAL_OUTPUTS_DIR environment variable to point the notebook at another
# checkout or machine; the default keeps the original location.
OUTPUTS_DIR = os.environ.get(
    'EVAL_OUTPUTS_DIR',
    '/Users/calvin/all-hands/OpenHands/evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent',
)

# Run directories to compare. Comment an entry out to drop it from the analysis.
RUN_NAMES = [
    'claude-3-5-sonnet-20241022_maxiter_50_N_v0.15.2-no-hint-forget-keep-first-1-max-events-10-run_1',
    # 'claude-3-5-sonnet-20241022_maxiter_50_N_v0.15.2-no-hint-forget-keep-first-1-max-events-20-run_1',
    'claude-3-5-sonnet-20241022_maxiter_50_N_v0.15.2-no-hint-recent-keep-first-1-max-events-10-run_1',
    'claude-3-5-sonnet-20241022_maxiter_50_N_v0.15.2-no-hint-no-condensation-run_1',
    'claude-3-5-sonnet-20241022_maxiter_50_N_v0.15.2-no-hint-run_1',
]

# Full path of every run directory, in the order listed above.
filepaths = [os.path.join(OUTPUTS_DIR, run_name) for run_name in RUN_NAMES]


class Data(BaseModel):
    """Everything loaded from one evaluation run directory.

    Attributes:
        filepath: Directory the artifacts were read from.
        metadata: Parsed contents of ``metadata.json``.
        output: One ``EvalOutput`` per non-blank line of ``output.jsonl``.
        results: One ``SWEBenchResult`` per non-blank line of
            ``output.swebench_eval.jsonl``.
    """

    filepath: str
    metadata: EvalMetadata
    output: list[EvalOutput]
    results: list[SWEBenchResult]

    @staticmethod
    def from_filepath(filepath: str) -> 'Data':
        """Load and validate the three artifacts stored in ``filepath``.

        Args:
            filepath: Run directory containing ``metadata.json``,
                ``output.jsonl`` and ``output.swebench_eval.jsonl``.

        Returns:
            A populated ``Data`` instance.

        Raises:
            OSError: If any of the three files cannot be opened.
            pydantic.ValidationError: If a file's content does not match its model.
        """
        with open(os.path.join(filepath, 'metadata.json'), encoding='utf-8') as f:
            metadata = EvalMetadata.model_validate_json(f.read())

        # Blank lines are skipped: they carry no record and would otherwise
        # fail JSON validation.
        with open(os.path.join(filepath, 'output.jsonl'), encoding='utf-8') as f:
            output = [
                EvalOutput.model_validate_json(line) for line in f if line.strip()
            ]

        with open(
            os.path.join(filepath, 'output.swebench_eval.jsonl'), encoding='utf-8'
        ) as f:
            results = [
                SWEBenchResult.model_validate_json(line) for line in f if line.strip()
            ]

        return Data(
            filepath=filepath, metadata=metadata, output=output, results=results
        )

    def get_output(self, instance_id: str) -> EvalOutput:
        """Return the first output whose ``instance_id`` matches.

        Raises:
            KeyError: If no output has that id; the id is attached to the error.
        """
        for output in self.output:
            if output.instance_id == instance_id:
                return output

        raise KeyError(instance_id)

    def get_result(self, instance_id: str) -> SWEBenchResult:
        """Return the first evaluation result whose ``instance_id`` matches.

        Raises:
            KeyError: If no result has that id; the id is attached to the error.
        """
        for result in self.results:
            if result.instance_id == instance_id:
                return result

        raise KeyError(instance_id)

    def instance_ids(self) -> Iterable[str]:
        """Yield the ``instance_id`` of every loaded output, in file order."""
        for output in self.output:
            yield output.instance_id

    def experiment(self) -> str:
        """Derive a short experiment label from the run directory name.

        A trailing ``-run_<digits>`` suffix is removed, then the text after the
        last ``no-hint-`` is returned. A directory that ends in ``no-hint``
        with nothing after it is labelled ``'no-hint'``, so it no longer
        falls through to the full path. Any other name is returned as the
        trimmed directory name.
        """
        # normpath drops a trailing slash so basename is never empty for a
        # directory path.
        name = os.path.basename(os.path.normpath(self.filepath))

        # Strip the run suffix by pattern rather than by a fixed character
        # count, so "run_10" is handled the same way as "run_1".
        stem, separator, run_number = name.rpartition('-run_')
        if separator and run_number.isdigit():
            name = stem

        if 'no-hint-' in name:
            return name.split('no-hint-')[-1]
        if name.endswith('no-hint'):
            return 'no-hint'
        return name

In [68]:
# Load every configured run directory, keeping the order of `filepaths`.
data = list(map(Data.from_filepath, filepaths))

In [69]:
def _dig(container, *keys, default=0):
    """Index ``container`` by each key in turn; return ``default`` on a miss.

    A missing key, a non-indexable intermediate value (such as ``None``) and a
    final value of ``None`` all produce ``default``.
    """
    current = container
    try:
        for key in keys:
            current = current[key]
    except (KeyError, TypeError, IndexError):
        return default
    return default if current is None else current


def usage(output: 'EvalOutput') -> Iterable[dict[str, float]]:
    """Yield per-step token, cache and latency figures for one run.

    Only history steps that carry
    ``tool_call_metadata -> model_response -> {id, usage}`` produce a row;
    every other step is skipped. Missing or ``None`` usage fields count as 0.

    Args:
        output: Object exposing ``history`` (a sequence of step dicts) and
            ``metrics`` (a mapping that may hold ``response_latencies``).

    Yields:
        A dict with ``prompt_tokens``, ``completion_tokens``, ``cache_reads``,
        ``cache_writes``, ``response_latency`` and ``iteration``. ``iteration``
        is the step's index in the history divided by two, so it may be
        fractional.
    """
    # Index the latencies once instead of rescanning the list for every step.
    # setdefault keeps the first entry per response id, which is what a
    # first-match scan would have returned.
    latency_by_response = {}
    for entry in _dig(output.metrics, 'response_latencies', default=()):
        latency_by_response.setdefault(entry['response_id'], entry['latency'])

    for iteration, step in enumerate(output.history or ()):
        try:
            model_response = step['tool_call_metadata']['model_response']
            response_id = model_response['id']
            step_usage = model_response['usage']
        except (KeyError, TypeError):
            # Either the step has no model response, or one of the nested
            # values is None / not a mapping.
            continue

        yield {
            'prompt_tokens': _dig(step_usage, 'prompt_tokens'),
            'completion_tokens': _dig(step_usage, 'completion_tokens'),
            'cache_reads': _dig(step_usage, 'prompt_tokens_details', 'cached_tokens'),
            'cache_writes': _dig(step_usage, 'cache_creation_input_tokens'),
            'response_latency': latency_by_response.get(response_id, 0),
            'iteration': iteration / 2,
        }


def total_usage(output: EvalOutput) -> dict[str, int]:
    """Sum the per-step figures yielded by ``usage`` over a whole run.

    Returns:
        A dict with the totals for ``prompt_tokens``, ``completion_tokens``,
        ``cache_reads``, ``cache_writes`` and ``response_latency``. Every total
        is 0 when ``usage`` yields nothing.
    """
    totals = {
        'prompt_tokens': 0,
        'completion_tokens': 0,
        'cache_reads': 0,
        'cache_writes': 0,
        'response_latency': 0,
    }

    for step_figures in usage(output):
        for field in totals:
            totals[field] += step_figures[field]

    return totals

In [70]:
def summarize(data: Data) -> pd.DataFrame:
    """Build one row per instance with its outcome label and usage totals.

    Instances that lack either an output or an evaluation result are skipped.

    Returns:
        A frame with ``experiment``, ``instance_id``, ``error`` (the outcome
        label), the totals from ``total_usage``, ``history_length`` and the
        report flags.
    """
    rows = []
    for instance_id in data.instance_ids():
        try:
            output = data.get_output(instance_id)
            result = data.get_result(instance_id)
        except KeyError:
            continue

        report = result.test_result.report

        # Outcome label, in priority order: resolved, then a recognised agent
        # error, then evaluation-side failures, then an empty generation.
        if report.resolved:
            outcome = 'resolved'
        elif output.error and output.error.startswith('Agent got stuck in a loop'):
            outcome = 'event loop'
        elif output.error and output.error.startswith(
            'Agent reached maximum iteration'
        ):
            outcome = 'iter. limit'
        elif report.error_eval or report.failed_apply_patch or report.test_timeout:
            outcome = 'test failure'
        elif report.empty_generation:
            outcome = 'empty gen.'
        else:
            outcome = 'other'

        rows.append(
            {
                'experiment': data.experiment(),
                'instance_id': instance_id,
                'error': outcome,
                **total_usage(output),
                'history_length': len(output.history),
                **report.model_dump(),
            }
        )

    return pd.DataFrame(rows)


# One summary frame per run, stacked into a single table for the charts below.
summary_frames = [summarize(run) for run in data]
df = pd.concat(summary_frames)

In [71]:
# Arc chart of resolved vs. unresolved instance counts, one panel per experiment.
resolution_arcs = alt.Chart(df, title='Resolution Rate').mark_arc()
resolution_arcs = resolution_arcs.encode(
    color=alt.Color('resolved'),
    theta=alt.Theta('count()'),
    column=alt.Column('experiment').title(None),
)
resolution_arcs.properties(width=150)

In [72]:
# Bar chart of instance counts by iteration (history length / 2), coloured by
# whether the instance was resolved, one panel per experiment.
iteration_bars = alt.Chart(df).mark_bar()
iteration_bars = iteration_bars.encode(
    x=alt.X('x:Q').title('Iterations'),
    y=alt.Y('count()').title('# of Instances'),
    color=alt.Color('resolved').legend(None),
    column='experiment',
)
iteration_bars = iteration_bars.transform_calculate(x='datum.history_length / 2')
iteration_bars.properties(width=150, height=150)

In [73]:
# Arc chart of the outcome labels assigned by summarize(), one panel per experiment.
outcome_arcs = alt.Chart(df, title='Benchmark Outcomes').mark_arc()
outcome_arcs = outcome_arcs.encode(
    theta=alt.Theta('count()').title('# of Instances'),
    color=alt.Color('error').title('Outcomes'),
    column=alt.Column('experiment').title(None),
)
outcome_arcs.properties(width=150, height=150)

In [74]:
def per_iteration(data: Data) -> pd.DataFrame:
    """Build one row per LLM-backed step for every instance of a run.

    Each instance's rows are sorted by ``iteration`` and extended with running
    cache totals. Instances without an output or an evaluation result, and
    instances for which ``usage`` yields no rows, are skipped.

    Returns:
        The per-instance frames concatenated, with columns ``experiment``,
        ``instance_id``, ``resolved``, the fields yielded by ``usage``,
        ``cumulative_cache_reads``, ``cumulative_cache_writes`` and
        ``cache_efficiency`` (cumulative reads minus cumulative writes).
        An empty frame when no instance contributed rows.
    """
    experiment = data.experiment()  # same for every row, so compute it once

    tables = []
    for instance_id in data.instance_ids():
        try:
            output = data.get_output(instance_id)
            result = data.get_result(instance_id)
        except KeyError:
            continue

        resolved = result.test_result.report.resolved
        table = [
            {
                'experiment': experiment,
                'instance_id': instance_id,
                'resolved': resolved,
                **step_usage,
            }
            for step_usage in usage(output)
        ]
        if not table:
            # A frame built from no rows has no 'iteration' column, so
            # sorting it below would raise KeyError.
            continue

        table_frame = pd.DataFrame(table).sort_values(by='iteration')
        table_frame['cumulative_cache_reads'] = table_frame['cache_reads'].cumsum()
        table_frame['cumulative_cache_writes'] = table_frame['cache_writes'].cumsum()
        table_frame['cache_efficiency'] = (
            table_frame['cumulative_cache_reads']
            - table_frame['cumulative_cache_writes']
        )

        tables.append(table_frame)

    if not tables:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    return pd.concat(tables)


# Per-step usage rows for every run, stacked into one table.
per_iteration_frames = [per_iteration(run) for run in data]
df_pi = pd.concat(per_iteration_frames)

alt.data_transformers.enable('vegafusion')

# Mean prompt tokens at each iteration, one line per experiment.
line = alt.Chart(df_pi).mark_line()
line = line.encode(
    x=alt.X('iteration').title('Iteration'),
    y=alt.Y('mean(prompt_tokens)').title('Prompt Tokens'),
    color=alt.Color('experiment').title('Experiment'),
)

# Confidence-interval band over the same data; this layer carries the title.
band = alt.Chart(df_pi, title='Average Token Usage per Iteration')
band = band.mark_errorband(extent='ci')
band = band.encode(
    x=alt.X('iteration').title('Iteration'),
    y=alt.Y('prompt_tokens').title('Prompt Tokens'),
    color=alt.Color('experiment').title('Experiment'),
)

band + line

In [75]:
# Mean cache efficiency (cumulative reads minus writes) at each iteration,
# one line per experiment.
line = alt.Chart(df_pi).mark_line()
line = line.encode(
    x=alt.X('iteration').title('Iteration'),
    y=alt.Y('mean(cache_efficiency)').title('Cache Reads - Cache Writes'),
    color=alt.Color('experiment').title('Experiment'),
)

# Confidence-interval band over the same data; this layer carries the title.
band = alt.Chart(df_pi, title='Average Cache Efficiency per Iteration')
band = band.mark_errorband(extent='ci')
band = band.encode(
    x=alt.X('iteration').title('Iteration'),
    y=alt.Y('cache_efficiency').title('Cache Reads - Cache Writes'),
    color=alt.Color('experiment').title('Experiment'),
)

band + line

In [76]:
def per_instance(data: Data) -> pd.DataFrame:
    """Build one row per instance with usage totals and cumulative resolved counts.

    Instances that lack either an output or an evaluation result are skipped.

    For each of ``iteration``, ``prompt_tokens``, ``completion_tokens``,
    ``total_tokens`` and ``response_latency`` a ``resolved_by_<field>`` column
    holds the number of resolved instances whose ``<field>`` is less than or
    equal to that row's value. Rows tied on the field therefore share the same
    count, so the value does not depend on how the sort orders ties.

    Returns:
        The frame described above, ordered by ``response_latency`` (the last
        field processed). An empty frame when no instance contributed a row.
    """
    experiment = data.experiment()  # same for every row, so compute it once

    rows = []
    for instance_id in data.instance_ids():
        try:
            output = data.get_output(instance_id)
            evaluation = data.get_result(instance_id)
        except KeyError:
            continue

        rows.append(
            {
                'experiment': experiment,
                'instance_id': instance_id,
                'resolved': 1 if evaluation.test_result.report.resolved else 0,
                'iteration': len(output.history) // 2,
                **total_usage(output),
            }
        )

    frame = pd.DataFrame(rows)
    if frame.empty:
        # No rows means no columns, so the arithmetic below would raise KeyError.
        return frame

    frame['total_tokens'] = frame['prompt_tokens'] + frame['completion_tokens']

    for field in (
        'iteration',
        'prompt_tokens',
        'completion_tokens',
        'total_tokens',
        'response_latency',
    ):
        frame = frame.sort_values(by=field, kind='stable')
        # Resolved count per distinct value, accumulated in ascending order of
        # that value, then mapped back so tied rows get the same total.
        resolved_up_to = frame.groupby(field)['resolved'].sum().cumsum()
        frame[f'resolved_by_{field}'] = frame[field].map(resolved_up_to)

    return frame


# NOTE: this rebinds `df`, which the earlier cells bound to the summarize()
# table. Re-running those earlier chart cells after this one will read this
# per-instance frame instead.
instance_frames = [per_instance(run) for run in data]
df = pd.concat(instance_frames)

# Cumulative resolved count against iteration, one line per experiment.
resolved_by_iteration = alt.Chart(
    df, title='Cumulative Resolved by Iteration'
).mark_line()
resolved_by_iteration.encode(
    x=alt.X('iteration:Q').title('Iteration'),
    y=alt.Y('mean(resolved_by_iteration):Q').title('Cumulative Resolved'),
    color=alt.Color('experiment').title('Experiment'),
)

In [77]:
# Cumulative resolved count against total tokens, one line per experiment.
resolved_by_tokens = alt.Chart(
    df, title='Cumulative Resolved by Token Consumption'
).mark_line()
resolved_by_tokens.encode(
    x=alt.X('total_tokens:Q').title('Total Tokens'),
    y=alt.Y('mean(resolved_by_total_tokens):Q').title('Cumulative Resolved'),
    color=alt.Color('experiment').title('Experiment'),
)

In [78]:
# Cumulative resolved count against summed response latency, one line per experiment.
resolved_by_latency = alt.Chart(
    df, title='Cumulative Resolved by Latency'
).mark_line()
resolved_by_latency.encode(
    x=alt.X('response_latency:Q').title('Response Latency'),
    y=alt.Y('mean(resolved_by_response_latency):Q').title('Cumulative Resolved'),
    color=alt.Color('experiment').title('Experiment'),
)