In [1]:
import os
from typing import Any
from pydantic import BaseModel
from analysis.models.openhands import EvaluationMetadata, EvaluationOutput


# The file format isn't currently supported directly, so we'll find the data manually

class Report(BaseModel):
    """Contents of a run's ``report.json``: instance ids grouped by outcome."""

    # Ids of the instances the report lists as resolved
    resolved_ids: list[str]
    # Ids of the instances the report lists as unresolved
    unresolved_ids: list[str]

def load(filepath: str) -> dict[str, Any]:
    """Load the files of one evaluation run from the directory ``filepath``.

    Three files are read from the directory: ``metadata.json``,
    ``output.swebench.jsonl`` (one JSON document per line) and ``report.json``.

    Args:
        filepath: Directory containing the three files above.

    Returns:
        A dict with the keys ``"filepath"`` (the argument, unchanged),
        ``"metadata"`` (an ``EvaluationMetadata``), ``"output"`` (a list of
        ``EvaluationOutput``, one per non-blank line) and ``"results"``
        (a ``Report``).

    Raises:
        OSError: If any of the files cannot be opened.
        Whatever ``model_validate_json`` raises on malformed content is
        propagated unchanged.
    """
    result: dict[str, Any] = {"filepath": filepath}

    # JSON is UTF-8 by specification, so do not rely on the platform default encoding.
    with open(os.path.join(filepath, "metadata.json"), encoding="utf-8") as f:
        result["metadata"] = EvaluationMetadata.model_validate_json(f.read())

    with open(os.path.join(filepath, "output.swebench.jsonl"), encoding="utf-8") as f:
        # Stream the file line by line; blank lines (e.g. trailing newlines left
        # by an editor) carry no record and are skipped rather than failing validation.
        result["output"] = [
            EvaluationOutput.model_validate_json(line)
            for line in f
            if line.strip()
        ]

    with open(os.path.join(filepath, "report.json"), encoding="utf-8") as f:
        result["results"] = Report.model_validate_json(f.read())

    return result

# Root directory holding one sub-directory per system. Set the ALL_HANDS_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the location this notebook was originally run against.
DATA_DIR = os.environ.get("ALL_HANDS_DATA_DIR", "/Users/calvin/all-hands/data")

# Loaded evaluation runs, keyed by system name
data = {
    "sysbox": load(os.path.join(DATA_DIR, "sysbox")),
    "gvisor": load(os.path.join(DATA_DIR, "gvisor")),
}

In [2]:
# Partition the resolved instance ids: resolved by both systems, by sysbox only,
# and by gvisor only
sysbox_resolved = set(data["sysbox"]["results"].resolved_ids)
gvisor_resolved = set(data["gvisor"]["results"].resolved_ids)

instance_ids_resolved_by_both = sysbox_resolved.intersection(gvisor_resolved)
instance_ids_resolved_by_sysbox = sysbox_resolved.difference(gvisor_resolved)
instance_ids_resolved_by_gvisor = gvisor_resolved.difference(sysbox_resolved)

print(f"Instances resolved by both: {len(instance_ids_resolved_by_both)}")
print(f"Instances resolved by sysbox: {len(instance_ids_resolved_by_sysbox)}")
print(f"Instances resolved by gvisor: {len(instance_ids_resolved_by_gvisor)}")

Instances resolved by both: 181
Instances resolved by sysbox: 30
Instances resolved by gvisor: 48


In [3]:
# We'll pull all the data into one big dataframe
import pandas as pd

# And we'll have to pull in some mechanisms for manipulating instances and their features
from analysis.features.metrics import apply_metrics
from analysis.features.metrics.code_metrics import CodeMetrics
from analysis.features.metrics.dependency_metrics import DependencyMetrics
from analysis.features.metrics.error_metrics import ErrorMetrics
from analysis.features.metrics.instance_metrics import InstanceMetrics
from analysis.features.metrics.patch_metrics import PatchMetrics
from analysis.features.metrics.type_metrics import TypeMetrics
from analysis.models.patch import Patch
from analysis.models.swe_bench import Split, Dataset

dataset = Dataset.from_split(Split.VERIFIED)

# Maps instance id -> flat dict of feature values for that instance
instance_metrics: dict[str, Any] = {}

for instance in dataset.instances:
    try:
        patch = Patch.from_instance(instance)
    except Exception as e:
        # Best effort: report the instance and move on without a metrics entry
        print(f"Failed to compute metrics for instance {instance.instance_id}: {e}")
    else:
        # Metrics computed over the diff
        diff_metrics = apply_metrics(
            patch,
            {
                "code": CodeMetrics,
                "type": TypeMetrics,
                "error": ErrorMetrics,
                "dependency": DependencyMetrics,
            },
        )

        # Assemble the row in the same key order as a single merged literal:
        # diff metrics, then patch-level, then instance-level, then the id
        record = dict(diff_metrics)
        record.update(PatchMetrics.from_patch(patch).to_dict(prefix="patch"))
        record.update(InstanceMetrics.from_instance(instance).to_dict(prefix="instance"))
        record["instance_id"] = instance.instance_id

        instance_metrics[instance.instance_id] = record


# One row per (system, evaluated instance). Each row carries the system name, the
# id prefix before the first "__" (e.g. "astropy" for "astropy__astropy-13398"),
# the resolved-by flags, and the instance's precomputed metrics when available.
rows = []
for system, system_data in data.items():
    for output in system_data["output"]:
        instance_id = output.instance_id
        rows.append(
            {
                "system": system,
                "repo": instance_id.split("__")[0],
                "resolved_by_both": instance_id in instance_ids_resolved_by_both,
                "resolved_by_sysbox": instance_id in instance_ids_resolved_by_sysbox,
                "resolved_by_gvisor": instance_id in instance_ids_resolved_by_gvisor,
                # Empty when metric computation failed for this instance
                **instance_metrics.get(instance_id, {}),
                # Always record the id: without this, instances lacking metrics
                # end up with a missing (NaN) instance_id in the dataframe.
                # Placed last so the key keeps its position when metrics exist.
                "instance_id": instance_id,
            }
        )

df = pd.DataFrame(rows)

  from .autonotebook import tqdm as notebook_tqdm


Failed to compute metrics for instance astropy__astropy-13398: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/astropy/astropy/6500928dc0e57be8f06d1162eacc3ba5e2eff692/astropy/coordinates/builtin_frames/itrs_observed_transforms.py




In [4]:
# Derived columns: extend the dataframe with values computed from existing columns

# Resolved by exactly one of the two systems
sysbox_only = df["resolved_by_sysbox"]
gvisor_only = df["resolved_by_gvisor"]
df["resolved_by_xor"] = sysbox_only ^ gvisor_only

In [None]:
# To see if there are any significant patterns, we'll check each feature against the
# classification targets with a simple Mann-Whitney U test.
from itertools import product
from analysis.features.relevance import feature_relevance

# The classification targets are all features (columns in the dataframe) whose
# names start with "resolved_".
classification_targets = [col for col in df.columns if col.startswith("resolved_")]

# Every other column is a candidate feature; non-numeric ones are filtered out below
features = [col for col in df.columns if col not in classification_targets]
print(f"Possible features: {features}")

# Since we're not comparing any features of the model output, we can focus on
# just one system (since the relevant features are the same for both).
# NOTE: this rebinds `df`, so every later cell sees only the sysbox rows.
df = df[df["system"] == "sysbox"]

# Keep only numeric, non-boolean features. The dtype predicates are used instead of
# comparing against the Python `float`/`int` types, because that comparison depends
# on the platform's default integer width and misses int32/float32/nullable columns.
numeric_features = [
    col
    for col in features
    if pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col])
]

# Score every numeric feature against every target with `feature_relevance`
# (described in this notebook as a Mann-Whitney U test; its implementation is not
# shown here).
relevances = []
skipped_pairs = []
for feature, target in product(numeric_features, classification_targets):
    try:
        relevances.append(feature_relevance(df, feature, target))
    except ValueError as e:
        # Presumably raised when the test cannot be computed for this pair
        # (TODO confirm). Keep going, but remember the pair instead of hiding it.
        skipped_pairs.append((feature, target, str(e)))

if skipped_pairs:
    print(f"Skipped {len(skipped_pairs)} feature/target pairs that raised ValueError")

# Show the ten pairs with the largest effect size
for relevance in sorted(relevances, key=lambda x: x.effect_size, reverse=True)[:10]:
    print(f"{relevance.feature} ({relevance.target}): p={relevance.p_value:.3f} (effect size {relevance.effect_size:.2f})")


Possible features: ['system', 'repo', 'code/number_of_functions/before', 'code/number_of_classes/before', 'code/number_of_methods/before', 'code/max_nested_depth/before', 'code/number_of_lines/before', 'code/number_of_comment_lines/before', 'code/number_of_docstring_lines/before', 'code/number_of_control_statements/before', 'code/number_of_variables/before', 'code/average_function_length/before', 'code/max_function_length/before', 'code/number_of_function_parameters/before', 'code/number_of_returns/before', 'code/number_of_imports/before', 'code/number_of_decorators/before', 'code/number_of_functions/after', 'code/number_of_classes/after', 'code/number_of_methods/after', 'code/max_nested_depth/after', 'code/number_of_lines/after', 'code/number_of_comment_lines/after', 'code/number_of_docstring_lines/after', 'code/number_of_control_statements/after', 'code/number_of_variables/after', 'code/average_function_length/after', 'code/max_function_length/after', 'code/number_of_function_paramet