In [1]:
from analysis.models.data import Data

with open("../data.json") as f:
    data = Data.model_validate_json(f.read())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from analysis.models.openhands import Evaluation
import os

# Load all data in the indicated directory
EXPERIMENT_DIRECTORY = "/Users/calvin/all-hands/data/localization"
subdirs = [d for d in os.listdir(EXPERIMENT_DIRECTORY) if os.path.isdir(os.path.join(EXPERIMENT_DIRECTORY, d))]
evals = {
    subdir: Evaluation.from_filepath(os.path.join(EXPERIMENT_DIRECTORY, subdir))
    for subdir in subdirs
}

print(f"Loaded {len(evals)} evaluations: {', '.join(evals.keys())}")

Loaded 5 evaluations: localization-phase-prompt, localization-prompt-3, localization-prompt-2, localization-prompt-1, localization-default-prompt


In [3]:
from typing import Any, Iterable
# Utility functions for checking if a step is a read/write, what the location is, etc.

def is_a_read(step: dict[str, Any], location: str | None = None, excluding: Iterable[str] | None = None) -> bool:
    if location is not None:
        if location not in str(step):
            return False
        
    if excluding is not None:
        if any(ex in str(step) for ex in excluding):
            return False
        
    return "read" in step.get("action", "")

def is_a_write(step: dict[str, Any], location: str | None = None, excluding: Iterable[str] | None = None) -> bool:
    if location is not None:
        if location not in str(step):
            return False
        
    if excluding is not None:
        if any(ex in str(step) for ex in excluding):
            return False
        
    return "edit" in step.get("observation", "")

def get_touched_file(step: dict[str, Any]) -> str:
    return step["message"].split(" ")[-1]


In [4]:
from typing import Any, Iterable
import pandas as pd

from analysis.models.patch import Patch

def localization_score(evaluation_output: Evaluation, experiment: str | None = None) -> pd.DataFrame:
    rows = []
    for trajectory in evaluation_output.output:
        # Load the instance and generate a patch. We'll use this to find all files that have to be touched by a solution
        # and to compute metrics over the file complexity (to see if these are correlated with OH's ability to localize)
        instance = data.get_instance(trajectory.instance_id)
        gold_patch = Patch.from_instance(instance)

        for file in gold_patch.diffs.keys():
            all_reads = [i for i, step in enumerate(trajectory.history) if is_a_read(step, file)]
            all_writes = [i for i, step in enumerate(trajectory.history) if is_a_write(step, file)]
            
            rows.append({
                "experiment": experiment,
                "repo": trajectory.instance_id.split("__")[0],
                "instance_id": trajectory.instance_id,
                "resolved": evaluation_output.is_resolved(trajectory.instance_id),
                "file": file,
                "file_in_problem_statement": file.split("/")[-1] in instance.problem_statement,
                "path_in_problem_statement": file in instance.problem_statement,
                "file_depth": len(file.split("/")),
                "steps": len(trajectory.history),
                "reads": len(all_reads),
                "writes": len(all_writes),
                "first_read": all_reads[0] if all_reads else -1,
                "last_read": all_reads[-1] if all_reads else -1,
                "first_write": all_writes[0] if all_writes else -1,
                "last_write": all_writes[-1] if all_writes else -1,
                "total_reads": sum(1 for step in trajectory.history if is_a_read(step)),
                "total_writes": sum(1 for step in trajectory.history if is_a_write(step)),
                "other_reads": sum(1 for step in trajectory.history if is_a_read(step, excluding=[file])),
                "other_writes": sum(1 for step in trajectory.history if is_a_write(step, excluding=[file])),
            })

    return pd.DataFrame(rows)

localization = pd.concat([localization_score(evaluation_output, experiment) for experiment, evaluation_output in evals.items()])
localization

Unnamed: 0,experiment,repo,instance_id,resolved,file,file_in_problem_statement,path_in_problem_statement,file_depth,steps,reads,writes,first_read,last_read,first_write,last_write,total_reads,total_writes,other_reads,other_writes
0,localization-phase-prompt,astropy,astropy__astropy-14309,True,astropy/io/fits/connect.py,True,True,4,34,1,2,3,3,12,16,2,3,1,1
1,localization-phase-prompt,sympy,sympy__sympy-15599,False,sympy/core/mod.py,True,True,3,40,2,3,7,21,10,20,2,6,0,3
2,localization-phase-prompt,scikit-learn,scikit-learn__scikit-learn-14983,True,sklearn/model_selection/_split.py,False,False,3,38,4,2,7,19,22,26,4,4,0,2
3,localization-phase-prompt,scikit-learn,scikit-learn__scikit-learn-25232,True,sklearn/impute/_iterative.py,False,False,3,70,1,12,3,3,14,48,2,16,1,4
4,localization-phase-prompt,django,django__django-13279,True,django/contrib/sessions/backends/base.py,False,False,5,34,1,2,9,9,6,12,1,5,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,localization-default-prompt,sympy,sympy__sympy-22080,False,sympy/printing/precedence.py,False,False,3,101,0,0,-1,-1,-1,-1,8,21,8,21
53,localization-default-prompt,astropy,astropy__astropy-14369,False,astropy/units/format/cds.py,False,False,4,46,1,9,9,9,6,32,2,10,1,1
54,localization-default-prompt,astropy,astropy__astropy-14369,False,astropy/units/format/cds_parsetab.py,False,False,4,46,0,0,-1,-1,-1,-1,2,10,2,10
55,localization-default-prompt,sphinx-doc,sphinx-doc__sphinx-9281,False,sphinx/util/inspect.py,False,False,3,62,3,2,15,29,32,36,9,8,6,6


In [5]:
# Total number of files
print(f"Total number of files: {len(localization)}")

# How many files are never read?
never_read = localization[localization["reads"] == 0]
print(f"Files that are never read: {len(never_read)}")

# Are any of those files in the problem statement?
print(f"...and in the problem statement: {len(never_read[never_read['file_in_problem_statement']])}")

# Do we write to files that are never read?
print(f"...and written to: {len(never_read[never_read['writes'] > 0])}")

# How many instances with a file never read are resolved?
by_instance = never_read.groupby("instance_id").agg({"resolved": "max"})
print(f"...and still resolved: {len(by_instance[by_instance['resolved']])}")

# The instances that are resolved without reading from a location
resolved = by_instance[by_instance["resolved"]].index.tolist()
print(f"\nThe culprits: {resolved}")

# Instances without a location referenced in the problem statemen
wo_referenced_location = localization.groupby("instance_id").agg({"path_in_problem_statement": "max"})
print(f"\nInstances without a location referenced in the problem statement: {len(wo_referenced_location[wo_referenced_location['path_in_problem_statement'] == 0])}")

Total number of files: 288
Files that are never read: 58
...and in the problem statement: 5
...and written to: 2
...and still resolved: 3

The culprits: ['django__django-12155', 'django__django-12663', 'matplotlib__matplotlib-25775']

Instances without a location referenced in the problem statement: 42


In [6]:
# Grab the read/write-based column names
read_write_columns = [col for col in localization.columns if col in ["reads", "writes", "total_reads", "total_writes", "first_read", "last_read", "first_write", "last_write", "other_reads", "other_writes"]]

In [7]:
# Check how much overlap there is for each of the read/write columns using a Mann-Whitney U test
from scipy.stats import mannwhitneyu

mwu = localization.groupby("instance_id").agg({
    "resolved": "max",
    "reads": "sum",
    "writes": "sum",
    "total_reads": "sum",
    "total_writes": "sum",
    "other_reads": "sum",
    "other_writes": "sum",
    "first_read": "min",
    "last_read": "max",
    "first_write": "min",
    "last_write": "max",
    })

resolved = mwu[mwu["resolved"] == 1]
not_resolved = mwu[mwu["resolved"] == 0]

for column in read_write_columns:
    result = mannwhitneyu(resolved[column], not_resolved[column])
    print(f"Metric: {column}, stat: {result.statistic}, p-value: {result.pvalue:.3f}")

Metric: reads, stat: 379.0, p-value: 0.167
Metric: writes, stat: 271.0, p-value: 0.475
Metric: first_read, stat: 443.5, p-value: 0.004
Metric: last_read, stat: 270.0, p-value: 0.463
Metric: first_write, stat: 478.0, p-value: 0.001
Metric: last_write, stat: 239.0, p-value: 0.174
Metric: total_reads, stat: 236.5, p-value: 0.165
Metric: total_writes, stat: 191.0, p-value: 0.023
Metric: other_reads, stat: 178.0, p-value: 0.011
Metric: other_writes, stat: 217.5, p-value: 0.078


In [8]:
import altair as alt

# Visualizing the differences in the distributions of the top 3 features based on resolved or not
plots = []
for feature in read_write_columns:
    chart = alt.Chart(mwu).mark_boxplot().encode(
        alt.X("resolved").title(None).axis(labels=False),
        y=feature,
        color="resolved:N"   
    ).properties(
        width=50,
        height=300
    )
    plots.append(chart)

alt.hconcat(*plots).properties(title="Distribution of read/write metrics based on resolved or not")

In [None]:
# Build a new dataframe based on the read/write performance of each step

rows = []

for trajectory in output.trajectories:
    locations = ["workspace/reproduce.py", *localization[localization["instance_id"] == trajectory.instance_id]["location"]]

    correct_reads, total_reads = 0, 0
    correct_writes, total_writes = 0, 0

    for index, step in enumerate(trajectory.history):
        location: str = ""
        behavior: str = "other"
        # Handle the five cases: good read, bad read, good write, bad write, other
        if is_a_read(step):
            total_reads += 1
            behavior = "bad read"
            message = step.get("message", "")
            location = step["message"].split(" ")[-1]
            
            if any(loc in message for loc in locations) or not location.endswith(".py"):
                correct_reads += 1
                behavior = "good read"
            
            location = step["message"].split(" ")[-1]


        elif is_a_write(step):
            total_writes += 1
            behavior = "bad write"
            message = step.get("message", "")
            if any(location in message for location in locations):
                correct_writes += 1
                behavior = "good write"

            location = step["message"].split(" ")[-1]

        rows.append({
            "repo": trajectory.instance_id.split("__")[0],
            "instance_id": trajectory.instance_id,
            "step": index,
            "behavior": behavior,
            "correct_reads": correct_reads,
            "total_reads": total_reads,
            "read_accuracy": correct_reads / total_reads if total_reads > 0 else 0,
            "correct_writes": correct_writes,
            "total_writes": total_writes,
            "write_accuracy": correct_writes / total_writes if total_writes > 0 else 0,
            "localization_accuracy": (correct_reads + correct_writes) / (total_reads + total_writes) if total_reads + total_writes > 0 else 0,
            "resolved": output.is_resolved(trajectory.instance_id),
            "location": location,
        })

stepwise_localization = pd.DataFrame(rows)

NameError: name 'output' is not defined

In [None]:
stepwise_localization['resolved'].value_counts()

In [None]:
alt.data_transformers.disable_max_rows()

In [None]:
# Graph average read accuracy by repo
alt.Chart(stepwise_localization).mark_line().encode(
    x="step",
    y="mean(read_accuracy)",
    color="repo")

In [None]:
# Graph average write accuracy by repo
alt.Chart(stepwise_localization).mark_line().encode(
    x="step",
    y="mean(write_accuracy)",
    color="repo")

In [None]:
# Graph average localization accuracy by repo
alt.Chart(stepwise_localization).mark_line().encode(
    x="step",
    y="mean(localization_accuracy)",
    color="repo")

In [None]:
# Graph each trajectory by per-step behavior
alt.Chart(stepwise_localization).mark_point(filled=True, size=50).encode(
    alt.X("step:O").title(None).axis(None),
    alt.Y("instance_id:N").title(None),
    alt.Color("behavior:N", title="Behavior", scale=alt.Scale(domain=["good read", "bad read", "good write", "bad write", "other"], range=["green", "red", "blue", "yellow", "gray"])),
    alt.Shape("resolved", scale=alt.Scale(domain=[True, False], range=["square", "cross"])),
    alt.Tooltip("location")
).properties(width=1500, height=4000)