In [10]:
import os
from typing import Any
from pydantic import BaseModel

import pandas as pd

from analysis.models.openhands import EvaluationMetadata, EvaluationOutput
from analysis.models.swe_bench import Dataset, Split
from analysis.models.patch import Patch


# The file format isn't currently supported directly, so we'll find the data manually

class Report(BaseModel):
    resolved_ids: list[str]
    unresolved_ids: list[str]

def load(filepath: str) -> dict[str, Any]:
    result = {}
    result["filepath"] = filepath

    with open(os.path.join(filepath, "metadata.json")) as f:
        result["metadata"] = EvaluationMetadata.model_validate_json(f.read())

    with open(os.path.join(filepath, "output.jsonl")) as f:
        result["output"] = [EvaluationOutput.model_validate_json(line) for line in f.readlines()]

    with open(os.path.join(filepath, "report.json")) as f:
        result["results"] = Report.model_validate_json(f.read())

    return result

data = load("/Users/calvin/all-hands/data/finish")
dataset = Dataset.from_split(Split.VERIFIED)

In [None]:
from analysis.features.metrics import apply_metrics
from analysis.features.metrics import (
    CodeMetrics,
    InstanceMetrics,
    PatchMetrics,
    TypeMetrics,
    ErrorMetrics,
    DependencyMetrics,
)
from analysis.features.metrics.localization_metrics import (
    LocalizationMetrics,
    in_patch,
    is_reproduction_attempt,
)
import json


def get_finish_results(step: dict[str, Any]) -> dict[str, Any]:
    # assert 'action' in action, f"Last action is not finish: {action}"
    if not step.get("action", "") == "finish":
        result = {
            "message": "",
            "task_completed": "unknown"
        }
        print(step)
        return result

    print(step["args"]["task_completed"])

    result = json.loads(step["tool_call_metadata"]["model_response"]["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"])
    print(result)
    return result
    


rows = []
for output in data["output"]:
    # Grab the instance from the dataset and the associated patch
    instance = [
        instance
        for instance in dataset.instances
        if instance.instance_id == output.instance_id
    ][0]
    try:
        patch = Patch.from_instance(instance)
    except Exception as e:
        print(f"Failed to load patch for {instance.instance_id}: {e}")
        continue

    if not output.history:
        continue

    # Now compute a bunch of metrics:
    row = {
        "repo": output.instance_id.split("__")[0],
        "instance_id": output.instance_id,
        "resolved": output.instance_id in data["results"].resolved_ids,
        **InstanceMetrics.from_instance(instance).to_dict(),
        **apply_metrics(
            patch,
            metrics={
                "code": CodeMetrics,
                "dependency": DependencyMetrics,
                "error": ErrorMetrics,
                "type": TypeMetrics,
            },
        ),
        **PatchMetrics.from_patch(patch).to_dict(),
        **LocalizationMetrics.from_trajectory(
            output.history,
            lambda loc: in_patch(loc, patch)
            or is_reproduction_attempt(loc, output.instance_id.split("__")[0]),
        ).to_dict(),
        **get_finish_results(output.history[-1])
    }
    rows.append(row)

df = pd.DataFrame(rows)

{'message': 'I\'ve successfully implemented a fix for the issue with duplicate locations in the Message class of Sphinx\'s gettext builder. Here\'s a summary of what I did:\n\n1. **Identified the Issue**: The problem was in the `Message` class in `sphinx/builders/gettext.py`, where duplicate locations were not being removed, causing redundant entries in the generated .pot files.\n\n2. **Implemented the Fix**: I modified the `Message` class to include a `uniqueLocation` method that removes duplicate locations while preserving order:\n   ```python\n   def uniqueLocation(self, locations: List[Tuple[str, int]]) -> List[Tuple[str, int]]:\n       """Remove duplicate locations while preserving order."""\n       seen = set()\n       unique_locations = []\n       for location in locations:\n           if location not in seen:\n               seen.add(location)\n               unique_locations.append(location)\n       return unique_locations\n   ```\n   And updated the `__init__` method to use t



{'id': 201, 'timestamp': '2025-02-27T22:27:40.705533', 'source': 'agent', 'message': 'Command `find /workspace/sphinx-doc__sphinx__3.1/tests -name "test_*.py" | grep -i autodoc` executed with exit code 0.', 'cause': 200, 'observation': 'run', 'tool_call_metadata': {'function_name': 'execute_bash', 'tool_call_id': 'toolu_01UMzsuqXtYZSpBuh1dd5cNs', 'model_response': {'id': 'chatcmpl-314c5b1d-6b39-4ccc-8063-35fbdeb23c1b', 'created': 1740695259, 'model': 'litellm_proxy/claude-3-7-sonnet-20250219', 'object': 'chat.completion', 'system_fingerprint': None, 'choices': [{'finish_reason': 'tool_calls', 'index': 0, 'message': {'content': "Great! All the tests pass. Let's also run the tests for the `autodoc` module:", 'role': 'assistant', 'tool_calls': [{'index': 1, 'function': {'arguments': '{"command": "find /workspace/sphinx-doc__sphinx__3.1/tests -name \\"test_*.py\\" | grep -i autodoc", "is_input": "false"}', 'name': 'execute_bash'}, 'id': 'toolu_01UMzsuqXtYZSpBuh1dd5cNs', 'type': 'function'}



{'message': "I've successfully implemented the necessary changes to fix the issue with `model_to_dict()` not returning an empty dict when called with an empty list of fields.\n\nHere's a summary of what I did:\n\n1. First, I explored the repository structure and located the `model_to_dict()` function in `/workspace/django__django__3.0/django/forms/models.py`.\n\n2. I created a script to reproduce the issue, which confirmed that when `model_to_dict()` was called with `fields=[]`, it was returning all fields instead of an empty dict.\n\n3. I identified the root cause of the issue:\n   - The condition `if fields and f.name not in fields:` was evaluating to `False` when `fields=[]` because an empty list is falsy in Python.\n   - As a result, the `continue` statement was not executed, and all fields were included in the result.\n\n4. I implemented a two-part solution:\n   - Changed the condition to `if fields is not None and f.name not in fields:` to handle empty lists correctly.\n   - Adde



{'message': 'I\'ve successfully fixed the issue with Unicode method names causing UnicodeDecodeError in Python 2.7.2. Here\'s a summary of what I did:\n\n1. **Identified the root cause**: When a Unicode method name (e.g., `u\'POST\'`) is used with files in Python 2.7.2, it remains a Unicode string throughout the request process, which causes a `UnicodeDecodeError` when it\'s concatenated with binary data.\n\n2. **Implemented a fix**:\n   - Modified the `prepare_method` function in `models.py` to convert the uppercased method to a native string using `to_native_string()`:\n     ```python\n     def prepare_method(self, method):\n         """Prepares the given HTTP method."""\n         self.method = method\n         if self.method is not None:\n             self.method = to_native_string(self.method.upper())\n     ```\n   - Removed the redundant `.upper()` call in `sessions.py` when creating the Request object, as the uppercasing will be handled in the `prepare_method` function.\n\n3. **C



{'message': "I've successfully implemented the necessary changes to fix the issue with the migration autodetector and FK field with hardcoded reference. Here's a summary of what I did:\n\n1. **Identified the Issue**: The problem was in the `only_relation_agnostic_fields` method of the `MigrationAutodetector` class. When a custom field's `deconstruct` method removes the 'to' key, the autodetector tries to delete it again with `del deconstruction[2]['to']`, causing a KeyError.\n\n2. **Created a Reproduction Script**: I created a script that demonstrates the issue with a custom ForeignKey field that removes the 'to' key in its `deconstruct` method.\n\n3. **Fixed the Issue**: I replaced `del deconstruction[2]['to']` with `deconstruction[2].pop('to', None)` in the `only_relation_agnostic_fields` method. This safely removes the key if it exists and does nothing if it doesn't.\n\n4. **Added Comprehensive Tests**: I created tests for various edge cases:\n   - A custom field that removes 'to' i



{'message': "I've successfully implemented a fix for the issue where `django.utils.formats.get_format` should allow lazy parameters.\n\nHere's a summary of what I did:\n\n1. **Identified the issue**: The `get_format` function in `django/utils/formats.py` was not handling lazy strings properly, causing a TypeError when using the date template filter with a lazy string like `some_date|date:_('Y-m-d')`.\n\n2. **Created a reproduction script**: I created scripts to reproduce the error and confirm that it was occurring when using lazy strings with the date filter.\n\n3. **Implemented the fix**: I modified the `get_format` function to handle lazy strings by:\n   - Adding a check for None values with a clear error message\n   - Converting the `format_type` parameter to a string using `str()` before using it with `getattr()`\n\n4. **Tested the fix**: I created comprehensive tests to verify that:\n   - Basic lazy strings work correctly\n   - Nested lazy strings work correctly\n   - Template fil



{'message': "I've successfully fixed the issue with the PostgreSQL database client's `runshell` function not respecting `os.environ` values in some cases.\n\n### The Problem\nThe issue was that the PostgreSQL client's `settings_to_cmd_args_env` method was returning an empty dictionary (`{}`) when no PostgreSQL-specific environment variables were needed, instead of returning `None`. This caused the base client's `runshell` method to ignore `os.environ` and pass an empty environment to the subprocess, which prevented environment variables from being properly passed to the PostgreSQL client.\n\n### The Solution\nI modified the PostgreSQL client implementation in `/workspace/django__django__4.0/django/db/backends/postgresql/client.py` to:\n\n1. Initialize `env` as `None` instead of an empty dictionary\n2. Collect any PostgreSQL-specific environment variables in a separate `env_vars` dictionary\n3. Only set `env` to the `env_vars` dictionary if there are actually environment variables to se

In [44]:
# Add a column for all instances where `resolved` and `task_completed` do not agree
df["final_thought_mismatch"] = df.apply(lambda row: (row["resolved"] and row["task_completed"] != "true") or ((not row["resolved"]) and row["task_completed"] != "false"), axis=1)

In [50]:
from analysis.features.relevance import feature_relevance
from itertools import product

targets = ["resolved", "task_completed", "final_thought_mismatch"]
# The features are all columns containing ints or floats that arent targets
features = [col for col in df.columns if col not in targets and df[col].dtype in [int, float]]

relevances = []
for feature, target in product(features, targets):
    try:
        relevance = feature_relevance(df, feature, target)
        relevances.append(relevance)
    except Exception as e:
        continue

# Find the 10 most relevant features (with the largest effect size)
relevances = sorted(relevances, key=lambda rel: rel.effect_size, reverse=True)[:10]

In [51]:
import altair as alt

# Create a chart for each of the top 10 features
charts = []
for relevance in relevances:
    chart = (
        alt.Chart(df)
        .mark_boxplot()
        .encode(
            x=alt.X(relevance.target),
            y=alt.Y(relevance.feature),
        )
    )
    charts.append(chart)

# And lay the charts out horizotnally
alt.hconcat(*charts)

In [54]:
df['task_completed'].value_counts()

task_completed
true       344
unknown    155
Name: count, dtype: int64