# EquiBench
Coding benchmark for evaluating the performance of LLMs in both understanding and generating code. Specifically, the benchmark focuses on the performance of LLMs as coding agents within large codebases, and on their ability to understand and generate code in a way that is consistent and compatible with the existing codebase.
v1 React repo; PR 32224 (this is probably in the training set, but it proves the concept and provides a starting point for future iterations)

In [None]:
import kaggle_benchmarks as kbench
import subprocess
import os
import re
from pathlib import Path

# ---------------------------------------------------------------------------
# Benchmark metadata
# ---------------------------------------------------------------------------

# Identifier for this benchmark task (encodes the PR it is built around).
TASK_NAME = "react_fiber_suspense_hydration_anywhere_pr_32224"
# Local directory holding the React checkout that the tools and checks run in.
REPO_PATH = "./react"
# Pull request whose changes the agent is asked to reproduce.
PR_NUMBER = 32224
# GitHub "owner/repo" slug of the upstream repository.
REPO_SLUG  = "facebook/react"

# Human-readable description of the PR. It is passed verbatim as `pr_summary`
# to both the analysis prompt and the implementation prompt, so any edit here
# changes what the models are told.
PR_SUMMARY = """
[Fiber] support hydration when rendering Suspense anywhere #32224

**Description:**
This PR addresses a long-standing issue where React could not correctly hydrate a
tree that contained a Suspense boundary if the boundary was not also present on the
server-rendered HTML. This would often lead to the entire tree being de-optimized
and client-rendered from scratch.

The core change is to allow the hydration process to "skip over" unexpected Suspense
boundaries found on the client. Instead of throwing an error, React will now treat
the content inside the Suspense boundary as a client-only insertion.

**Key Changes:**
1. **`react-reconciler`:**
   - Modified `hydrateSuspenseBoundary` and related functions in
     `ReactFiberHydrationContext.js`.
   - Introduced logic to handle cases where a Suspense boundary is encountered
     during hydration but no corresponding server-rendered content exists.
   - The reconciler now attempts to hydrate the children of the Suspense boundary.
     If it can't find matching DOM nodes, it switches to client-rendering mode for
     that subtree.
2. **Fiber Flags:**
   - New or modified flags to track the hydration state of Suspense boundaries.
3. **Testing:**
   - New test cases in `ReactSuspenseHydration-test.js` covering these hydration
     scenarios, including nested Suspense boundaries and client-only boundaries.
"""

# ---------------------------------------------------------------------------
# Ground-truth fallbacks
# These are documented here for reference and used as a sanity check.
# The authoritative GT_FILES / GT_CONTENTS / PR_TEST_FILES are computed
# dynamically in Phase 1 by diffing the actual PR against its merge base.
# ---------------------------------------------------------------------------

# Repo-relative paths expected to be touched by the PR. Kept as a static
# reference set; the notebook text says the authoritative file set is computed
# dynamically from the real PR diff.
FALLBACK_GT_FILES: set[str] = {
    "packages/react-reconciler/src/ReactFiberHydrationContext.js",
    "packages/react-reconciler/src/ReactFiberWorkLoop.js",
    "packages/react-reconciler/src/__tests__/ReactSuspenseHydration-test.js",
}


## Utilities

In [None]:
def run_command(command: str, cwd: str, timeout: int = 600) -> subprocess.CompletedProcess:
    """Execute `command` through the shell in `cwd` and capture its output.

    Args:
        command: Shell command line to execute.
        cwd:     Working directory the command runs in.
        timeout: Seconds to wait before giving up.

    Returns:
        The CompletedProcess from subprocess.run (text mode, stdout and stderr
        captured). A non-zero exit status does not raise. If the command
        exceeds `timeout`, a synthetic CompletedProcess with returncode 1,
        empty stdout, and an explanatory stderr message is returned instead.
    """
    run_options = {
        "cwd": cwd,
        "capture_output": True,
        "text": True,
        "shell": True,
        "timeout": timeout,
        "check": False,
    }
    try:
        completed = subprocess.run(command, **run_options)
    except subprocess.TimeoutExpired:
        # Report the timeout as an ordinary failed command so callers only
        # ever have to inspect returncode / stderr.
        completed = subprocess.CompletedProcess(
            args=command,
            returncode=1,
            stdout="",
            stderr=f"Command timed out after {timeout} seconds.",
        )
    return completed


def get_git_modified_files(repo_path: str) -> set[str]:
    """Return the set of files changed in the working tree relative to HEAD.

    Includes both tracked files that differ from HEAD and untracked files that
    are not ignored. `git diff --name-only HEAD` alone does not list untracked
    files, so a file newly created by the agent would otherwise go unnoticed.

    Args:
        repo_path: Path to the git working tree.

    Returns:
        Repo-relative paths as printed by git. An empty set is returned when
        the `git diff` command itself fails.
    """

    def _paths(stdout: str) -> set[str]:
        # One path per non-blank output line.
        return {line.strip() for line in stdout.splitlines() if line.strip()}

    diff_result = run_command("git diff --name-only HEAD", cwd=repo_path, timeout=30)
    if diff_result.returncode != 0:
        return set()
    changed = _paths(diff_result.stdout)

    # Newly created files are untracked; --exclude-standard honours .gitignore
    # so ignored paths are not reported as modifications.
    untracked_result = run_command(
        "git ls-files --others --exclude-standard", cwd=repo_path, timeout=30
    )
    if untracked_result.returncode == 0:
        changed |= _paths(untracked_result.stdout)
    return changed


def strip_code_fences(text: str) -> str:
    """Remove opening and closing markdown code fences from LLM output.

    Surrounding whitespace is stripped first. A leading fence line is removed
    whatever its info string is (e.g. ```js, ```c++, ```objective-c), and a
    trailing fence line is removed too. Both LF and CRLF line endings are
    handled. Text without fences is returned stripped but otherwise unchanged.

    Args:
        text: Raw model output that may be wrapped in a fenced code block.

    Returns:
        The text with the wrapping fence lines removed.
    """
    # The info string may contain anything except a newline or a backtick, so
    # it is matched with [^\n`]* rather than letters only.
    text = re.sub(r"^```[^\n`]*\r?\n", "", text.strip())
    text = re.sub(r"\r?\n```$", "", text.strip())
    return text


## Tools

Both phases run inside an agent loop and share the same tool set.  Tools are
scoped to the repository root so the model cannot escape it.

| Tool | Purpose |
|------|---------|
| `read_file` | Read a source file by repo-relative path |
| `search_in_file` | Grep for a regex pattern within a single file |
| `grep_repo` | Grep for a regex pattern across repo files matching a glob (skips the `.git` directory; `.gitignore` is not consulted) |
| `list_directory` | List entries in a repo-relative directory |
| `write_file` | Write (create or overwrite) a file — *implementation phase only* |

In [None]:
# Maximum characters returned by read_file tool — prevents context explosions on
# large source files while still giving the model enough to work with.
MAX_TOOL_FILE_CHARS = 12_000


def make_repo_tools(repo_path: str, *, allow_writes: bool = False) -> list:
    """
    Return a list of plain Python tool functions scoped to `repo_path`.

    Pass allow_writes=True for the implementation phase; keep it False for the
    read-only analysis/planning phase.

    Tools are plain functions — no decorator needed; they are passed directly
    to llm.prompt(tools=[...]).

    Args:
        repo_path:    Directory that every tool is confined to.
        allow_writes: When True, `write_file` is appended to the returned list.

    Returns:
        [read_file, search_in_file, grep_repo, list_directory] plus
        `write_file` when writes are allowed.

    NOTE: the docstrings of the nested tool functions are rendered into the
    model prompt, so they are prompt text, not just developer documentation.
    """
    root = Path(repo_path).resolve()

    def _safe_path(relative: str) -> Path:
        """Resolve a repo-relative path and reject path-escape attempts."""
        resolved = (root / relative).resolve()
        # Compare path components, not string prefixes: a sibling directory
        # such as "<root>-evil" shares the string prefix but is outside root.
        if not resolved.is_relative_to(root):
            raise ValueError(f"Path '{relative}' escapes the repository root.")
        return resolved

    def read_file(path: str) -> str:
        """
        Read the contents of a file in the repository.

        Args:
            path: Repo-relative path, e.g.
                  "packages/react-reconciler/src/ReactFiberHydrationContext.js"

        Returns the file contents as a string, truncated to 12,000 characters
        for large files (a truncation notice is appended when this occurs).
        Raises FileNotFoundError if the file does not exist.
        """
        text = _safe_path(path).read_text(encoding="utf-8")
        if len(text) > MAX_TOOL_FILE_CHARS:
            notice = (
                f"\n\n[TRUNCATED: file is {len(text):,} chars; "
                f"only the first {MAX_TOOL_FILE_CHARS:,} are shown. "
                "Use search_in_file or grep_repo to locate specific sections.]"
            )
            return text[:MAX_TOOL_FILE_CHARS] + notice
        return text

    def search_in_file(path: str, pattern: str) -> str:
        """
        Search for a regex pattern within a single file, returning matching lines
        with their line numbers.

        Args:
            path: Repo-relative path to the file.
            pattern: Python regex pattern to search for.
        """
        text = _safe_path(path).read_text(encoding="utf-8")
        regex = re.compile(pattern)
        matches = [
            f"L{lineno}: {line}"
            for lineno, line in enumerate(text.splitlines(), start=1)
            if regex.search(line)
        ]
        return "\n".join(matches) if matches else f"No matches for '{pattern}' in {path}."

    def grep_repo(pattern: str, file_glob: str = "**/*.js") -> str:
        """
        Search for a regex pattern across all files matching a glob in the repo.

        Args:
            pattern: Python regex pattern to search for.
            file_glob: Glob pattern relative to repo root. Default: "**/*.js".

        Returns up to 50 matching lines with file paths and line numbers.
        """
        regex = re.compile(pattern)
        results = []
        for filepath in root.glob(file_glob):
            try:
                rel = filepath.relative_to(root)
            except ValueError:
                continue
            # Test repo-relative components so a ".git" directory somewhere
            # above the repository root does not hide every file.
            if ".git" in rel.parts:
                continue
            # A glob containing ".." (or a symlink) can point outside the
            # repository; never read such files.
            try:
                if not filepath.resolve().is_relative_to(root):
                    continue
            except (OSError, RuntimeError):
                continue
            try:
                lines = filepath.read_text(encoding="utf-8", errors="ignore").splitlines()
            except (PermissionError, IsADirectoryError):
                continue
            for lineno, line in enumerate(lines, start=1):
                if regex.search(line):
                    results.append(f"{rel}:L{lineno}: {line}")
                    if len(results) >= 50:
                        results.append("... (truncated at 50 matches)")
                        return "\n".join(results)
        return "\n".join(results) if results else f"No matches for '{pattern}'."

    def list_directory(path: str = "") -> str:
        """
        List the contents of a directory in the repository.

        Args:
            path: Repo-relative directory path (empty string = repo root).

        Returns a newline-separated list of entries; directories are suffixed '/'.
        """
        target = _safe_path(path) if path else root
        if not target.is_dir():
            return f"'{path}' is not a directory."
        # Directories sort before files (False < True), each group by name.
        entries = sorted(target.iterdir(), key=lambda p: (p.is_file(), p.name))
        return "\n".join((e.name + "/" if e.is_dir() else e.name) for e in entries)

    tools = [read_file, search_in_file, grep_repo, list_directory]

    if allow_writes:
        def write_file(path: str, content: str) -> str:
            """
            Write (create or overwrite) a file in the repository.

            Args:
                path: Repo-relative path of the file to write.
                content: Complete new file content. Do not include markdown code
                         fences — they will be stripped automatically.

            Returns a confirmation message with the path and byte count written.
            """
            target = _safe_path(path)
            target.parent.mkdir(parents=True, exist_ok=True)
            cleaned = strip_code_fences(content)
            target.write_text(cleaned, encoding="utf-8")
            return f"Written: {path} ({len(cleaned):,} chars)"

        tools.append(write_file)

    return tools


## Agent Tool Loop

`llm.prompt(tools=...)` with automatic tool-calling only works when the model is
loaded with `api="genai"` (Google GenAI).  For all other providers and APIs —
OpenAI-compatible, Anthropic, open-weights models, etc. — tools must be driven
manually.

`run_tool_loop()` implements a **provider-agnostic agentic loop** that works
identically across every model and provider kbench supports:

1. The system prompt describes available tools in plain text and instructs the
   model to emit a single JSON object `{"tool": "...", "args": {...}}` when it
   wants to call a tool, or plain text when it is done.
2. Responses are inspected for a JSON tool-call object.  If found, the tool is
   executed and the result is fed back via `llm.send()` + `llm.respond()`.
3. The loop continues until the model produces a plain-text final response or
   `max_iterations` is exhausted (which is recorded as a model failure).
4. All turns occur inside a single `kbench.chats.new()` context so the model
   has full conversation history throughout.

In [None]:
import inspect
import json

# Regex that finds where a candidate JSON tool-call object starts in a model
# response: an opening brace followed (after optional whitespace) by a "tool"
# key and a colon. It matches only that prefix, not the whole {...} object.
# NOTE(review): re.DOTALL has no effect here — the pattern contains no ".".
_TOOL_CALL_RE = re.compile(r'\{\s*"tool"\s*:', re.DOTALL)


def _format_tool_descriptions(tools: list) -> str:
    """
    Render the given tool functions as a plain-text listing for a prompt.

    Each tool becomes a "### `name(param: type = default, ...)`" heading
    followed by its cleaned docstring. Parameters without an annotation are
    shown as `str`.

    Args:
        tools: Tool functions to describe.

    Returns:
        The per-tool sections joined with newlines ("" for an empty list).
    """

    def _type_label(annotation) -> str:
        # Classes expose __name__; anything else falls back to its str() form.
        label = annotation.__name__ if hasattr(annotation, "__name__") else str(annotation)
        # inspect marks a missing annotation with the `_empty` sentinel class.
        return "str" if label == "_empty" else label

    sections = []
    for tool in tools:
        rendered_params = []
        for pname, spec in inspect.signature(tool).parameters.items():
            if spec.default is inspect.Parameter.empty:
                default_suffix = ""
            else:
                default_suffix = f" = {spec.default!r}"
            rendered_params.append(f"{pname}: {_type_label(spec.annotation)}{default_suffix}")
        heading = f"{tool.__name__}({', '.join(rendered_params)})"
        description = (inspect.getdoc(tool) or "").strip()
        sections.append(f"### `{heading}`\n{description}\n")
    return "\n".join(sections)


def _extract_tool_call(response: str) -> dict | None:
    """
    Try to find and parse a JSON tool-call object from a model response.

    Returns a dict with keys 'tool' and 'args', or None if the response is a
    plain-text final answer (no tool call found or JSON is malformed).
    """
    match = _TOOL_CALL_RE.search(response)
    if not match:
        return None
    # Scan forward from the opening brace to find the matching closing brace.
    start = match.start()
    depth = 0
    for i, ch in enumerate(response[start:], start):
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                candidate = response[start : i + 1]
                try:
                    parsed = json.loads(candidate)
                    if "tool" in parsed:
                        return parsed
                except json.JSONDecodeError:
                    return None
    return None


def run_tool_loop(
    llm,
    prompt: str,
    tools: list,
    max_iterations: int,
    chat_name: str,
) -> str:
    """
    Drive a manual tool-calling conversation until the model answers in prose.

    `prompt` is sent as the opening message. Whenever a reply contains a JSON
    tool-call object, the named tool is run and its result (or an error
    description) is sent back as the next message. The first reply without a
    tool call is returned as the final answer.

    Args:
        llm:            The kbench LLM object for the model under evaluation.
        prompt:         The full initial prompt, including embedded tool descriptions.
        tools:          List of plain Python tool functions.
        max_iterations: Maximum number of tool executions allowed.
        chat_name:      Name passed to kbench.chats.new() for context isolation.

    Returns:
        The model's final plain-text response.

    Raises:
        RuntimeError: If the model still requests a tool after `max_iterations`
                      tool executions.
    """
    registry = {}
    for tool in tools:
        registry[tool.__name__] = tool

    with kbench.chats.new(chat_name):
        reply = llm.prompt(prompt)
        executed = 0

        while True:
            request = _extract_tool_call(reply)
            if request is None:
                # No tool call in the reply: this is the final answer.
                return reply

            if executed >= max_iterations:
                # Budget spent and the model is still asking for a tool.
                raise RuntimeError(
                    f"Tool loop exhausted {max_iterations} iterations without producing "
                    "a final plain-text response. The model may be stuck in a tool-calling loop."
                )
            executed += 1

            tool_name = request.get("tool", "")
            tool_args = request.get("args", {})

            handler = registry.get(tool_name)
            if handler is None:
                outcome = (
                    f"Error: unknown tool '{tool_name}'. "
                    f"Available tools: {sorted(registry)}"
                )
            else:
                # Tool failures are reported back to the model instead of
                # aborting the loop.
                try:
                    outcome = str(handler(**tool_args))
                except Exception as exc:
                    outcome = f"Error executing {tool_name}({tool_args}): {type(exc).__name__}: {exc}"

            print(f"  [{executed}/{max_iterations}] {tool_name} → {outcome[:120]}...")

            # Hand the result to the model and fetch its next turn.
            llm.send(f"Tool result ({tool_name}):\n{outcome}")
            reply = llm.respond()


## Prompts

Both phases use `run_tool_loop()` — the provider-agnostic loop defined above —
so the benchmark works identically across every model and API.

Tool descriptions are embedded directly into the prompt as plain text, and the
model is instructed to emit a structured JSON object when calling a tool:

```
{"tool": "<name>", "args": {"<param>": "<value>", ...}}
```

Plain-text output (no JSON tool-call object) signals the final answer.

### Planning phase (judge LLM — read-only tools)
The judge sees the PR summary and file list, then must actively explore the
codebase using tools before writing its plan.  No file content is pre-loaded —
the judge must discover the relevant code itself.

### Implementation phase (agent LLM — read + write tools)
The agent receives the plan and has both read tools and `write_file`.  Because
all turns occur in a single `kbench.chats.new()` context, the model has full
history of every tool call and response — it knows what it has already read or
written without any external bookkeeping.

### Failure conditions
Exhausting `max_iterations`, a context-limit error, or any other exception is
caught and recorded as a model failure via `assert_fail`.

In [None]:
# Agent loop budgets — exhausting these is treated as a model failure.
# Each value is passed as `max_iterations` to run_tool_loop for its phase.
MAX_ANALYSIS_ITERATIONS: int = 30   # judge: explore repo + write plan
MAX_IMPL_ITERATIONS: int     = 60   # agent: read + write all files

# Tool-call instruction block injected at the top of every agent prompt.
# The template is rendered with str.format(tool_descriptions=...), so literal
# JSON braces are doubled ({{ and }}) and come out as single braces in the
# final prompt; {tool_descriptions} is the only substitution field.
_TOOL_INSTRUCTIONS = """\
## How to Use Tools
You operate in a tool-calling loop. At each step you may either:

**Call a tool** — respond with ONLY a JSON object in this exact format:
{{"tool": "<tool_name>", "args": {{"<param>": "<value>"}}}}

Example:
{{"tool": "read_file", "args": {{"path": "packages/react-reconciler/src/ReactFiberHydrationContext.js"}}}}

Rules:
- Emit ONLY the JSON object when calling a tool — no text before or after it.
- Call one tool at a time. Wait for the result before calling the next.
- Use the exact tool names listed under "Available Tools" below.

**Give your final response** — when you are done calling tools, respond with
plain text (no JSON tool-call object). This plain-text response is what gets
recorded as your answer.

## Available Tools
{tool_descriptions}
---
"""


def build_analysis_prompt(pr_summary: str, files_to_modify: list[str], tools: list) -> str:
    """
    Assemble the planning-phase prompt given to the judge LLM.

    The prompt opens with the shared tool instructions (with the descriptions
    of `tools` filled in), followed by the task description, the PR summary,
    and a sorted bullet list of the files to modify. No file contents are
    included.

    Args:
        pr_summary:      Text describing the pull request.
        files_to_modify: Repo-relative paths the plan must cover.
        tools:           Tool functions whose descriptions are embedded.

    Returns:
        The complete prompt string.
    """
    preamble = _TOOL_INSTRUCTIONS.format(
        tool_descriptions=_format_tool_descriptions(tools)
    )
    bullets = "\n".join([f"  - {name}" for name in sorted(files_to_modify)])

    prompt_text = f"""{preamble}You are a senior React core team member producing an \
implementation plan for a Pull Request.

## What You Must Do
1. **Explore the codebase first.** Read the files listed below and grep for \
relevant symbols (e.g. `hydrateSuspenseBoundary`, `SuspenseState`, fiber flags). \
Do not rely on the PR summary alone — look at the actual code.
2. **Produce a precise implementation plan.** For each file, describe:
   - *Why* it needs to change, grounded in what you actually read.
   - *What* to change — specific function names, new logic, new flags, new tests.
   - *How* — key algorithmic steps, referencing real symbol names you found.

## PR Summary
{pr_summary}

## Files to Modify
{bullets}

## Completion
When you have read enough to write a specific, grounded plan, output it as \
plain text (no JSON tool-call object). Do not write any code."""
    return prompt_text


def build_implementation_prompt(
    pr_summary: str,
    implementation_plan: str,
    files_to_modify: list[str],
    tools: list,
) -> str:
    """
    Assemble the implementation-phase prompt given to the agent LLM.

    The prompt opens with the shared tool instructions (with the descriptions
    of `tools` filled in), followed by the working rules, the PR summary, the
    plan to follow, and a sorted bullet list of the files to modify.

    Args:
        pr_summary:          Text describing the pull request.
        implementation_plan: Plan text the agent is told to follow.
        files_to_modify:     Repo-relative paths the agent must write.
        tools:               Tool functions whose descriptions are embedded.

    Returns:
        The complete prompt string.
    """
    preamble = _TOOL_INSTRUCTIONS.format(
        tool_descriptions=_format_tool_descriptions(tools)
    )
    bullets = "\n".join([f"  - {name}" for name in sorted(files_to_modify)])

    prompt_text = f"""{preamble}You are an expert React core contributor implementing \
a Pull Request.

## What You Must Do
1. **Read before writing.** Use `read_file` to inspect each file before modifying \
it. Use `grep_repo` or `search_in_file` to locate symbols referenced in the plan.
2. **Follow the plan exactly.** Implement every change across all listed files.
3. **Write complete files.** When calling `write_file`, pass the *entire* updated \
file — not a diff or partial snippet. Do not include markdown code fences.
4. **Stay consistent.** You can re-read files you have already written to verify \
consistency before writing the next one.

## PR Summary
{pr_summary}

## Implementation Plan
{implementation_plan}

## Files to Modify
{bullets}

## Completion
Call `write_file` for every file in the list above. Once all files are written, \
output a plain-text summary of which files you modified (no JSON tool-call object)."""
    return prompt_text


## Implementation Phase

Both phases call `run_tool_loop()`, which drives the tool-calling conversation
manually using `llm.prompt()` → `llm.send()` → `llm.respond()`.  This works
identically across every model and provider — no `api="genai"` required.

All turns occur within a single `kbench.chats.new()` context, so the model
retains full history of every tool call and result for the duration of the phase.

Any exception (context limit, API error, tool error) is caught and recorded as
an explicit benchmark failure, not an infrastructure crash.

In [None]:
def run_analysis_phase(judge_llm, repo_path: str, gt_files: set[str]) -> str:
    """
    Phase 2: judge LLM explores the codebase and produces an implementation plan.

    Uses run_tool_loop() — works across all model providers and APIs.
    After the loop, assess_response_with_judge evaluates whether the plan is
    grounded in actual code rather than just restating the PR summary.

    Args:
        judge_llm:  The fixed judge LLM (kbench.judge_llm).
        repo_path:  Path to the React repo checked out at the merge-base commit.
        gt_files:   Set of files changed by the PR — computed dynamically in Phase 1.

    Returns the implementation plan as a string, or "" when the tool loop
    failed and the failure was recorded via assert_fail.
    """
    print("PHASE 2: Analysis — judge LLM exploring codebase and creating plan.")

    # Read-only tool set: the planning phase must not modify the checkout.
    tools = make_repo_tools(repo_path, allow_writes=False)
    prompt = build_analysis_prompt(
        pr_summary=PR_SUMMARY,
        files_to_modify=sorted(gt_files),
        tools=tools,
    )

    plan = None
    try:
        plan = run_tool_loop(
            llm=judge_llm,
            prompt=prompt,
            tools=tools,
            max_iterations=MAX_ANALYSIS_ITERATIONS,
            chat_name="analysis_phase",
        )
    except RuntimeError as e:
        # run_tool_loop raises RuntimeError when the iteration budget runs out.
        kbench.assertions.assert_fail(
            f"Judge LLM failed to complete the analysis phase: {e}"
        )
        # NOTE(review): only reached if assert_fail returns instead of raising —
        # confirm against kbench.
        return ""
    except Exception as e:
        # Any other failure (API error, context limit, ...) is recorded the
        # same way, with the exception type included in the message.
        kbench.assertions.assert_fail(
            f"Judge LLM errored during the analysis phase: {type(e).__name__}: {e}"
        )
        return ""

    kbench.assertions.assert_not_empty(
        plan,
        expectation="Judge LLM must produce a non-empty implementation plan.",
    )

    # The same judge_llm that wrote the plan is also asked to grade it against
    # the criteria below.
    gt_file_list = ", ".join(sorted(gt_files))
    assess_report = kbench.assertions.assess_response_with_judge(
        criteria=(
            "The plan references specific JavaScript function names or variable names "
            "that would only be known from reading the actual source files "
            "(e.g. hydrateSuspenseBoundary, SuspenseState, ReactFiberWorkLoop internals), "
            "not just terms mentioned in the PR summary.",
            f"The plan describes concrete, file-specific changes for each of the "
            f"following files: {gt_file_list} — including which functions to modify "
            "and the new logic to introduce.",
            "The plan describes specific new test cases covering the hydration scenarios, "
            "including what inputs and expected outcomes each test exercises.",
            "The plan is actionable: a developer could implement the changes described "
            "without needing to read the PR summary again.",
        ),
        response_text=plan,
        judge_llm=judge_llm,
    )
    # One assertion per criterion so each verdict is recorded individually,
    # together with the judge's reasoning.
    for criterion_result in assess_report.results:
        kbench.assertions.assert_true(
            criterion_result.passed,
            expectation=(
                f"Plan quality — {criterion_result.criterion} | "
                f"Judge reasoning: {criterion_result.reason}"
            ),
        )

    print(f"Analysis complete — plan received ({len(plan)} chars).")
    return plan


def run_implementation_phase(
    llm,
    judge_llm,
    implementation_plan: str,
    repo_path: str,
    gt_files: set[str],
) -> None:
    """
    Phase 3: agent LLM reads and writes all gt_files using the tool loop.

    Uses run_tool_loop() — works across all model providers and APIs.
    Full conversation history is preserved throughout the session, so the model
    is aware of every file it has already read or written.

    Args:
        llm:                 The model under evaluation.
        judge_llm:           Used for assess_response_with_judge checks.
        implementation_plan: The plan produced by run_analysis_phase.
        repo_path:           Path to the React repo at the merge-base commit.
        gt_files:            Set of files the agent must modify — from Phase 1 diff.

    Returns None. Outcomes are reported through kbench assertions; the files
    themselves are changed on disk by the agent's write_file calls.
    """
    print("PHASE 3: Implementation — agent LLM modifying files.")

    # Read + write tool set: this is the only phase that gets write_file.
    tools = make_repo_tools(repo_path, allow_writes=True)
    prompt = build_implementation_prompt(
        pr_summary=PR_SUMMARY,
        implementation_plan=implementation_plan,
        files_to_modify=sorted(gt_files),
        tools=tools,
    )

    completion_response = None
    try:
        completion_response = run_tool_loop(
            llm=llm,
            prompt=prompt,
            tools=tools,
            max_iterations=MAX_IMPL_ITERATIONS,
            chat_name="implementation_phase",
        )
    except RuntimeError as e:
        # run_tool_loop raises RuntimeError when the iteration budget runs out.
        kbench.assertions.assert_fail(
            f"Agent LLM failed to complete the implementation phase: {e}"
        )
        # NOTE(review): only reached if assert_fail returns instead of raising —
        # confirm against kbench.
        return
    except Exception as e:
        # Any other failure (API error, context limit, ...) is recorded the
        # same way, with the exception type included in the message.
        kbench.assertions.assert_fail(
            f"Agent LLM errored during the implementation phase: {type(e).__name__}: {e}"
        )
        return

    kbench.assertions.assert_not_empty(
        completion_response,
        expectation="Agent LLM must produce a non-empty completion response.",
    )

    # The judge grades only the agent's closing summary text here; it does not
    # inspect the files on disk in this function.
    gt_file_list = ", ".join(sorted(gt_files))
    assess_report = kbench.assertions.assess_response_with_judge(
        criteria=(
            f"The response confirms that all required files were modified or created: "
            f"{gt_file_list}.",
            "The response is consistent with having actually written files — it does "
            "not merely describe what changes should be made without confirming they "
            "were written.",
        ),
        response_text=completion_response,
        judge_llm=judge_llm,
    )
    # One assertion per criterion so each verdict is recorded individually,
    # together with the judge's reasoning.
    for criterion_result in assess_report.results:
        kbench.assertions.assert_true(
            criterion_result.passed,
            expectation=(
                f"Implementation completion — {criterion_result.criterion} | "
                f"Judge reasoning: {criterion_result.reason}"
            ),
        )

    print(f"Implementation complete. Agent summary: {completion_response[:300]}...")


## Evaluation

Checks run after implementation, ordered cheapest → most expensive.
`assess_response_with_judge` is used for qualitative checks that can't be
captured by simple regex or exit codes.

| # | Check | Kind | What it validates |
|---|-------|------|-------------------|
| 1 | **File set similarity** | Code | Agent touched roughly the right files (git-diff based) |
| 2 | **PR-specific tests pass** | Code | New/modified test file exists and all its tests are green |
| 3 | **No unexpected warnings** | Code | Test run is clean — no `console.warn` / `console.error` noise |
| 4 | **Full test suite** | Code | No regressions across the entire React test suite |

In [None]:
# Minimum fraction of ground-truth files that must appear in the agent's diff.
FILE_OVERLAP_THRESHOLD = 0.80


def check_file_set_similarity(repo_path: str, gt_files: set[str]) -> None:
    """
    Check 1 — File set similarity.

    Asks git which files the agent changed and asserts that the share of
    `gt_files` found among them is at least FILE_OVERLAP_THRESHOLD.

    If git reports no changed files at all, the check falls back to asserting
    that every ground-truth file exists on disk.

    Args:
        repo_path: Path to the repository the agent worked in.
        gt_files:  Ground-truth set of repo-relative paths changed by the PR.
    """
    print("CHECK 1: Verifying modified file set matches PR ground truth.")
    modified = get_git_modified_files(repo_path)

    if not modified:
        print("Warning: git diff returned no files. Falling back to checking gt_files exist on disk.")
        for f in gt_files:
            kbench.assertions.assert_true(
                (Path(repo_path) / f).exists(),
                expectation=f"Expected file not found on disk: {f}",
            )
        return

    intersection = gt_files.intersection(modified)
    # An empty ground-truth set would divide by zero; report it as zero overlap
    # so the problem surfaces as a failed assertion rather than a crash.
    overlap = len(intersection) / len(gt_files) if gt_files else 0.0
    kbench.assertions.assert_true(
        overlap >= FILE_OVERLAP_THRESHOLD,
        expectation=(
            f"File modification overlap should be ≥ {FILE_OVERLAP_THRESHOLD}. "
            f"Got {overlap:.2f}. Missing from diff: {gt_files - modified}"
        ),
    )


def check_pr_tests_pass(repo_path: str, pr_test_files: set[str]) -> dict[str, subprocess.CompletedProcess]:
    """
    Check 2 — PR-specific tests exist and pass.

    For every test file, asserts that it exists on disk, runs
    `yarn test <file>` in the repository, and asserts a zero exit status.

    Args:
        repo_path:     Path to the repository the agent worked in.
        pr_test_files: Subset of gt_files that are test files — computed in Phase 1.

    Returns:
        Mapping of test file → CompletedProcess, so later checks can inspect
        the captured output without running the tests again.
    """
    print("CHECK 2: Verifying PR-specific tests pass.")
    repo_root = Path(repo_path)
    outcomes = {}
    for relative_test in pr_test_files:
        kbench.assertions.assert_true(
            (repo_root / relative_test).exists(),
            expectation=f"PR test file must exist after implementation: {relative_test}",
        )
        proc = run_command(f"yarn test {relative_test}", cwd=repo_path)
        # Only the tail of stderr is quoted to keep the failure message short.
        stderr_tail = proc.stderr[-2000:]
        kbench.assertions.assert_equal(
            0,
            proc.returncode,
            expectation=f"All tests in '{relative_test}' must pass. Stderr: {stderr_tail}",
        )
        outcomes[relative_test] = proc
    return outcomes


def check_no_unexpected_warnings(
    pr_test_results: dict[str, subprocess.CompletedProcess],
) -> None:
    """
    Check 3 — Test output is free of console noise.

    Scans the captured stderr of each earlier test run for `console.warn` and
    `console.error`; nothing is executed again here.

    Args:
        pr_test_results: Mapping of test file → CompletedProcess, as returned
                         by check_pr_tests_pass.
    """
    print("CHECK 3: Scanning test output for unexpected console warnings.")
    # (regex to search for, label used in the failure message)
    noisy_calls = (
        (r"console\.warn", "console.warn"),
        (r"console\.error", "console.error"),
    )
    for test_path, proc in pr_test_results.items():
        for pattern, label in noisy_calls:
            kbench.assertions.assert_not_contains_regex(
                pattern,
                proc.stderr,
                expectation=f"Test output for {test_path} must not contain unexpected {label} calls.",
            )


def check_full_test_suite(repo_path: str) -> None:
    """
    Check 4 — Full regression suite (most expensive, runs last).

    Runs ``yarn build`` and then ``yarn test`` in the repository and asserts:

    * both commands exit with status 0;
    * the test output contains no "Test Suites: N failed" line;
    * the test output contains no "snapshot … written" notice.

    Both text scans look at stdout *and* stderr. Jest may emit its reporter
    output (summary and snapshot notices) on stderr rather than stdout, so
    scanning a single stream per pattern could miss a match.

    Assumes yarn install has already run in Phase 1.

    Args:
        repo_path: Root of the checked-out repository; used as the working
            directory for both commands.
    """
    print("CHECK 4: Running full test suite (this may take several minutes).")

    build_result = run_command("yarn build", cwd=repo_path)
    kbench.assertions.assert_equal(
        0,
        build_result.returncode,
        expectation=f"yarn build must succeed. Stderr: {build_result.stderr[-2000:]}",
    )

    test_result = run_command("yarn test", cwd=repo_path, timeout=1200)
    kbench.assertions.assert_equal(
        0,
        test_result.returncode,
        expectation=f"Full test suite must pass. Stderr: {test_result.stderr[-2000:]}",
    )

    # Join whichever streams were captured; an empty or missing stream is
    # simply left out so the regex helpers always receive a string.
    combined_output = "\n".join(
        stream for stream in (test_result.stdout, test_result.stderr) if stream
    )
    kbench.assertions.assert_not_contains_regex(
        r"Test Suites: \d+ failed",
        combined_output,
        expectation="No test suites should be failing after implementation.",
    )
    kbench.assertions.assert_not_contains_regex(
        r"snapshot.*written",
        combined_output,
        expectation="No unexpected snapshots should be written — check for unintended changes.",
    )


## Task Runner

Phase 1 sets up the evaluation environment dynamically — using only standard git, no GitHub CLI required:

1. **Clone** `facebook/react` into `./react` (skipped if already present).
2. **`git fetch origin pull/32224/head`** + **`git checkout FETCH_HEAD`** — puts the repo at the PR tip (the *after* state, with the real changes).
3. **`git merge-base HEAD origin/main`** — identifies the last commit before the PR diverged (the *before* state the agent will actually work from).
4. **Diff** — computes `GT_FILES` and `pr_test_files` from the real `git diff`, capturing ground-truth file contents at the PR tip.
5. **`git checkout <merge-base>`** — reverts to the clean before-state; the agent sees no PR changes.
6. **`yarn install`** — installs dependencies at the base commit.

This means ground-truth files and test files are always derived from the actual PR, not hardcoded. `FALLBACK_GT_FILES` in the metadata cell is never used for scoring: it documents the expected file set and only drives a sanity-check warning, printed when fewer than 50% of its entries appear in the dynamic diff.


In [None]:
def _run_required(command: str, what: str, *, cwd: str, timeout: int) -> subprocess.CompletedProcess:
    """Run one setup command and assert that it exited with status 0.

    Args:
        command: Command line handed to run_command.
        what: Human-readable name of the step; the failure message reads
            "<what> must succeed. Stderr: <last 2000 characters of stderr>".
        cwd: Working directory for the command.
        timeout: Timeout forwarded to run_command.

    Returns:
        Whatever run_command returned, so callers can read its stdout.
    """
    result = run_command(command, cwd=cwd, timeout=timeout)
    kbench.assertions.assert_equal(
        0,
        result.returncode,
        expectation=f"{what} must succeed. Stderr: {result.stderr[-2000:]}",
    )
    return result


@kbench.task(name=TASK_NAME)
def react_pr_reimplementation(llm, judge_llm):
    """
    Evaluates an LLM's ability to re-implement React PR #32224.

    Phase 1 — Setup:
        Clone facebook/react (if not already present), fetch the PR ref and
        check out FETCH_HEAD to reach the PR tip (after state), capture the
        ground-truth file contents and diff, then revert to the merge-base
        commit so the agent works on a clean "before" state.  GT_FILES and
        PR_TEST_FILES are computed from the real PR diff — not hardcoded.
        Every git/yarn command in this phase, including the diff itself, must
        exit with status 0.

    Phase 2 — Analysis:    judge LLM explores the codebase and plans changes.
    Phase 3 — Implementation: agent LLM implements changes via tool-calling loop.
    Phase 4 — Evaluation:  code checks in increasing cost order.

    Phases 2 and 3 use run_tool_loop(), which drives tool-calling manually via
    llm.prompt() / llm.send() / llm.respond() — compatible with all providers.

    Multi-model: Kaggle swaps `llm` for each model on the leaderboard automatically.
    `judge_llm` is fixed (kbench.judge_llm) and used for plan/completion assessment.

    Args:
        llm: Model under evaluation; drives the implementation phase.
        judge_llm: Model used for the analysis phase and passed to the
            implementation phase alongside `llm`.
    """
    # ------------------------------------------------------------------
    # Phase 1: Setup — clone, checkout PR, extract ground truth, revert
    # ------------------------------------------------------------------
    print("PHASE 1: Setup — cloning repo and extracting ground truth from PR.")

    # 1a. Clone the repo if it isn't already present.
    if not os.path.isdir(REPO_PATH):
        print(f"Cloning {REPO_SLUG} into {REPO_PATH} …")
        _run_required(
            f"git clone https://github.com/{REPO_SLUG} {REPO_PATH}",
            "git clone",
            cwd=".",
            timeout=300,
        )

    # 1b. Fetch the PR ref and check out FETCH_HEAD — puts the repo at the PR tip.
    #     Uses only standard git; no GitHub CLI required.
    print(f"Fetching PR #{PR_NUMBER} …")
    _run_required(
        f"git fetch origin pull/{PR_NUMBER}/head",
        f"git fetch origin pull/{PR_NUMBER}/head",
        cwd=REPO_PATH,
        timeout=120,
    )
    _run_required(
        "git checkout FETCH_HEAD",
        "git checkout FETCH_HEAD",
        cwd=REPO_PATH,
        timeout=60,
    )

    # 1c. Find the merge base — the last commit on main before the PR diverged.
    #     This is the "before" state the agent will work from.
    base_result = _run_required(
        "git merge-base HEAD origin/main",
        "git merge-base",
        cwd=REPO_PATH,
        timeout=30,
    )
    merge_base_sha = base_result.stdout.strip()
    print(f"Merge base: {merge_base_sha}")

    # 1d. Compute GT_FILES dynamically from the actual PR diff.
    #     The exit status is checked like every other setup command, so a
    #     failing diff is reported as such instead of as an "empty PR".
    diff_result = _run_required(
        f"git diff {merge_base_sha} HEAD --name-only",
        "git diff --name-only",
        cwd=REPO_PATH,
        timeout=30,
    )
    gt_files: set[str] = {
        line.strip()
        for line in diff_result.stdout.splitlines()
        if line.strip()
    }
    kbench.assertions.assert_true(
        len(gt_files) > 0,
        expectation="PR diff must contain at least one changed file.",
    )
    print(f"Ground-truth files ({len(gt_files)}): {sorted(gt_files)}")

    # Sanity-check: warn if the diff diverges significantly from expectations.
    overlap = len(gt_files & FALLBACK_GT_FILES) / max(len(FALLBACK_GT_FILES), 1)
    if overlap < 0.5:
        print(
            f"Warning: dynamic GT_FILES overlaps only {overlap:.0%} with FALLBACK_GT_FILES. "
            "Double-check that the correct PR was checked out."
        )

    # 1e. Capture the ground-truth file contents at the PR tip.
    #     Paths that are not regular files at the tip (e.g. deleted by the PR)
    #     and files that are not valid UTF-8 text are skipped rather than
    #     aborting the whole setup.
    #     NOTE(review): gt_contents is not passed to any Phase 4 check in this
    #     function — confirm whether a content-level comparison is still planned.
    gt_contents: dict[str, str] = {}
    for f in sorted(gt_files):
        p = Path(REPO_PATH) / f
        if not p.is_file():
            continue
        try:
            gt_contents[f] = p.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            print(f"Skipping non-UTF-8 file while capturing ground truth: {f}")
    print(f"Captured ground-truth contents for {len(gt_contents)} files.")

    # 1f. Identify test files within the PR diff.
    pr_test_files: set[str] = {
        f for f in gt_files
        if "/__tests__/" in f or f.endswith(("-test.js", ".test.js"))
    }
    print(f"PR test files: {sorted(pr_test_files)}")

    # 1g. Revert to the merge-base commit — agent now works on the "before" state.
    print(f"Reverting to merge base {merge_base_sha} …")
    _run_required(
        f"git checkout {merge_base_sha}",
        "Checkout to merge base",
        cwd=REPO_PATH,
        timeout=60,
    )

    # 1h. Install dependencies at the base commit.
    print("Running yarn install …")
    _run_required("yarn install", "yarn install", cwd=REPO_PATH, timeout=300)

    # ------------------------------------------------------------------
    # Phase 2: Analysis (judge LLM — codebase exploration + planning)
    # ------------------------------------------------------------------
    implementation_plan = run_analysis_phase(judge_llm, REPO_PATH, gt_files)

    # ------------------------------------------------------------------
    # Phase 3: Implementation (agent LLM — tool-calling loop)
    # ------------------------------------------------------------------
    run_implementation_phase(llm, judge_llm, implementation_plan, REPO_PATH, gt_files)

    # ------------------------------------------------------------------
    # Phase 4: Evaluation
    # ------------------------------------------------------------------
    print("PHASE 4: Evaluation.")

    check_file_set_similarity(REPO_PATH, gt_files)

    # The per-file results are reused by the warning scan so the PR tests
    # only run once.
    pr_test_results = check_pr_tests_pass(REPO_PATH, pr_test_files)
    check_no_unexpected_warnings(pr_test_results)

    check_full_test_suite(REPO_PATH)

    print("All checks passed.")

    # End of the task definition; the notebook-level invocation follows.

# Run the task once with the default model and judge. The original lines of
# this statement were interleaved out of order and did not parse; they are
# restored here to the try/except they spell out. Any failure is reported as
# a skip message instead of propagating out of the cell.
try:
    react_pr_reimplementation.run(kbench.llm, judge_llm=kbench.judge_llm)
except Exception as e:
    print(
        f"Task execution skipped: benchmark requires a pre-configured environment.\n"
        f"Error: {e}"
    )