# Result Analysis

In this notebook we plot the patch assessement metrics, and other useful ones from the patch evaluation journey.

First, we start by loading the results.

Note that we remove three bugs (Math-28, Math-44, JacksonDatabind-82) from the results since the function they change is included in Megadiff.

In [1]:
import json

def read_jsonl_file(file_path):
    with open(file_path, 'r') as f:
        bugs = [json.loads(line) for line in f]

    # Keep only bugs in the single-function benchmarks
    defects4j_sf_bugs = "../../results/benchmarks/defects4j_sf.txt"
    with open(defects4j_sf_bugs, "r") as f:
        sf_bugs = set([line.strip() for line in f.readlines()])
    humanevaljava_sf_bugs = "../../results/benchmarks/humanevaljava_sf.txt"
    with open(humanevaljava_sf_bugs, "r") as f:
        sf_bugs.update([line.strip() for line in f.readlines()])

    bugs = [bug for bug in bugs if bug["identifier"] in sf_bugs]

    # Remove bugs that might be leaked by Megadiff
    to_remove = ["Math-28", "Math-44", "JacksonDatabind-82"]
    bugs = [bug for bug in bugs if bug["identifier"] not in to_remove]

    return bugs

def read_multi_loc_bugs(file_path):
    with open(file_path, "r") as f:
        bugs = set([line.strip() for line in f.readlines()])
    return bugs

We now plot the results we present in the paper.

In [2]:
from pathlib import Path
from typing import List, Tuple
import pandas as pd
import re

def plot_table(experiments: List[Tuple[str, List[dict]]]):
    # Plot a table with the results of each experiment
    # The table has the following columns:
    # - Model name
    # - Representation
    # - Benchmark
    # - Number of bugs with an exact match fix
    # - Number of bugs with a AST match fix
    # - Number of bugs with a semantical match fix
    # - Number of bugs with a plausible fix
    # - Total number of bugs with patches

    # Define the table data
    data = []
    multi_loc_bugs = read_multi_loc_bugs("multi-loc-bugs.txt")
    for file_path, bugs in experiments:
        # Compute metrics
        exact_match = sum(1 for bug in bugs if "evaluation" in bug and bug["evaluation"] != None and (any(patch["exact_match"] for patch in bug["evaluation"])))
        ast_match = sum(1 for bug in bugs if "evaluation" in bug and bug["evaluation"] != None and (
            any(patch["ast_match"] for patch in bug["evaluation"])
        or any(patch["exact_match"] for patch in bug["evaluation"])
        ))
        semantical_match = sum(1 for bug in bugs if "evaluation" in bug and bug["evaluation"] != None and (
           any("semantical_match" in patch and patch["semantical_match"] == True for patch in bug["evaluation"])
        or any(patch["exact_match"] for patch in bug["evaluation"])
        or any(patch["ast_match"] for patch in bug["evaluation"])
        ))
        plausible = sum(1 for bug in bugs if "evaluation" in bug and bug["evaluation"] != None and (
            any(patch["test"] for patch in bug["evaluation"])
        or any(patch["exact_match"] for patch in bug["evaluation"])
        or any(patch["ast_match"] for patch in bug["evaluation"])
        or any("semantical_match" in patch and patch["semantical_match"] == True for patch in bug["evaluation"])
        ))
        total = sum(1 for bug in bugs if "evaluation" in bug and bug["evaluation"] != None)
        correct_multi_loc = sum(1 for bug in bugs if "evaluation" in bug and bug["evaluation"] != None and bug["identifier"] in multi_loc_bugs and (
           any("semantical_match" in patch and patch["semantical_match"] == True for patch in bug["evaluation"])
        or any(patch["exact_match"] for patch in bug["evaluation"])
        or any(patch["ast_match"] for patch in bug["evaluation"])
        ))

        # Extract meta-data from file_path
        benchmark = "Defects4J" if "defects4j" in file_path else "HumanEval-Java"
        model = "repairllama-fft" if "repairllama-fft" in file_path else "repairllama" if "repairllama" in file_path else "gpt4" if "gpt4" in file_path else "gpt35" if "gpt35" in file_path else "codellama-ir4" if "codellama-ir4" in file_path else "codellama"
        pattern = r"ir\d+_or\d+"
        ir_or = re.search(pattern, file_path)
        if ir_or:
            ir_or = ir_or.group()
        representation = {"gpt4": "gpt", "gpt35": "gpt", "repairllama": ir_or, "repairllama-fft": "ir4_or2", "codellama": "ir3_or2", "codellama-ir4": "ir4_or2"}[model]

        data.append([model, representation, benchmark, total, plausible, semantical_match, ast_match, exact_match, correct_multi_loc])

    # Sort the data according to representation
    data = sorted(data, key=lambda x: x[1])
    # Sort the data according to model
    data = sorted(data, key=lambda x: x[0])
    # Sort the data according to benchmark
    data = sorted(data, key=lambda x: x[2])

    # Show the table with pandas, do not split the table
    df = pd.DataFrame(data, columns=["Model", "Repr", "Benchmark", "Total", "Plausible", "Semantical", "AST", "Exact", "Correct Multi-loc"])
    print(df.to_string(index=False))

def plot_experiments(experiments_path: str):
    experiments = []
    for file_path in Path(experiments_path).glob("*.jsonl"):
        experiments.append((file_path.stem, read_jsonl_file(file_path)))

    plot_table(experiments)

plot_experiments("../../results/3_martin")

          Model    Repr      Benchmark  Total  Plausible  Semantical  AST  Exact  Correct Multi-loc
      codellama ir3_or2      Defects4J    478        131          83   70     52                 14
  codellama-ir4 ir4_or2      Defects4J    476        107          69   60     50                  7
          gpt35     gpt      Defects4J    483         71          45   33     23                 11
           gpt4     gpt      Defects4J    483        119          72   60     47                 20
    repairllama ir1_or1      Defects4J    476         79          45   31     29                  7
    repairllama ir1_or3      Defects4J    476         41          24   17     15                  6
    repairllama ir1_or4      Defects4J    477         12           3    2      2                  0
    repairllama ir2_or2      Defects4J    477        198         139  122    121                 32
    repairllama ir3_or2      Defects4J    480        153         102   86     83                 13


## Statistical significant and effect size analysis

We measure the statistical significance and effect size

In [3]:
defects4j_set = set()
humanevaljava_set = set()

with open("../../results/benchmarks/defects4j_sf.txt", "r") as f:
    defects4j_set.update([line.strip() for line in f.readlines()])

with open("../../results/benchmarks/humanevaljava_sf.txt", "r") as f:
    humanevaljava_set.update([line.strip() for line in f.readlines()])

In [4]:
import pandas as pd

def generate_df(experiments: List[Tuple[str, List[dict]]]):
    # Generate a granular dataframe with the results of each experiment
    # The table has the following columns:
    # - Model name
    # - Representation
    # - Benchmark
    # - Bitvector of bugs with an exact match fix
    # - Bitvector of bugs with a AST match fix
    # - Bitvector of bugs with a semantical match fix
    # - Bitvector of bugs with a plausible fix

    # Define the table data
    data = []
    multi_loc_bugs = read_multi_loc_bugs("multi-loc-bugs.txt")
    for file_path, bugs in experiments:
        # Extract meta-data from file_path
        benchmark = "Defects4J" if "defects4j" in file_path else "HumanEval-Java"
        model = "repairllama-fft" if "repairllama-fft" in file_path else "repairllama" if "repairllama" in file_path else "gpt4" if "gpt4" in file_path else "gpt35" if "gpt35" in file_path else "codellama-ir4" if "codellama-ir4" in file_path else "codellama"
        pattern = r"ir\d+_or\d+"
        ir_or = re.search(pattern, file_path)
        if ir_or:
            ir_or = ir_or.group()
        representation = {"gpt4": "gpt", "gpt35": "gpt", "repairllama": ir_or, "repairllama-fft": "ir4_or2", "codellama": "ir3_or2", "codellama-ir4": "ir4_or2"}[model]

        # Bitvector index
        index = defects4j_set if benchmark == "Defects4J" else humanevaljava_set

        # Compute bitvectors
        exp_df = pd.DataFrame(bugs)
        exact_match = []
        ast_match = []
        semantical_match = []
        plausible = []

        for bug in index:
            bug_df = exp_df[exp_df["identifier"] == bug]
            if bug_df.empty or "evaluation" not in bug_df or bug_df["evaluation"].values[0] is None:
                exact_match.append(False)
                ast_match.append(False)
                semantical_match.append(False)
                plausible.append(False)
            else:
                evaluation = bug_df["evaluation"].values[0]

                exact_match.append(any(patch["exact_match"] for patch in evaluation))
                ast_match.append(any(patch["ast_match"] for patch in evaluation) 
                                 or any(patch["exact_match"] for patch in evaluation))
                semantical_match.append(any("semantical_match" in patch and patch["semantical_match"] == True for patch in evaluation) 
                                        or any(patch["exact_match"] for patch in evaluation) 
                                        or any(patch["ast_match"] for patch in evaluation))
                plausible.append(any(patch["test"] for patch in evaluation) 
                                 or any(patch["exact_match"] for patch in evaluation) 
                                 or any(patch["ast_match"] for patch in evaluation) 
                                 or any("semantical_match" in patch and patch["semantical_match"] == True for patch in evaluation))

        data.append([model, representation, benchmark, plausible, semantical_match, ast_match, exact_match])

    # Sort the data according to representation
    data = sorted(data, key=lambda x: x[1])
    # Sort the data according to model
    data = sorted(data, key=lambda x: x[0])
    # Sort the data according to benchmark
    data = sorted(data, key=lambda x: x[2])

    # Show the table with pandas, do not split the table
    df = pd.DataFrame(data, columns=["Model", "Repr", "Benchmark", "Plausible", "Semantical", "AST", "Exact"])
    return df

def generate_granular_df(experiments_path: str):
    experiments = []
    for file_path in Path(experiments_path).glob("*.jsonl"):
        experiments.append((file_path.stem, read_jsonl_file(file_path)))

    return generate_df(experiments)

experiments_df = generate_granular_df("../../results/3_martin")

In [13]:
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import cochrans_q, mcnemar
from itertools import combinations

def analyze_model_results(df, results_column='Semantical', benchmark="Defects4J", alpha=0.05):
    """
    Perform statistical analysis on binary results from multiple models and display results in a binary table.
    """
    print(f"Analyzing {results_column} for {benchmark}")

    # Keep only the benchmark we are interested in
    df = df[df['Benchmark'] == benchmark]

    # Extract binary vectors and build dataframe
    ext_df = pd.DataFrame(np.vstack(df[results_column].values), index=df.index)

    # Compute cochran's q
    q_stat, q_p_value, _ = cochrans_q(ext_df.T, return_object=False)
    print(f"Cochran's Q: {q_stat:.2f}, p-value: {q_p_value:.4f}")

    # Create empty dataframe for p-values and effect size
    p_values = np.zeros((len(df.index), len(df.index)))
    effect_size = np.zeros((len(df.index), len(df.index)))
    
    # Compute mcnemar's tests
    for i, j in combinations(df.index, 2):
        cur_df = ext_df.T[[i, j]]

        # Compute contingency table
        a = sum(cur_df[i] & cur_df[j])
        b = sum(~cur_df[i] & cur_df[j])
        c = sum(cur_df[i] & ~cur_df[j])
        d = sum(~cur_df[i] & ~cur_df[j])

        # Compute mcnemar's test
        bunch = mcnemar(np.array([[a, b], [c, d]]))
        p_values[i - len(df.index), j - len(df.index)] = bunch.pvalue
        p_values[j - len(df.index), i - len(df.index)] = bunch.pvalue

        # Compute effect size (cohen's g)
        g = np.sqrt((a*d - b*c)**2 / ((a+b)*(c+d)*(a+c)*(b+d)))
        effect_size[i - len(df.index), j - len(df.index)] = g
        effect_size[j - len(df.index), i - len(df.index)] = g

    # Create binary significance matrix
    sig_matrix = p_values < alpha
    
    # Create DataFrame for better visualization
    model_names = [f"{row['Model']} ({row['Repr']})" for _, row in df.iterrows()]
    sig_df = pd.DataFrame(sig_matrix, 
                         index=model_names,
                         columns=model_names)
    effect_size_df = pd.DataFrame(effect_size,
                                  index=model_names,
                                  columns=model_names)

    # Print df as markdown table
    print(f"Significance matrix ({results_column})")
    print(sig_df.to_markdown())
    print(f"Effect size matrix ({results_column})")
    print(effect_size_df.to_markdown())

analyze_model_results(experiments_df, "Exact", "Defects4J")
analyze_model_results(experiments_df, "AST", "Defects4J")
analyze_model_results(experiments_df, "Semantical", "Defects4J")
analyze_model_results(experiments_df, "Plausible", "Defects4J")

analyze_model_results(experiments_df, "Exact", "HumanEval-Java")
analyze_model_results(experiments_df, "AST", "HumanEval-Java")
analyze_model_results(experiments_df, "Semantical", "HumanEval-Java")
analyze_model_results(experiments_df, "Plausible", "HumanEval-Java")

Analyzing Exact for Defects4J
Cochran's Q: 492.20, p-value: 0.0000
Significance matrix (Exact)
|                           |   codellama (ir3_or2) |   codellama-ir4 (ir4_or2) |   gpt35 (gpt) |   gpt4 (gpt) |   repairllama (ir1_or1) |   repairllama (ir1_or3) |   repairllama (ir1_or4) |   repairllama (ir2_or2) |   repairllama (ir3_or2) |   repairllama (ir4_or2) |   repairllama-fft (ir4_or2) |
|:--------------------------|----------------------:|--------------------------:|--------------:|-------------:|------------------------:|------------------------:|------------------------:|------------------------:|------------------------:|------------------------:|----------------------------:|
| codellama (ir3_or2)       |                     1 |                         0 |             1 |            0 |                       1 |                       1 |                       1 |                       1 |                       1 |                       1 |                           0 |
| codell

## Semantic equivalence assessement

Another interesting thing to look at is the agreement between raters (André, Sen, Martin).

The following process was adopted during manual patch assessement:
1. André and Sen both analyse all plausible patches independently
2. André and Sen's results are merged, with the patches whose assessement is not agreed upon being flagged
3. Martin looks at the flagged patches and breaks the tie (note that if Martin selects one patch as equivalent, the possible remaining flagged patches in the same bug are skipped since they won't change the result)

We now want to look at the agreement between:
1. André and Sen (across all patches)
2. André and Martin (across the patches Martin looked at)
3. Sen and Martin (across the patches Martin looked at)

For this we compute Cohen's kappa.

In [6]:
from pathlib import Path

andre_experiments = {}
for file_path in Path("../../results/1_andre").glob("*.jsonl"):
    experiment_name = file_path.stem.replace("_andre", "")
    andre_experiments[experiment_name] = read_jsonl_file(file_path)

sen_experiments = {}
for file_path in Path("../../results/1_sen").glob("*.jsonl"):
    experiment_name = file_path.stem.replace("_sen", "")
    sen_experiments[experiment_name] = read_jsonl_file(file_path)

martin_experiments = {}
for file_path in Path("../../results/3_martin").glob("*.jsonl"):
    experiment_name = file_path.stem.replace("_martin", "")
    martin_experiments[experiment_name] = read_jsonl_file(file_path)

# Ensure that the experiments are the same
assert set(andre_experiments.keys()) == set(sen_experiments.keys()) == set(martin_experiments.keys())

# We now need to match the experiments between raters. We want to get a single list with tuples of the form (experiment_andre, experiment_sen, experiment_martin, experiment_merged)
experiments = []
for experiment in andre_experiments.keys():
    experiments.append((experiment, andre_experiments[experiment], sen_experiments[experiment], martin_experiments[experiment]))

assert len(experiments) == 22

In [7]:
from sklearn.metrics import cohen_kappa_score

def get_bug_labels_first_pass(bug):
    labels = []
    if "evaluation" in bug and bug["evaluation"] != None:
        # Skip bugs that were not evaluated
        if any(x["exact_match"] or x["ast_match"] for x in bug["evaluation"]) or not any(x["test"] for x in bug["evaluation"]):
            return labels

        for i, evaluation in enumerate(bug["evaluation"]):
            # Skip if the patch is not plausible
            if not evaluation["test"]:
                continue

            assert "semantical_match" in evaluation, f"Missing semantical_match for {bug['identifier']}"
            labels.append((i, evaluation["semantical_match"]))
    
    return labels

def get_labels_first_pass(exp_a, exp_b):
    # Sort bugs by bug-id so we get the same order of labels for everyone
    exp_a = sorted(exp_a, key=lambda x: x["identifier"])
    exp_b = sorted(exp_b, key=lambda x: x["identifier"])

    # Ensure they have the same bugs
    assert len(exp_a) == len(exp_b), f"Number of bugs do not match ({len(exp_a)} vs {len(exp_b)})"
    assert all(a["identifier"] == b["identifier"] for a, b in zip(exp_a, exp_b))

    labels_a = []
    labels_b = []
    for bug_a, bug_b in zip(exp_a, exp_b):
        labels_bug_a = get_bug_labels_first_pass(bug_a)
        labels_bug_b = get_bug_labels_first_pass(bug_b)
        if len(labels_bug_a) == len(labels_bug_b) and all([a[0] == b[0] for a, b in zip(labels_bug_a, labels_bug_b)]):
            labels_a.extend([a[1] for a in labels_bug_a])
            labels_b.extend([b[1] for b in labels_bug_b])
        else:
            # HACK: In some of Sen's files, the original evaluation did not include any exact_match or ast_match
            # This means that the get_labels function_first_pass will return more patches than it should.
            # To overcome this we only keep the patches that André looked at by removing the patches only Sen (potentially) looked at
            labels_a.extend(labels_bug_a)
            labels_b.extend([b[1] for b in labels_bug_b if b[0] in [a[0] for a in labels_bug_a]])

    return labels_a, labels_b

def compute_kappa(experiments):
    # First look at the agreement between André and Sen
    andre_labels = []
    sen_labels = []

    # We will only consider bugs that have been evaluated by both André and Sen
    for experiment, andre_experiment, sen_experiment, _ in experiments:
        # print(f"Computing kappa for {experiment}")
        labels_a, labels_b = get_labels_first_pass(andre_experiment, sen_experiment)
        andre_labels.extend(labels_a)
        sen_labels.extend(labels_b)

    assert len(andre_labels) == len(sen_labels), f"Number of bugs evaluated by André and Sen do not match ({len(andre_labels)} vs {len(sen_labels)})"

    kappa = cohen_kappa_score(andre_labels, sen_labels)
    print(f"Number of patches evaluated by both André and Sen: {len(andre_labels)}")
    print(f"Kappa between André and Sen: {kappa}")
    print(f"Raw agreement between André and Sen: {sum(1 for a, b in zip(andre_labels, sen_labels) if a == b) / len(andre_labels)}")

compute_kappa(experiments)

Number of patches evaluated by both André and Sen: 2547
Kappa between André and Sen: 0.7021540700176891
Raw agreement between André and Sen: 0.8511974872398901


In [8]:
from sklearn.metrics import cohen_kappa_score

def get_bug_labels_martin_pass(bug):
    labels = []
    if "evaluation" in bug and bug["evaluation"] != None:
        # Skip bugs that were not evaluated
        if any(x["exact_match"] or x["ast_match"] for x in bug["evaluation"]) or not any(x["test"] for x in bug["evaluation"]):
            return labels

        for i, evaluation in enumerate(bug["evaluation"]):
            # Skip if the patch is not plausible
            if not evaluation["test"]:
                continue

            # Skip those that are still disagree, which means Martin did not evaluate
            if evaluation["semantical_match"] == "Disagree":
                continue

            if evaluation["semantical_match"] == True:
                labels.append((i, True))
                break

            labels.append((i, evaluation["semantical_match"]))
    
    return labels

def get_labels_second_pass(exp_a, exp_b, exp_c):
    # Sort bugs by bug-id so we get the same order of labels for everyone
    exp_a = sorted(exp_a, key=lambda x: x["identifier"])
    exp_b = sorted(exp_b, key=lambda x: x["identifier"])
    exp_c = sorted(exp_c, key=lambda x: x["identifier"])

    # Ensure they have the same bugs
    assert len(exp_a) == len(exp_b) == len(exp_c), f"Number of bugs do not match ({len(exp_a)} vs {len(exp_b)} vs {len(exp_c)})"
    assert all(a["identifier"] == b["identifier"] == c["identifier"] for a, b, c in zip(exp_a, exp_b, exp_c))

    labels_a = []
    labels_b = []
    labels_c = []
    for bug_a, bug_b, bug_c in zip(exp_a, exp_b, exp_c):
        labels_bug_a = get_bug_labels_first_pass(bug_a)
        labels_bug_b = get_bug_labels_first_pass(bug_b)
        labels_bug_c = get_bug_labels_martin_pass(bug_c)
        # HACK: same as above, plus we only keep those which have different labels in André and Sen's eval round
        labels_bug_b = [b for b in labels_bug_b if b[0] in [a[0] for a in labels_bug_a] and b[0] in [c[0] for c in labels_bug_c] and b[1] != {a[0]: a[1] for a in labels_bug_a}[b[0]]]
        labels_bug_a = [a for a in labels_bug_a if a[0] in [b[0] for b in labels_bug_b]]
        labels_bug_c = [c for c in labels_bug_c if c[0] in [a[0] for a in labels_bug_a]]

        # Extend
        labels_a.extend([a[1] for a in labels_bug_a])
        labels_b.extend([b[1] for b in labels_bug_b])
        labels_c.extend([c[1] for c in labels_bug_c])

    return labels_a, labels_b, labels_c

def compute_kappa(experiments):
    andre_labels = []
    sen_labels = []
    martin_labels = []

    for experiment, andre_experiment, sen_experiment, martin_experiment in experiments:
        # print(f"Computing kappa for {experiment}")
        labels_a, labels_b, labels_c = get_labels_second_pass(andre_experiment, sen_experiment, martin_experiment)
        andre_labels.extend(labels_a)
        sen_labels.extend(labels_b)
        martin_labels.extend(labels_c)

    assert len(andre_labels) == len(sen_labels), f"Number of bugs evaluated by André and Sen do not match ({len(andre_labels)} vs {len(sen_labels)})"
    assert len(andre_labels) == len(martin_labels), f"Number of bugs evaluated by André and Martin do not match ({len(andre_labels)} vs {len(martin_labels)})"
    assert len(sen_labels) == len(martin_labels), f"Number of bugs evaluated by Sen and Martin do not match ({len(sen_labels)} vs {len(martin_labels)})"

    kappa_andre_sen = cohen_kappa_score(andre_labels, sen_labels, labels=[True, False])
    kappa_andre_martin = cohen_kappa_score(andre_labels, martin_labels, labels=[True, False])
    kappa_sen_martin = cohen_kappa_score(sen_labels, martin_labels, labels=[True, False])
    print(f"Number of disagreeing patches evaluated by Martin: {len(andre_labels)}")
    print(f"Kappa between André and Sen: {kappa_andre_sen}")
    print(f"Kappa between André and Martin: {kappa_andre_martin}")
    print(f"Kappa between Sen and Martin: {kappa_sen_martin}")

    print(f"Raw agreement between André and Sen: {sum(a == b for a, b in zip(andre_labels, sen_labels)) / len(andre_labels)}")
    print(f"Raw agreement between André and Martin: {sum(a == b for a, b in zip(andre_labels, martin_labels)) / len(andre_labels)}")
    print(f"Raw agreement between Sen and Martin: {sum(a == b for a, b in zip(sen_labels, martin_labels)) / len(andre_labels)}")

compute_kappa(experiments)

Number of disagreeing patches evaluated by Martin: 248
Kappa between André and Sen: -0.8751219512195123
Kappa between André and Martin: 0.03241296518607428
Kappa between Sen and Martin: -0.024793388429751984
Raw agreement between André and Sen: 0.0
Raw agreement between André and Martin: 0.5806451612903226
Raw agreement between Sen and Martin: 0.41935483870967744


In [9]:
def print_multi_loc_bugs():
    multi_loc_bugs = read_multi_loc_bugs("multi-loc-bugs.txt")
    experiments = []
    for file_path in Path("../../results/3_martin").glob("*.jsonl"):
        if "ir4_or2" in file_path.stem:
            experiments.append((file_path.stem, read_jsonl_file(file_path)))

    for file_path, bugs in experiments:
        print(f"Experiment: {file_path}")
        correct_multi_loc = list(bug for bug in bugs if "evaluation" in bug and bug["evaluation"] != None and bug["identifier"] in multi_loc_bugs and (
           any("semantical_match" in patch and patch["semantical_match"] == True for patch in bug["evaluation"])
        or any(patch["exact_match"] for patch in bug["evaluation"])
        or any(patch["ast_match"] for patch in bug["evaluation"])
        ))
        print(f"Number of correct multi-loc bugs: {len(correct_multi_loc)}")
        for bug in sorted(correct_multi_loc, key=lambda x: x["identifier"]):
            print(f"{bug['identifier']}")

print_multi_loc_bugs()

Experiment: evaluation_defects4j_repairllama_ir4_or2_martin
Number of correct multi-loc bugs: 35
Chart-26
Chart-4
Chart-7
Closure-101
Closure-102
Closure-115
Closure-124
Closure-128
Closure-13
Compress-32
Compress-44
Compress-45
Compress-7
Csv-5
Csv-6
Gson-16
Gson-6
JacksonDatabind-24
JacksonDatabind-47
JacksonDatabind-49
JacksonDatabind-54
JacksonDatabind-67
JacksonDatabind-83
Jsoup-49
Jsoup-6
Jsoup-64
Jsoup-80
Jsoup-85
Lang-10
Math-3
Math-72
Math-79
Math-8
Math-86
Math-95
Experiment: evaluation_humanevaljava_repairllama_ir4_or2_martin
Number of correct multi-loc bugs: 13
ANTI_SHUFFLE
CORRECT_BRACKETING
COUNT_UP_TO
FLIP_CASE
GREATEST_COMMON_DIVISOR
IS_PALINDROME
LARGEST_SMALLEST_INTEGERS
MODP
NUMERICAL_LETTER_GRADE
RESCALE_TO_UNIT
SORT_ARRAY
STRONGEST_EXTENSION
X_OR_Y
