In [28]:
import json
from typing import List
# TODO: Add pathlib, p.glob('**/*') to get all files in a folder
import os
import subprocess
from pathlib import Path
from tqdm.notebook import tqdm
from collections import defaultdict

tmp_dir = "tmprepo"

def read_jsonl(file_path) -> List[dict]:
    with open(file_path, 'r') as f:
        lines = f.readlines()
    return [json.loads(line) for line in lines]

def is_github_url(url: str) -> bool:
    return "github.com" in url

def git_clone(url: str, folder: str) -> bool:
    res = subprocess.run(["git", "clone", url, folder, "--quiet"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return res.returncode == 0

def search_string_in_files(folder_path, search_string):
    """
    Recursively searches for a string in all files within a folder and its subfolders.

    Args:
        folder_path (str): The path to the folder to search in.
        search_string (str): The string to search for.

    Returns:
        list: A list of file paths containing the search string.
    """
    matching_files = []
    file_mathes = Path(folder_path).rglob('*.java')
    for file_path in file_mathes:
        if file_path.is_dir():
            continue
        # # path contains
        if any(p in [".git", "dists", "resource", "com"] for p in file_path.parts):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                if search_string in f.read():
                    matching_files.append(file_path)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    return matching_files

def exec(bug: str):
    bug_solution_str = bug_solution_str_map[bug]
    uris = bug_uri_map[bug]
    for uri in uris:
        label = uri_tag_map[uri]
        tmp_repo_path = os.path.join(tmp_dir, label)
        # # Check if repo already exists and is a valid repo
        # if not os.path.exists(tmp_repo_path):
        #     # Clone repository
        #     success = git_clone(uri, tmp_repo_path)
        #     if not success:
        #         return

        # Check for string in tmp_path
        if not os.path.exists(tmp_repo_path):
            return

        matching_files = search_string_in_files(tmp_repo_path, bug_solution_str)
        if not matching_files:
            continue
        repo_file_matches[bug].append(label)


In [29]:
# Generate mappings between bugs and repositories

benchmarks = [
    ("defect4j", "data/defects4j/candidates_Defects4J_sigonly-instruct_google_model_name=gemini-1.5-flash_max_length=4096_generation_strategy=beam_search_num_return_sequences=1.jsonl"),
]

uri_cache = set()
uri_tag_map = {}  # repo_uri -> repo_tag
bugs = []  # List of bug ids
bug_uri_map = defaultdict(list)  # bug_id -> [repo_uri]
bug_solution_str_map = {}  # bug_id -> fixed_bug_str
for benchmark_label, path in benchmarks:
    data = read_jsonl(path)
    for d in tqdm(data, desc="Generate bug and repo uri mappings"):
        bug_id = f"{benchmark_label}-{d['identifier']}"
        # Get bug to fix string mapping
        fixed_code = d["fixed_code"]
        if not fixed_code:
            continue
        bug_solution_str_map[bug_id] = fixed_code

        # Get github urls
        generations = d.get("generation", None)
        if not generations:
            continue
        candidates = generations[0]["candidates"][0]
        # If no matches, skip
        if not "citation_metadata" in candidates:
            continue
        sources = candidates["citation_metadata"]["citation_sources"]
        # Go though sources
        for source in sources:
            # If source has no uri or is not a github url, skip
            if not "uri" in source:
                continue
            uri: str = source["uri"]
            if not is_github_url(uri):
                continue
            if uri in uri_cache:
                continue
            uri_cache.add(uri)
            # Create repo tag
            repo_tag = uri.removeprefix("https://github.com/")
            repo_tag = repo_tag.removesuffix(".git")
            repo_tag = repo_tag.replace("/", "_")
            uri_tag_map[uri] = repo_tag
            # Add repo uri match to bug
            bug_uri_map[bug_id].append(uri)
        
        # Add bug to list of bugs with repos to be scanned if there are any repos
        if not bug_uri_map[bug_id]:
            continue
        bugs.append(bug_id)


Generate bug and repo uri mappings:   0%|          | 0/835 [00:00<?, ?it/s]

In [22]:
# Execute scans on single repository

from pathlib import Path

tmp_dir = "tmprepo"

label = uri_tags[2]
print(label)
uri = uri_map[label]
print(uri)
tmp_repo_path = os.path.join(tmp_dir, label)
print(tmp_repo_path)
# Get all files in folder
files = Path(tmp_repo_path).rglob('*.java')
for file in files:
    if file.is_dir():
        continue
    # # path contains
    if any(p in [".git", "dists", "com"] for p in file.parts):
        continue
    # if file.name in [".gitignore", ".DS_Store", "LICENSE"]:
    #     continue
    # if file.suffix in [".png", ".jpg", ".md", ".pdf", ".xlsx", ".mod", ".xml", ".conf", ".html", ".class", ".sh", ".pem"]:
    #     continue


defect4j-Mockito-28
https://github.com/lx0704/Defects4J-Maven


NameError: name 'label' is not defined

In [30]:
# Execute search on repositories mapped to bugs

import shutil
from tqdm.notebook import tqdm
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

repo_file_matches = defaultdict(list)

with ThreadPoolExecutor() as executor:

    futures = {executor.submit(exec, bug): bug for bug in bugs}

    for future in tqdm(as_completed(futures), total=len(bugs), desc="Search for bug solution instances in repositories"):
        future.result()


repo_file_matches

  0%|          | 0/129 [00:00<?, ?it/s]

defaultdict(list,
            {'TruX-DTF_FL-VS-APR': ['defect4j-Lang-5'],
             'SpoonLabs_gumtree-spoon-ast-diff': ['defect4j-Lang-57'],
             'GumTreeDiff_datasets': ['defect4j-JacksonDatabind-96'],
             'ali-ghanbari_shibboleth-demo': ['defect4j-Time-25'],
             'SpoonLabs_nopol-experiments': ['defect4j-Math-96'],
             'SpoonLabs_coming': ['defect4j-Math-97'],
             'easy-software-ufal_nimrod-hunor-subjects': ['defect4j-Codec-10'],
             'OpenJML_OpenJML': ['defect4j-Math-17']})

In [None]:
# Group bugs into a single aggregate benchmark of flagged and non-flagged bugs