In [2]:
import json
from typing import List
# TODO: Add pathlib, p.glob('**/*') to get all files in a folder
import os
import subprocess
from pathlib import Path
from tqdm.notebook import tqdm
from collections import defaultdict

# Base directory holding the cloned repositories; exec() looks each repo up
# at os.path.join(tmp_dir, <repo label>).
tmp_dir = "tmprepo"

def read_jsonl(file_path) -> List[dict]:
    """
    Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the ``.jsonl`` file to read (one JSON document per line).

    Returns:
        list: The parsed JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream line by line instead of materialising the whole file first.
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing empty line) are not records; skip them
            # instead of letting json.loads fail on an empty document.
            if not line:
                continue
            records.append(json.loads(line))
    return records

def is_github_url(url: str) -> bool:
    """Return True when the text ``github.com`` occurs anywhere inside ``url``."""
    # Plain substring heuristic: no URL parsing is performed.
    position = url.find("github.com")
    return position != -1

def git_clone(url: str, folder: str) -> bool:
    """
    Clone a git repository quietly into ``folder``.

    Args:
        url (str): Repository URL to clone.
        folder (str): Destination directory for the clone.

    Returns:
        bool: True when ``git clone`` exits with status 0, False otherwise.
    """
    # Options go first and "--" ends option parsing, so a URL or folder that
    # starts with "-" (the URLs come from external citation data) is always
    # treated as a positional argument and never as a git option.
    command = ["git", "clone", "--quiet", "--", url, folder]
    res = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return res.returncode == 0

def search_string_in_files(folder_path, search_string):
    """
    Recursively searches for a string in all Java files within a folder and its subfolders.

    Files whose path contains a component named ``.git``, ``dists``,
    ``resource`` or ``com`` are skipped.

    Args:
        folder_path (str): The path to the folder to search in.
        search_string (str): The string to search for.

    Returns:
        list: A list of file paths (str) containing the search string.
    """
    # Path components that exclude a file from the search.
    # NOTE(review): "com" excludes every file under a `com/...` package
    # directory -- confirm this is intended.
    excluded_parts = {".git", "dists", "resource", "com"}

    matching_files = []
    for file in Path(folder_path).rglob('*.java'):
        # A directory can also be named "*.java"; only regular files are read.
        if file.is_dir():
            continue
        if any(part in excluded_parts for part in file.parts):
            continue
        try:
            # Read the file found by the walk (not the module-level `file_path`).
            # Undecodable bytes are dropped so odd encodings never abort the search.
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                if search_string in f.read():
                    matching_files.append(str(file))
        except OSError as e:
            # Best effort: report unreadable files and keep searching.
            print(f"Error reading {file}: {e}")
    return matching_files

def exec(bug: str):
    """
    Record every cited repository whose Java sources contain the fixed code of ``bug``.

    Reads the module-level ``bug_str_map`` (bug id -> fixed code),
    ``bug_uri_map`` (bug id -> cited URIs), ``uri_map`` (URI -> repo label) and
    ``tmp_dir``; appends ``bug`` to ``repo_file_matches[label]`` for each
    repository in which the fixed code is found.

    NOTE(review): this name shadows the built-in ``exec``; it is kept because
    other cells submit it by this name.

    Args:
        bug (str): Identifier of the bug to check.

    Returns:
        None
    """
    bug_str = bug_str_map[bug]
    uris = bug_uri_map[bug]
    for uri in uris:
        label = uri_map[uri]
        tmp_repo_path = os.path.join(tmp_dir, label)
        # # Check if repo already exists and is a valid repo
        # if not os.path.exists(tmp_repo_path):
        #     # Clone repository
        #     success = git_clone(uri, tmp_repo_path)
        #     if not success:
        #         continue

        # A repository that is not available locally must not stop the
        # remaining URIs of this bug from being checked: skip it and move on.
        if not os.path.exists(tmp_repo_path):
            continue

        matching_files = search_string_in_files(tmp_repo_path, bug_str)
        if not matching_files:
            continue
        repo_file_matches[label].append(bug)


In [3]:
# file_path = ""
# file_path = "data/humaneval-java/candidates_HumanEvalJava_sigonly-instruct_google_model_name=gemini-1.5-flash_max_length=4096_generation_strategy=beam_search_num_return_sequences=1.jsonl"
file_path = "data/defects4j/candidates_Defects4J_sigonly-instruct_google_model_name=gemini-1.5-flash_max_length=4096_generation_strategy=beam_search_num_return_sequences=1.jsonl"

data = read_jsonl(file_path)
uri_set = set()  # every GitHub URI seen so far, across all bugs
uri_map = {}  # uri -> filesystem-safe repo label
bugs = []  # ids of bugs that cite at least one GitHub URI
bug_uri_map = defaultdict(list)  # bug id -> GitHub URIs cited for that bug
bug_str_map = {}  # bug id -> fixed code to search for
for d in tqdm(data, desc="Processing data"):
    bug_id = d["identifier"]
    # skip bug if there is no fixed code to search for
    fixed_code = d["fixed_code"]
    if not fixed_code:
        continue
    bug_str_map[bug_id] = fixed_code

    # skip bug if no generation
    if not d.get("generation", None):
        continue
    candidates = d["generation"][0]["candidates"][0]
    if "citation_metadata" not in candidates:
        continue
    sources = candidates["citation_metadata"]["citation_sources"]
    for source in sources:
        if "uri" not in source:
            continue
        uri: str = source["uri"]
        if not is_github_url(uri):
            continue
        # Register each distinct URI (and its label) only once globally ...
        if uri not in uri_set:
            uri_set.add(uri)
            label = uri.removeprefix("https://github.com/")
            label = label.removesuffix(".git")
            label = label.replace("/", "_")
            uri_map[uri] = label
        # ... but link it to every bug that cites it. Previously a URI already
        # seen for an earlier bug was skipped entirely, so later bugs citing the
        # same repository were never checked against it.
        if uri not in bug_uri_map[bug_id]:
            bug_uri_map[bug_id].append(uri)

    # Membership test (not indexing) so the defaultdict does not grow an empty
    # entry for bugs without any GitHub citation.
    if bug_id not in bug_uri_map:
        continue
    bugs.append(bug_id)


Processing data:   0%|          | 0/835 [00:00<?, ?it/s]

In [None]:
from pathlib import Path

tmp_dir = "tmprepo"

# Scratch cell: inspect one cloned repository.
# `uri_map` is keyed by URI (uri -> label), so build the inverse mapping here
# instead of relying on a `labels` variable left over from an earlier kernel state.
label_uri_map = {repo_label: repo_uri for repo_uri, repo_label in uri_map.items()}
labels = list(label_uri_map)

label = labels[2]
print(label)
uri = label_uri_map[label]
print(uri)
tmp_repo_path = os.path.join(tmp_dir, label)
print(tmp_repo_path)
# Get all files in folder
files = Path(tmp_repo_path).rglob('*.java')
for file in files:
    if file.is_dir():
        continue
    # # path contains
    # NOTE(review): this list lacks "resource", which search_string_in_files
    # also excludes -- confirm which filter is intended.
    if any(p in [".git", "dists", "com"] for p in file.parts):
        continue
    # if file.name in [".gitignore", ".DS_Store", "LICENSE"]:
    #     continue
    # if file.suffix in [".png", ".jpg", ".md", ".pdf", ".xlsx", ".mod", ".xml", ".conf", ".html", ".class", ".sh", ".pem"]:
    #     continue


lx0704_Defects4J-Maven
https://github.com/lx0704/Defects4J-Maven
tmprepo/lx0704_Defects4J-Maven


In [None]:
import shutil
from tqdm.notebook import tqdm
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# Repo label -> list of bug ids; exec() appends to these lists from the
# worker threads started below.
repo_file_matches = defaultdict(list)

with ThreadPoolExecutor() as executor:

    # One exec() task per bug; the dict maps each future back to its bug id.
    futures = {executor.submit(exec, bug): bug for bug in bugs}

    # Iterate in completion order so the progress bar advances as tasks finish.
    for future in tqdm(as_completed(futures), total=len(bugs)):
        # result() re-raises any exception raised inside the worker.
        future.result()
        

# Last expression of the cell: display the collected matches.
repo_file_matches

  0%|          | 0/129 [00:00<?, ?it/s]