In [None]:
import json
from typing import List
# TODO: Add pathlib, p.glob('**/*') to get all files in a folder


# file_path = ""
# file_path = "data/humaneval-java/candidates_HumanEvalJava_sigonly-instruct_google_model_name=gemini-1.5-flash_max_length=4096_generation_strategy=beam_search_num_return_sequences=1.jsonl"
file_path = "data/defects4j/candidates_Defects4J_sigonly-instruct_google_model_name=gemini-1.5-flash_max_length=4096_generation_strategy=beam_search_num_return_sequences=1.jsonl"

def read_jsonl(file_path) -> List[dict]:
    with open(file_path, 'r') as f:
        lines = f.readlines()
    return [json.loads(line) for line in lines]

import os
import subprocess

def is_github_url(url: str) -> bool:
    return "github.com" in url

def git_clone(url: str, folder: str) -> bool:
    res = subprocess.run(["git", "clone", url, folder, "--quiet"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return res.returncode == 0

def search_string_in_files(folder_path, search_string):
    """
    Recursively searches for a string in all files within a folder and its subfolders.

    Args:
        folder_path (str): The path to the folder to search in.
        search_string (str): The string to search for.

    Returns:
        list: A list of file paths containing the search string.
    """
    matching_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    if search_string in f.read():
                        matching_files.append(file_path)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
    return matching_files


In [None]:
from tqdm.notebook import tqdm

data = read_jsonl(file_path)
label_set = set()
uri_map = {}
labels = []
for d in tqdm(data, desc="Processing data"):
    if not d.get("generation", None):
        continue
    candidates = d["generation"][0]["candidates"][0]
    if not "citation_metadata" in candidates:
        continue
    sources = candidates["citation_metadata"]["citation_sources"]
    for source in sources:
        if not "uri" in source:
            continue
        uri: str = source["uri"]
        if not is_github_url(uri):
            continue
        label = uri.removeprefix("https://github.com/")
        label = label.removesuffix(".git")
        label = label.replace("/", "_")
        if label in label_set:
            continue
        label_set.add(label)
        labels.append(label)
        uri_map[label] = uri


Processing data:   0%|          | 0/835 [00:00<?, ?it/s]

In [3]:
import shutil
from tqdm.notebook import tqdm
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


repo_file_matches = {}
tmp_dir = "tmprepo"

def exec(label):
    uri = uri_map[label]
    tmp_repo_path = os.path.join(tmp_dir, label)
    # # Check if repo already exists and is a valid repo
    # if not os.path.exists(tmp_repo_path):        
    #     # Clone repository
    #     success = git_clone(uri, tmp_repo_path)
    #     if not success:
    #         return

    # Check for string in tmp_path
    if not os.path.exists(tmp_repo_path):
        return

    intersection_buggy_str = "    public static String intersection(int[] interval1, int[] interval2) {\n        int l = Math.min(interval1[0], interval2[0]);\n        int r = Math.max(interval1[1], interval2[1]);\n        int length = r - l;\n        if(length > 0 && is_prime(length)) return \"YES\";\n        return \"NO\";\n    }\n"
    intersection_matching_files = search_string_in_files(tmp_repo_path, intersection_buggy_str)
    decodeshift_buggy_str = "    public static String decode_shift(String string) {\n        StringBuilder sb = new StringBuilder();\n        for (char c : string.toCharArray()) {\n            c = (char)((((int) c - 5 - (int)('a')) % 26) + (int)('a'));\n            sb.append(c);\n        }\n        return sb.toString();\n    }\n"
    decodeshift_matching_files = search_string_in_files(tmp_repo_path, decodeshift_buggy_str)
    if intersection_matching_files and decodeshift_matching_files:
        repo_file_matches[label] = intersection_matching_files + decodeshift_matching_files

with ThreadPoolExecutor() as executor:

    futures = {executor.submit(exec, label): label for label in labels}

    for future in tqdm(as_completed(futures), total=len(labels)):
        future.result()
        

repo_file_matches

  0%|          | 0/159 [00:00<?, ?it/s]

Error reading tmprepo/syndesisio_syndesis/doc/developing_extensions/images/customizing: [Errno 2] No such file or directory: 'tmprepo/syndesisio_syndesis/doc/developing_extensions/images/customizing'
Error reading tmprepo/syndesisio_syndesis/doc/customizing/images/customizing: [Errno 2] No such file or directory: 'tmprepo/syndesisio_syndesis/doc/customizing/images/customizing'
Error reading tmprepo/syndesisio_syndesis/app/ui-react/docker/build: [Errno 2] No such file or directory: 'tmprepo/syndesisio_syndesis/app/ui-react/docker/build'
Error reading tmprepo/syndesisio_syndesis/doc/developing_extensions/images/customizing: [Errno 2] No such file or directory: 'tmprepo/syndesisio_syndesis/doc/developing_extensions/images/customizing'
Error reading tmprepo/syndesisio_syndesis/doc/customizing/images/customizing: [Errno 2] No such file or directory: 'tmprepo/syndesisio_syndesis/doc/customizing/images/customizing'
Error reading tmprepo/syndesisio_syndesis/app/ui-react/docker/build: [Errno 2]

{}