In [None]:
%pip install tqdm

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
from os import path
from sys import path as sys_path
# Make the project root (the folder above this notebook's working directory)
# importable so the `models` and `utils` packages below can be resolved.
parent_dir = path.abspath('..')
if parent_dir not in sys_path:
    sys_path.append(parent_dir)
    # basename() isolates the folder name on every OS. It also avoids reusing
    # double quotes and a backslash inside the f-string expression, which is a
    # SyntaxError on Python versions before 3.12.
    print(f"Added {path.basename(parent_dir)} to sys.path")
from models.commit import Commit
from models.file import File
from models.cf import CommitFile, MetadataHelper
from models.hunk import Hunk
from utils.worker import get_optimal_max_workers
from typing import List

In [None]:
# Commits to process, as returned by Commit.fetch_all_commits().
commits = Commit.fetch_all_commits()
# Root folder, relative to the working directory; per-repository paths are
# built from it as <parent_folder>/<org_name>/<repo_name>.
parent_folder = path.join('..', 'download', 'orgs')
# Pool size passed as max_workers to every ThreadPoolExecutor in this notebook.
max_workers = get_optimal_max_workers()

In [None]:
def process_hunks(hunks: List['Hunk']):
    """Store a list of hunks through ``Hunk.add_hunks_in_batches``.

    The call is made directly in the calling thread. Previously the work was
    submitted to a freshly created thread pool whose ``with`` block waited for
    it to finish anyway, and the returned future was dropped, so any exception
    raised while storing the hunks was silently lost. Calling directly keeps
    the same blocking behaviour while letting errors reach the caller.

    Args:
        hunks: Hunk objects to store. An empty list is a no-op.

    Raises:
        Any exception raised by ``Hunk.add_hunks_in_batches``.
    """
    if not hunks:
        return
    Hunk.add_hunks_in_batches(hunks)

In [None]:
def create_hunks(file_names: List[str], com: Commit, parent_dir: str):
    """Build ``Hunk`` objects for the given files of one commit and store them.

    For every name in ``file_names`` a ``File.get_file_content`` call is
    submitted to a thread pool. As each call finishes, the second element of
    its result is taken as the file name, ``CommitFile.get_metadata`` is
    queried for that file, and one ``Hunk`` is created per metadata entry.
    If any hunks were created they are passed to ``process_hunks``.

    Args:
        file_names: Names of the files to handle for this commit.
        com: Commit supplying ``org_name``, ``repo_name`` and ``sha``.
        parent_dir: Root folder; the repository path is
            ``<parent_dir>/<org_name>/<repo_name>``.

    Raises:
        Any exception raised by a ``File.get_file_content`` call (re-raised
        by ``Future.result()``), by ``CommitFile.get_metadata`` or by
        ``process_hunks``.
    """
    repo_path = path.join(parent_dir, com.org_name, com.repo_name)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [
            pool.submit(File.get_file_content, repo_path, com.sha, requested_name)
            for requested_name in file_names
        ]

        collected = []
        for finished in as_completed(pending):
            # The result is unpacked as a pair; only its second element (the
            # file name) is used, the first is discarded.
            _unused, fetched_name = finished.result()
            entries = CommitFile.get_metadata(
                com.org_name, com.repo_name, com.sha, fetched_name
            )
            collected.extend(
                Hunk(
                    None,
                    fetched_name,
                    com.repo_name,
                    com.org_name,
                    com.sha,
                    entry.old_start,
                    entry.old_length,
                    entry.new_start,
                    entry.new_length,
                    entry.lines,
                    entry.old_name,
                    entry.new_name,
                )
                for entry in entries
            )

        # Nothing is stored when no file produced any metadata entries.
        if collected:
            process_hunks(collected)

In [None]:
# Collects (file_names, commit) pairs for commits whose file-name lookup
# returned a non-empty result; filled in by a later cell.
hunk_candidates = []

In [None]:
# Rebind the list on every run so re-executing this cell does not append the
# same candidates a second time.
hunk_candidates = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Map each future back to its commit so results and failures can be
    # attributed to the right commit.
    futures = {
        executor.submit(
            Commit.get_file_names_from_commit,
            path.join(parent_folder, com.org_name, com.repo_name),
            com.sha,
        ): com
        for com in commits
    }
    for future in tqdm(as_completed(futures), total=len(futures), desc="Generating hunk candidates"):
        com = futures[future]
        try:
            file_names = future.result()
        except Exception as e:
            # Best effort: report the failing commit and keep going.
            print(f"Error processing commit {com.sha}: {e}")
            continue
        # Commits without any file names are not candidates.
        if file_names:
            hunk_candidates.append((file_names, com))
print(len(hunk_candidates), "candidates")

In [None]:
# Run create_hunks for every candidate in a thread pool and report failures.
if hunk_candidates:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Keep the commit next to each future so a failure can be attributed.
        futures = {
            executor.submit(create_hunks, files, com, parent_folder): com
            for files, com in tqdm(hunk_candidates, desc="Creating hunks", total=len(hunk_candidates))
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Completed futures"):
            com = futures[future]
            try:
                # create_hunks has no return value; result() is called only so
                # an exception raised inside the worker is surfaced here
                # instead of being silently dropped with the future.
                future.result()
            except Exception as e:
                print(f"Error creating hunks for commit {com.sha}: {e}")