In [None]:
%pip install tqdm

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
from os import path
from sys import path as sys_path
# Put the directory above the current working directory on sys.path so that the
# project-local imports that follow (models.*, utils.*) can presumably be resolved.
parent_dir = path.abspath(path.join('..'))
if parent_dir not in sys_path:
    sys_path.append(parent_dir)
    # path.basename extracts the folder name on every OS, and avoids reusing the
    # f-string's own quote character inside the replacement field (which is a
    # SyntaxError on Python versions before 3.12).
    print(f"Added {path.basename(parent_dir)} to sys.path")
from models.commit import Commit
from models.cf import CommitFile, MetadataHelper
from utils.worker import get_optimal_max_workers
from models.file import File
from typing import List

In [None]:
# Shared inputs for the cells below: the commits to process, the base folder
# under which repositories are addressed as <org_name>/<repo_name>, and the
# thread budget.
commits = Commit.fetch_all_commits()
parent_folder = path.join('..', 'download', 'orgs')
# Keep 5% of the suggested worker count in reserve, but never drop below one
# worker: truncation would turn a suggestion of 1 into 0, and
# ThreadPoolExecutor raises ValueError for max_workers <= 0.
max_workers = get_optimal_max_workers()
max_workers = max(1, int(max_workers - max_workers * 0.05))

In [None]:
def create_cfs(file_names: List[str], com: Commit, parent_dir: str):
    """Build CommitFile records for one commit's files and persist them.

    File contents are fetched concurrently through ``File.get_file_content``.
    As each fetch completes, ``CommitFile.get_metadata`` is queried for that
    file and one ``CommitFile`` is built per metadata entry. All records are
    then handed to ``CommitFile.add_cfs_in_batches`` in a single call.

    Args:
        file_names: Names of the files to process for this commit.
        com: Commit supplying ``org_name``, ``repo_name`` and ``sha``.
        parent_dir: Directory under which the repository is located as
            ``<org_name>/<repo_name>``.

    Raises:
        Exception: Anything raised by ``File.get_file_content`` or
            ``CommitFile.get_metadata`` propagates to the caller; in that case
            ``add_cfs_in_batches`` is never reached for this commit.
    """
    if not file_names:
        return

    repo_path = path.join(parent_dir, com.org_name, com.repo_name)

    # ThreadPoolExecutor documents max_workers as a positive integer; use half
    # of the module-level budget, truncated, and never fewer than one thread.
    fetch_workers = max(1, int(max_workers * 0.5))

    db_cfs = []
    with ThreadPoolExecutor(max_workers=fetch_workers) as executor:
        futures = [
            executor.submit(File.get_file_content, repo_path, com.sha, name)
            for name in file_names
        ]

        for future in as_completed(futures):
            # get_file_content hands back the name alongside the content, so
            # results can be consumed in completion order.
            file_content, file_name = future.result()
            # NOTE(review): the result is iterated below, so it is treated as a
            # collection of MetadataHelper entries - confirm against get_metadata.
            metadata_list: List[MetadataHelper] = CommitFile.get_metadata(
                com.org_name, com.repo_name, com.sha, file_name
            )
            for metadata in metadata_list:
                db_cfs.append(
                    CommitFile(
                        com.repo_name,
                        com.org_name,
                        file_name,
                        com.sha,
                        file_content,
                        metadata.change_type,
                        metadata.file_mode,
                        metadata.index_info,
                    )
                )

    # Skip the database call entirely when nothing was produced.
    if db_cfs:
        CommitFile.add_cfs_in_batches(db_cfs)

In [None]:
# Collects one (file_names, commit) pair per commit that has files to process.
new_cf_candidates = list()

In [None]:
# Start from an empty list so that re-running this cell does not append the
# same candidates a second time.
new_cf_candidates = []

# Ask git for the file names of every commit in parallel; each future is mapped
# back to its commit so results and errors can be attributed.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {
        executor.submit(
            Commit.get_file_names_from_git,
            path.join(parent_folder, com.org_name, com.repo_name),
            com.sha,
        ): com
        for com in commits
    }
    for future in tqdm(as_completed(futures), total=len(futures), desc="Generating cf candidates"):
        com = futures[future]
        try:
            file_names = future.result()
        except Exception as e:
            # Best effort: report the failing commit and keep going.
            print(f"Error processing commit {com.sha}: {e}")
            continue
        # Commits without any file names produce no candidate.
        if file_names:
            new_cf_candidates.append((file_names, com))
print(len(new_cf_candidates), "candidates")

In [None]:
# Create and store CommitFile rows for every candidate, one task per commit.
if new_cf_candidates:
    # ThreadPoolExecutor documents max_workers as a positive integer; use 80%
    # of the budget, truncated, and never fewer than one thread.
    cf_workers = max(1, int(max_workers * 0.8))
    with ThreadPoolExecutor(max_workers=cf_workers) as executor:
        # Map each future back to its commit so failures can be attributed.
        futures = {
            executor.submit(create_cfs, files, com, parent_folder): com
            for files, com in tqdm(new_cf_candidates, desc="Creating cfs", total=len(new_cf_candidates))
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Completed futures"):
            com = futures[future]
            try:
                # result() re-raises anything create_cfs raised; without this
                # call a failed commit would go completely unnoticed.
                future.result()
            except Exception as e:
                print(f"Error creating cfs for commit {com.sha}: {e}")