<div class="alert alert-info">
    <center><b>Set up Notebook</b></center>
</div>

In [None]:
%pip install pandas tqdm python-dotenv

In [None]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import pytz
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List

In [None]:
from os import path, listdir
from sys import path as sys_path
# Make the parent of the current working directory importable so the
# project-local `models` package imported below can be resolved.
parent_dir = path.abspath(path.join('..'))
if parent_dir not in sys_path:
    sys_path.append(parent_dir)
    # path.basename works with the platform's separator and avoids nesting
    # double quotes / backslashes inside the f-string expression, which is a
    # SyntaxError on Python versions before 3.12.
    print(f"Added {path.basename(parent_dir)} to sys.path")
from models.commit import Commit
from models.repository import Repository
from models.file import File
from models.hunk import Hunk
from models.cf import CommitFile, MetadataHelper

<div class="alert alert-info">
    <center><b>Add all methods</b></center>
</div>

In [None]:
def process_files(names: List[str], com: Commit):
    """Create and store a ``File`` record for each file name of a commit.

    Args:
        names: File names belonging to the commit.
        com: Commit supplying the repository and organization names.

    Returns:
        The list of ``File`` objects that were created, in input order.
    """
    stored = []
    for file_name in names:
        # Everything after the last dot, lower-cased (the whole name when
        # there is no dot at all).
        suffix = file_name.rsplit('.', 1)[-1].lower()
        record = File(file_name, com.repo_name, com.org_name, suffix)
        File.add_file(record)
        stored.append(record)
    return stored

In [None]:
def process_hunks(hunks: List['Hunk']):
    """Store every hunk concurrently through ``Hunk.add_hunk``.

    Each hunk is submitted to a thread pool and a progress bar tracks
    completion. A failure while storing one hunk is reported and does not
    prevent the remaining hunks from being stored.

    Args:
        hunks: Hunks to store.

    Returns:
        None
    """
    if not hunks:
        return

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {executor.submit(Hunk.add_hunk, hunk): hunk for hunk in hunks}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing hunks"):
            # result() re-raises whatever the worker raised; without this call
            # a failed insert would be discarded without any trace.
            try:
                future.result()
            except Exception as e:
                print(f"Error adding hunk: {e}")

In [None]:
def create_cfs_and_hunks(file_names: List[str], com: 'Commit', parent_dir: str):
    """Store a ``CommitFile`` and a ``Hunk`` for every file of a commit.

    File contents are read concurrently with ``File.get_file_content``; the
    metadata for each file comes from ``CommitFile.get_metadata``. Commit
    files are written with ``CommitFile.add_cfs_in_batches`` and hunks with
    ``process_hunks``.

    Args:
        file_names: Names of the files touched by the commit. Duplicates are
            ignored.
        com: Commit the files belong to.
        parent_dir: Folder containing the ``org/repo`` checkouts.

    Returns:
        None
    """
    # De-duplicate while keeping first-seen order.
    unique_names = list(dict.fromkeys(file_names or ()))
    if not unique_names:
        return

    processed_cfs = {
        (name, com.repo_name, com.org_name, com.sha) for name in unique_names
    }
    # Ask before scheduling any reads: when the answer is truthy the function
    # stores nothing, so fetching the file contents first would be wasted work
    # that the executor would still have to wait for.
    # NOTE(review): exists_in_batches is defined elsewhere - confirm whether
    # it means "all exist" or "any exists".
    if CommitFile.exists_in_batches(processed_cfs):
        return

    repo_path = path.join(parent_dir, com.org_name, com.repo_name)
    db_cfs = []
    db_hunks = []

    with ThreadPoolExecutor(max_workers=25) as executor:
        futures = [
            executor.submit(File.get_file_content, repo_path, com.sha, name)
            for name in unique_names
        ]
        for future in as_completed(futures):
            # A failed read propagates to the caller, as before.
            file_content, file_name = future.result()
            metadata: MetadataHelper = CommitFile.get_metadata(
                com.org_name, com.repo_name, com.sha, file_name
            )
            db_cfs.append(
                CommitFile(
                    com.repo_name,
                    com.org_name,
                    file_name,
                    com.sha,
                    file_content,
                    metadata.change_type,
                    metadata.file_mode,
                    metadata.index_info,
                )
            )
            # NOTE(review): a single Hunk is built per file from the metadata;
            # confirm that get_metadata is meant to describe only one hunk.
            db_hunks.append(
                Hunk(
                    None,
                    file_name,
                    com.repo_name,
                    com.org_name,
                    com.sha,
                    metadata.old_start,
                    metadata.old_length,
                    metadata.new_start,
                    metadata.new_length,
                    metadata.lines,
                    metadata.old_name,
                    metadata.new_name,
                )
            )

    if db_cfs:
        CommitFile.add_cfs_in_batches(db_cfs)
    if db_hunks:
        process_hunks(db_hunks)

In [None]:
def _find_git_repos(parent_folder: str) -> List[str]:
    """Return every ``parent_folder/org/repo`` directory containing ``.git``."""
    repo_paths = []
    for org_dir in listdir(parent_folder):
        org_path = path.join(parent_folder, org_dir)
        if not path.isdir(org_path):
            continue
        for repo_dir in listdir(org_path):
            repo_path = path.join(org_path, repo_dir)
            if path.isdir(repo_path) and path.exists(path.join(repo_path, '.git')):
                repo_paths.append(repo_path)
    return repo_paths


def _store_commits(commits_data: List['Commit'], parent_folder: str) -> None:
    """Add each commit, then store the files, commit files and hunks of it."""
    with ThreadPoolExecutor(max_workers=16) as commit_executor:
        # Keep one future per commit. Rebinding a single name inside the loop
        # would leave only the last commit to be followed up on.
        future_add_commit = [
            commit_executor.submit(Commit.add_commit, com) for com in commits_data
        ]
        for future_commit in as_completed(future_add_commit):
            try:
                com = future_commit.result()
                file_names = Commit.get_file_names_from_git(
                    path.join(parent_folder, com.org_name, com.repo_name), com.sha
                )
                process_files(file_names, com)
                create_cfs_and_hunks(file_names, com, parent_folder)
            except Exception as e:
                print(f"Error adding commit: {e}")


def process_all_commit_adjacent_data(parent_folder: str) -> None:
    """Store commits, files, commit files and hunks of all local repositories.

    Every ``parent_folder/org/repo`` directory that contains a ``.git`` entry
    is treated as a repository. Its commits are read with
    ``Commit.get_commit_data`` and added with ``Commit.add_commit``; for each
    added commit the touched files are stored with ``process_files`` and
    ``create_cfs_and_hunks``. Errors are printed and processing continues with
    the next commit or repository.

    Args:
        parent_folder (str) - The path to the folder containing all organizations.

    Returns:
        None
    """
    repo_paths = _find_git_repos(parent_folder)
    if not repo_paths:
        return

    with ThreadPoolExecutor(max_workers=3) as repo_executor:
        future_commits_from_repo = {
            repo_executor.submit(
                Commit.get_commit_data,
                # The discovered path already is parent_folder/org/repo, so it
                # is passed as-is rather than rebuilt by splitting on "\\",
                # which only works with Windows separators.
                repo,
                # NOTE(review): fixed date handed to get_commit_data; its
                # meaning (since/until) is defined there - confirm.
                datetime(2024, 9, 19, tzinfo=pytz.UTC),
            ): repo
            for repo in tqdm(repo_paths, total=len(repo_paths))
        }

        for future in tqdm(as_completed(future_commits_from_repo), total=len(future_commits_from_repo), desc="Processing Commits"):
            repo = future_commits_from_repo[future]
            try:
                commits_data: List['Commit'] = future.result()
                if not commits_data:
                    continue
                _store_commits(commits_data, parent_folder)
            except Exception as e:
                print(f"Error processing {repo}: {e}")


<div class="alert alert-info">
    <center><b>Extract all commits and populate the database with them</b></center>
</div>

In [None]:
# Run the whole pipeline on the folder ../download/orgs, resolved relative to
# the notebook's working directory.
process_all_commit_adjacent_data(path.join('..', 'download', 'orgs'))