<div class="alert alert-info">
    <center><b>Set up Notebook</b></center>
</div>

In [None]:
%pip install pandas tqdm python-dotenv

In [5]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import pytz
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
from os import path, listdir
from sys import path as sys_path
# Put the directory above the current working directory on sys.path
# (presumably so the `models` package imported right after this resolves).
parent_dir = path.abspath(path.join('..'))
if parent_dir not in sys_path:
    sys_path.append(parent_dir)
    # path.basename is separator-agnostic, so the directory name is reported
    # correctly on POSIX as well as Windows. It also avoids a backslash inside
    # an f-string expression, which is a SyntaxError before Python 3.12.
    print(f"Added {path.basename(parent_dir)} to sys.path")
from models.commit import Commit
from models.repository import Repository

<div class="alert alert-info">
    <center><b>Define the commit-processing function</b></center>
</div>

In [8]:
def _find_git_repos(parent_folder: str) -> list[tuple[str, str, str]]:
    """Return (org, repo_name, repo_path) for each git repo laid out as parent_folder/<org>/<repo>."""
    found = []
    # sorted() makes the submission order deterministic; listdir order is arbitrary.
    for org in sorted(listdir(parent_folder)):
        org_path = path.join(parent_folder, org)
        if not path.isdir(org_path):
            continue
        for repo_name in sorted(listdir(org_path)):
            repo_path = path.join(org_path, repo_name)
            # Only directories that contain a `.git` entry count as repositories.
            if path.isdir(repo_path) and path.exists(path.join(repo_path, '.git')):
                found.append((org, repo_name, repo_path))
    return found


def test_process_all_commits(
    parent_folder: str,
    reference_date: "datetime | None" = None,
    repo_workers: int = 3,
    commit_workers: int = 16,
) -> None:
    """Processes all repositories in a parent folder and adds all the commits from them.

    Repositories are expected two levels below ``parent_folder``
    (``parent_folder/<org>/<repo>``) and are recognised by a ``.git`` entry.
    For each one, ``Commit.get_commit_data`` is called on a thread pool with the
    repository path, its ``https://github.com/<org>/<repo>`` URL and
    ``reference_date``. Every item it returns is then passed to
    ``Commit.add_commit`` on a second pool.

    Failures are reported with ``print`` and do not stop the run: a failing
    repository is skipped, a failing commit is skipped.

    Args:
        parent_folder (str) - The path to the folder containing all repositories.
        reference_date (datetime | None) - Forwarded unchanged as the third argument
            of ``Commit.get_commit_data`` (its exact meaning is defined there).
            Defaults to 2024-09-19 00:00 UTC, the value previously hard-coded.
        repo_workers (int) - Threads used to extract commit data from repositories.
        commit_workers (int) - Threads used to add individual commits.

    Returns:
        None

    Raises:
        OSError - If ``parent_folder`` (or one of its sub-directories) cannot be listed.
        ValueError - If a worker count is not positive (raised by ThreadPoolExecutor).
    """
    if reference_date is None:
        reference_date = datetime(2024, 9, 19, tzinfo=pytz.UTC)

    # The org / repo names come straight from the directory scan, so no path
    # string has to be split on an OS-specific separator afterwards.
    repos = _find_git_repos(parent_folder)

    # One commit pool is shared by all repositories instead of building a new
    # 16-thread pool per repository; distinct names avoid shadowing.
    with ThreadPoolExecutor(max_workers=repo_workers) as repo_pool, \
            ThreadPoolExecutor(max_workers=commit_workers) as commit_pool:
        repo_futures = {
            repo_pool.submit(
                Commit.get_commit_data,
                repo_path,
                f"https://github.com/{org}/{repo_name}",
                reference_date,
            ): repo_path
            for org, repo_name, repo_path in tqdm(repos, total=len(repos), desc="Processing Repositories")
        }

        for repo_future in tqdm(as_completed(repo_futures), total=len(repo_futures), desc="Processing Commits"):
            repo_path = repo_futures[repo_future]
            try:
                commits_data = repo_future.result()
                commit_futures = [
                    commit_pool.submit(Commit.add_commit, commit_data)
                    for commit_data in commits_data
                ]
            except Exception as e:
                # Best-effort: report the failing repository and carry on with the rest.
                print(f"Error processing {repo_path}: {e}")
                continue

            # Wait for this repository's commits before moving on to the next one.
            for commit_future in as_completed(commit_futures):
                try:
                    commit_future.result()
                except Exception as e:
                    print(f"Error adding commit: {e}")


<div class="alert alert-info">
    <center><b>Extract all commits and populate the database with them</b></center>
</div>

In [10]:
# Folder holding the downloaded organisations, relative to the notebook's working directory.
orgs_root = path.join('..', 'download', 'orgs')
test_process_all_commits(orgs_root)

Processing Commits: 100%|██████████| 343/343 [19:45<00:00,  3.46s/it]
