See the `README` file for more information about the process performed by this notebook.

In [1]:
import pandas as pd
import requests
from tqdm import tqdm 

import subprocess
import shutil
from multiprocessing import Pool
from pathlib import Path
from functools import partial

In [2]:
# Base folder under which every repository gets its own sub-folder
# (absolute path on the machine running this notebook).
WORKING = Path('/data/ghactions')
# Number of worker processes used by the multiprocessing pool below.
WORKERS = 40

In [3]:
# Build the list of [name, defaultBranch] pairs to process, sorted by name.
# Very large repositories are left out, for disk space purposes
# (the unit of `size` is not shown here -- presumably KB; confirm against the CSV).
repositories = (
    pd.read_csv('../data-raw/repositories.csv')
    .loc[lambda df: df['size'] <= 500 * 1024, ['name', 'defaultBranch']]
    .sort_values(by='name')
    .to_numpy()
    .tolist()
)

In [4]:
# Report how many repositories remain after the size filter.
print(f'There are {len(repositories)} repositories.')

There are 62673 repositories.


Let's define a function that will do most of the job for a given repository and its default branch. Inline comments explain the process. 

In [5]:
def download_workflows(base_path: Path, repo: str, branch: str, skip_existing=False, timeout=None):
    """
    Download the GitHub workflow files for given repository.

    A folder named after the repository is created under ``base_path``. If the
    repository has no ``.github/workflows`` folder on the given branch (GitHub
    answers 404), the folder is removed and nothing is downloaded. Otherwise a
    git repository is initialised in the folder and a sparse checkout limited
    to ``.github/workflows`` is pulled from GitHub.

    Whenever a step fails after the folder has been created, the folder is
    removed before the exception is propagated, so that no half-initialised
    repository is left on disk.

    :param base_path: target path for downloading the repository.
    :param repo: name of the repository on GitHub (e.g. ``owner/name``).
    :param branch: name of the branch to consider.
    :param skip_existing: if True, an already existing repository folder is
        reused instead of raising ``FileExistsError``.
    :param timeout: timeout (in seconds) applied to the HTTP request and to
        each individual git command. ``None`` means no limit.
    :raises FileExistsError: if the folder exists and ``skip_existing`` is False.
    :raises subprocess.CalledProcessError: if ``git pull`` exits with an error.
    :raises subprocess.TimeoutExpired: if a git command exceeds ``timeout``.
    """
    # Create repository folder ('/' cannot be part of a folder name)
    path = base_path / repo.replace('/', '---')
    try:
        path.mkdir(parents=True)
    except FileExistsError:
        if not skip_existing:
            raise

    try:
        # Check if given repository has a .github/workflows folder.
        # The timeout prevents a stalled connection from blocking a worker forever.
        url = f'https://github.com/{repo}/tree/{branch}/.github/workflows'
        r = requests.head(url, timeout=timeout)
        if r.status_code == 404:
            # Nothing to download: remove folder
            shutil.rmtree(path)
            return

        # Quick helper: run a command inside the repository folder
        cmd = partial(subprocess.run, cwd=path, capture_output=True, timeout=timeout)

        # Initialize git repository
        cmd(['git', 'init'])

        # Configure git repository. Return codes are deliberately not checked
        # here: when the folder is reused, `git remote add` fails because the
        # remote already exists, which is harmless.
        cmd(['git', 'remote', 'add', 'origin', 'https://github.com/' + repo])
        cmd(['git', 'config', 'core.sparsecheckout', 'true'])
        (path / '.git/info/sparse-checkout').write_text('.github/workflows\n')

        # Pull default branch
        cmd(['git', 'pull', 'origin', branch]).check_returncode()
    except Exception:
        # Remove folder on any failure; ignore_errors keeps the original
        # exception from being masked by a clean-up problem.
        shutil.rmtree(path, ignore_errors=True)
        raise

Let's define a thin wrapper to handle errors.

In [6]:
def job(repository):
    """
    Pool-friendly wrapper around `download_workflows` that never raises on a
    download error.

    :param repository: a (repo, branch) pair.
    :return: (repo, branch) on success, (repo, branch, exception) on failure.
    """
    repo, branch = repository
    try:
        download_workflows(WORKING, repo, branch, skip_existing=True, timeout=None)
    except Exception as error:
        # Hand the error back to the parent process instead of raising
        return repo, branch, error
    else:
        return repo, branch

In [None]:
# Results are appended one by one so that whatever was collected so far
# remains available in `output` if the cell is interrupted.
output = []
# Work on a shallow copy of the repository list.
inputs = repositories[:]

# Run `job` over all repositories with WORKERS processes; results are
# yielded in completion order, not in input order.
with Pool(processes=WORKERS) as pool:
    jobs = pool.imap_unordered(job, inputs)
    # `total` is given explicitly because the lazy iterator has no length.
    for r in tqdm(jobs, total=len(inputs)):
        output.append(r)

  1%|▎                                   | 600/62673 [08:18<15:47:28,  1.09it/s]

In [9]:
# A failed job is reported by `job` as a 3-tuple (repo, branch, exception),
# a successful one as a 2-tuple (repo, branch).
failures = [result for result in output if len(result) == 3]
print(f'There are {len(failures)} failing repositories out of {len(output)}.')
# One failure per line
print(*failures, sep='\n')

There are 173 failing repositories out of 62673.
('0zz4r/bashflix', 'master', CalledProcessError(1, ['git', 'pull', 'origin', 'master'], b'', b'fatal: impossible de trouver la r\xc3\xa9f\xc3\xa9rence distante master\n'))
('aeb-labs/cruddl', 'master', CalledProcessError(1, ['git', 'pull', 'origin', 'master'], b'', b'fatal: impossible de trouver la r\xc3\xa9f\xc3\xa9rence distante master\n'))
('agile-geoscience/striplog', 'master', CalledProcessError(1, ['git', 'pull', 'origin', 'master'], b'', b'fatal: impossible de trouver la r\xc3\xa9f\xc3\xa9rence distante master\n'))
('agile-geoscience/bruges', 'master', CalledProcessError(1, ['git', 'pull', 'origin', 'master'], b'', b'fatal: impossible de trouver la r\xc3\xa9f\xc3\xa9rence distante master\n'))
('aicoe/prometheus-api-client-python', 'master', CalledProcessError(1, ['git', 'pull', 'origin', 'master'], b'', b'fatal: impossible de trouver la r\xc3\xa9f\xc3\xa9rence distante master\n'))
('akabekobeko/npm-icon-gen', 'master', CalledProce