This notebook iterates over the locally cloned repositories, materializes monthly snapshots, and extracts all the GitHub Actions (GHA) workflows.

In [1]:
import pandas as pd
import subprocess
import datetime
import shutil

# We use ThreadPoolExecutor since this notebook is I/O-bound, and multiprocessing fails
# because we call subcommands that override STDOUT (see Python documentation for more info).
# from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

from pathlib import Path
from functools import partial
from hashlib import sha256

from tqdm import tqdm 

In [2]:
# Path to repositories
# (each repository is looked up in a sub-folder named after it, with '/' replaced by '---')
REPO_DIR = Path('/data/ghactions/repositories')

# Path to workflow folder
# (extracted workflow files are stored here as '<sha256>.yaml')
WORKFLOW_DIR = Path('/data/ghactions/workflows')

# Snapshot dates
# ('MS' = month start: the first day of each month from 2019-11 to 2022-09, both included)
DATES = pd.date_range(start='2019-11', end='2022-09', freq='MS')

In [3]:
# Candidate repositories. REPO_DIR is scanned later on to find out which of them
# were actually cloned and have a non-empty .github/workflows/ folder.

# Mapping from the column names of the raw CSV file to the names used in this notebook.
# Only these columns are kept, in this order.
FIELDS = {
    'name': 'repository',
    'defaultBranch': 'branch',
    'mainLanguage': 'language',
    'createdAt': 'created',
    'lastCommit': 'updated',
    'lastCommitSHA': 'commit',
    'stargazers': 'stars',
    'watchers': 'watchers',
    'forks': 'forks',
    'size': 'size',
    'branches': 'branches',
    'commits': 'commits',
    'contributors': 'contributors',
    'totalIssues': 'issues',
    'totalPullRequests': 'prs',
}

df_input = (
    pd.read_csv('../data-raw/repositories.csv')
    .loc[:, list(FIELDS)]
    .rename(columns=FIELDS)
)

In [4]:
def commit_for_date(path, branch, date):
    """
    Return the sha and date of the latest commit (from given branch) before given date.

    Parameters:
        path: folder of the git repository (used as working directory for git).
        branch: branch (or any other revision) whose history is searched.
        date: object providing `strftime` (e.g. a pandas Timestamp or a
            `datetime.date`); only its year, month and day are used.

    Returns:
        A `(sha, commit_date)` tuple, `commit_date` being a `datetime.date`, or
        `(None, None)` if git printed nothing (no commit before the date, or the
        command failed).
    """
    # Pin the cutoff to midnight: when the time of day is omitted, git's date
    # parser fills it in from the current clock, so the selected commit would
    # depend on the moment at which this function is executed.
    cutoff = date.strftime('%Y-%m-%d 00:00:00')
    result = subprocess.run(
        ['git', 'rev-list', '-n', '1', f'--before={cutoff}', branch, '--format=%H %cs'],
        cwd=path,
        capture_output=True,
        timeout=None,
    )

    lines = result.stdout.decode().strip().splitlines()
    if not lines:
        return None, None

    # With --format, rev-list prints a "commit <sha>" header followed by the
    # formatted line "<sha> <YYYY-MM-DD>"; the formatted line is the last one.
    commit, commit_day = lines[-1].split(' ')
    return commit, datetime.date.fromisoformat(commit_day)

In [5]:
def workflow_filenames(path):
    """
    List the workflow files of the repository located at given path.

    Only the entries of `.github/workflows` whose suffix is `.yaml` or `.yml`
    are considered. The returned names are relative to that folder, and the
    list is empty when the folder does not exist.
    """
    workflow_dir = path / '.github/workflows'
    if not workflow_dir.exists():
        return []

    return [
        entry.name
        for entry in workflow_dir.iterdir()
        if entry.suffix in ('.yaml', '.yml')
    ]

In [6]:
def sha_for_file(path):
    """
    Return a sha-256 hash for given file.

    The file is read in binary mode and fed to the hash chunk by chunk, so it
    is never loaded in memory as a whole. The result is the hexadecimal digest.
    """
    # 64 KiB per read: the digest does not depend on the chunk size, and a
    # larger buffer avoids one loop iteration per 64 bytes of input.
    chunk_size = 64 * 1024

    file_hash = sha256()
    with open(path, 'rb') as f:
        # iter() with a sentinel stops as soon as read() returns b'' (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            file_hash.update(chunk)

    return file_hash.hexdigest()

In [7]:
def checkout_workflows(path, commit):
    """
    Restore the `.github` folder of given repository as of given commit.

    The checkout is forced (`-f`) and run with `--no-overlay`, which asks git to
    also remove tracked files that are not part of the commit. The outcome of
    the git command is not inspected and nothing is returned.
    """
    git_args = ['git', 'checkout', '-f', commit, '--no-overlay', '.github']
    subprocess.run(git_args, cwd=path, capture_output=True, timeout=None)

In [8]:
def copy_workflows(path, output_dir):
    """
    Copy the workflows available in given repository to the output directory.
    Each workflow will receive a new filename corresponding to its hash.
    Return a list of (filename, hash).

    Parameters:
        path: folder of the repository, whose `.github/workflows` is read.
        output_dir: folder receiving the copies, named `<sha256>.yaml`. It is
            created (with its parents) the first time a file has to be written.

    A workflow whose hash is already present in the output directory is not
    copied again, but is still reported in the returned list.
    """
    results = []

    for filename in workflow_filenames(path):
        filepath = path / '.github/workflows' / filename
        sha = sha_for_file(filepath)

        output_file = output_dir / (sha + '.yaml')
        if not output_file.exists():
            # Create the target folder on demand so that a missing output
            # directory does not make the copy fail with FileNotFoundError.
            output_dir.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(filepath, output_file)

        results.append((filename, sha))

    return results

In [9]:
def job(repo, branch, dates):
    """
    Extract the workflows of given repository for each of the given dates.

    Parameters:
        repo: repository name; its folder is looked up in REPO_DIR after
            replacing '/' by '---'.
        branch: branch whose history is used to materialize the snapshots.
        dates: reversible sequence of snapshot dates, in chronological order.

    Returns:
        None if the repository folder does not exist or if it has no workflow
        file once `.github` is checked out from `branch`. Otherwise, a dict
        mapping each date having a commit to a dict with keys 'commit',
        'commit_date' and 'workflows' (a list of (filename, hash)).

    Workflow files are copied to WORKFLOW_DIR as a side effect.
    """
    path = REPO_DIR / repo.replace('/', '---')

    # Check that repository exists
    if not path.exists():
        return None

    # Check repository has workflows in its latest commit
    checkout_workflows(path, branch)
    if len(workflow_filenames(path)) == 0:
        return None

    output = dict()
    previous_commit, previous_workflows = None, None

    for date in reversed(dates):
        commit, commit_date = commit_for_date(path, branch, date)
        # Is there a commit for given date?
        if commit is None:
            break  # Dates are in reversed order!

        if commit == previous_commit:
            # Same commit as the snapshot handled just before: `.github` is
            # already in that state and its files were already copied, so the
            # checkout and the hashing can be skipped.
            workflows = list(previous_workflows)
        else:
            # Checkout repository
            checkout_workflows(path, commit)

            # Copy workflows
            workflows = copy_workflows(path, WORKFLOW_DIR)
            previous_commit, previous_workflows = commit, workflows

        output[date] = {
            'commit': commit,
            'commit_date': commit_date,
            'workflows': workflows
        }

    return output

In [12]:
# Accumulators filled by the extraction cell that follows.
# NOTE(review): presumably kept in a separate cell so that the extraction cell can be
# re-run (e.g. after an interruption) without losing what was already collected.
output = []  # (repository, snapshots) pairs, for repositories for which job() returned data
done = []  # names of the repositories already processed; they are skipped by the extraction cell

In [None]:
# Repositories processed during a previous (partial) execution of this cell are skipped.
# A set is used for the membership test: with tens of thousands of repositories,
# looking each of them up in the `done` list would be quadratic.
already_done = set(done)
inputs = [(x.repository, x.branch, DATES) for x in df_input.itertuples() if x.repository not in already_done]

with ThreadPoolExecutor() as pool:
    # zip(*inputs) transposes the (repository, branch, dates) triples into the three
    # parallel iterables expected by pool.map.
    jobs = pool.map(job, *zip(*inputs))

    # pool.map yields the results in submission order, so they can be paired with inputs.
    for (repo, _, _), workflows in tqdm(zip(inputs, jobs), smoothing=0, miniters=1, total=len(inputs)):
        if workflows is not None:
            output.append((repo, workflows))
        done.append(repo)

  0%|                                                 | 0/64008 [00:00<?, ?it/s]

In [16]:
# Flatten the nested results: one row per (repository, snapshot date, workflow file).
data = [
    (repo, date, snapshot['commit'], snapshot['commit_date'], filepath, sha)
    for repo, snapshots in output
    for date, snapshot in snapshots.items()
    for filepath, sha in snapshot['workflows']
]

# The 'workflow' column holds the hash of the workflow file.
df_workflows = pd.DataFrame(data=data, columns=['repository', 'date', 'commit', 'commit_date', 'filename', 'workflow'])

In [17]:
# Persist the table as a gzip-compressed CSV file, without the DataFrame index.
df_workflows.to_csv('../data/workflow_files.csv.gz', compression='gzip', index=False)