This notebook iterates over the repositories we cloned locally and detects whether Dependabot is configured, and whether it is configured for GitHub Actions.

In [1]:
import pandas as pd
import subprocess
import datetime
import shutil

# We use ThreadPoolExecutor since this notebook is I/O-bound, and multiprocessing fails
# because we call subcommands that override STDOUT (see the Python documentation for more info).
# from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

from pathlib import Path
from functools import partial

from tqdm import tqdm 

In [2]:
# Root directory holding the locally cloned repositories. `job` looks for each
# repository in a sub-directory named after it, with '/' replaced by '---'.
REPO_DIR = Path('/data/ghactions')

In [3]:
# Keep a single snapshot per repository: order the rows by their 'date' value
# and retain the last row seen for each repository.
snapshot_columns = ['repository', 'commit', 'date']
df_snapshots = (
    pd.read_csv('../data/workflow_files.csv.gz')
    .loc[:, snapshot_columns]
    .sort_values(by='date')
    .drop_duplicates(subset='repository', keep='last')
)

In [4]:
def checkout_snapshot(path, commit):
    """Force the ``.github`` directory of the clone at ``path`` to its state in ``commit``.

    Runs ``git checkout -f <commit> --no-overlay .github`` with ``path`` as the
    working directory. The command output is captured and discarded, and the
    exit status is not inspected, so a failed checkout is not reported.
    """
    git_args = ['git', 'checkout', '-f', commit, '--no-overlay', '.github']
    subprocess.run(git_args, cwd=path, capture_output=True, timeout=None)

In [5]:
def get_dependabot_ecosystems(path):
    """Return the package ecosystems declared in a repository's Dependabot config.

    Looks for ``.github/dependabot.yml`` and then ``.github/dependabot.yaml``
    under ``path`` and scans the first one found line by line. The file is not
    parsed as YAML.

    Parameters
    ----------
    path : pathlib.Path
        Root directory of the checked-out repository.

    Returns
    -------
    list of str or None
        For every line containing ``package-ecosystem:``, the stripped text
        that follows the key. The text is returned raw: surrounding quotes and
        trailing comments are kept, and commented-out lines are matched too.
        ``None`` is returned when neither configuration file exists.
    """
    for filename in ('dependabot.yml', 'dependabot.yaml'):
        try:
            # The context manager closes the handle as soon as the scan is done;
            # this function runs once per repository from a thread pool, so
            # leaving handles to the garbage collector can exhaust descriptors.
            # Undecodable bytes are replaced rather than aborting the whole run.
            with open(path / '.github' / filename, encoding='utf-8', errors='replace') as f:
                return [
                    line.split('package-ecosystem:', maxsplit=1)[-1].strip()
                    for line in f
                    if 'package-ecosystem:' in line
                ]
        except FileNotFoundError:
            # Try the next candidate file name.
            continue
    return None

In [6]:
def job(repo, commits):
    """Collect the Dependabot ecosystems of ``repo`` at each of the given commits.

    For every commit, the ``.github`` directory of the local clone is checked
    out and its Dependabot configuration is scanned. Commits without a
    configuration file contribute nothing.

    Returns a list of ``(repo, commit, ecosystem)`` tuples.
    """
    repo_path = REPO_DIR / repo.replace('/', '---')
    found = []

    for sha in commits:
        checkout_snapshot(repo_path, sha)
        names = get_dependabot_ecosystems(repo_path)
        if names is None:
            # No Dependabot configuration at this commit.
            continue
        found.extend((repo, sha, name) for name in names)
    return found

In [7]:
# Accumulators filled by the processing cell: `output` receives the result
# tuples, `done` the repositories already handled (used there to skip them).
output, done = [], []

In [8]:
# One (repository, [commits]) work item per repository not yet listed in
# `done`, so that re-running this cell only processes what is still missing.
commits_per_repo = (
    df_snapshots
    .groupby('repository', as_index=False)
    .agg(commits=('commit', list))
)
inputs = [
    (row.repository, row.commits)
    for row in commits_per_repo.itertuples()
    if row.repository not in done
]

with ThreadPoolExecutor() as pool:
    # pool.map takes one iterable per positional argument of `job`, hence the
    # transposition of the (repository, commits) pairs.
    jobs = pool.map(job, *zip(*inputs))

    # Results come back in submission order, so they line up with `inputs`.
    for (repo, _commits), job_results in tqdm(zip(inputs, jobs), total=len(inputs)):
        output.extend(job_results)
        done.append(repo)

100%|████████████████████████████████████| 22758/22758 [01:25<00:00, 266.23it/s]


In [9]:
# Turn the collected (repository, commit, ecosystem) tuples into a table with
# one row per ecosystem entry found.
result_columns = ['repository', 'commit', 'ecosystem']
df = pd.DataFrame(data=output, columns=result_columns)

In [10]:
# Share of the analysed repositories that have a Dependabot configuration.
# The denominator is taken from df_snapshots rather than len(inputs): `inputs`
# only lists the repositories that were still to be processed, so it shrinks
# (down to zero) whenever the processing cell is re-run.
n_repositories = df_snapshots.repository.nunique()
n_dependabot = df.repository.nunique()
print(n_dependabot, 'repositories using Dependabot', n_dependabot / n_repositories)

1129 repositories using Dependabot 0.04960892872835926


In [11]:
# Repositories whose Dependabot configuration declares the github-actions
# ecosystem. A substring match is used because the extracted values are raw
# text and may still carry quotes or trailing comments.
# A descriptive name is used instead of `_`, which IPython reserves for the
# last cell output.
gha_repositories = (
    df
    [lambda d: d.ecosystem.str.contains('github-actions')]
    .repository
    .unique()
)
# Denominator taken from df_snapshots rather than len(inputs): `inputs` only
# lists repositories still to be processed and shrinks when the processing
# cell is re-run.
n_repositories = df_snapshots.repository.nunique()
print(len(gha_repositories), 'having configured it for GHA.', len(gha_repositories) / n_repositories, len(gha_repositories) / df.repository.nunique())

698 having configured it for GHA. 0.030670533438790754 0.6182462356067316
