This notebook queries the GitHub Marketplace pages to retrieve metadata about the actions used in workflows. It requires `data/steps.csv.gz` and generates `data/actions.csv.gz`.

In [1]:
import pandas as pd
import requests
from tqdm import tqdm 

from bs4 import BeautifulSoup

from functools import partial

In [2]:
# URL template of a Marketplace page; `{action}` is filled with the action name.
GH_URL = 'https://github.com/marketplace/actions/{action}'

# CSS selector for the element holding the action's display name (text content).
NAME_PATH = 'h1.f1'


def REPO_PATH(s):
    """Return the repository name shown on a parsed Marketplace page.

    Finds the `<h5>` whose text is "Links", takes the next `<a>` sibling
    and returns its text with surrounding whitespace removed.
    """
    links_heading = s.find('h5', text='Links')
    repo_link = links_heading.find_next_sibling('a')
    return repo_link.text.strip()


# CSS selector for the category tags (text content, one or two items).
CATEGORY_PATH = '.topic-tag'
# CSS selector for the star counter (the count is read from its text/attributes).
STARS_PATH = '.js-social-count'
# CSS selector for the listed versions (text content, multiple items).
VERSION_PATH = '.select-menu-item-heading'

Prepare the list of potential action names that will be retrieved from the marketplace.

In [3]:
# Build the array of candidate action names to look up on the Marketplace.
# Each distinct `uses` value is reduced to the text between its first '/'
# and its first '@' (e.g. 'owner/name@ref' -> 'name').
actions = (
    pd.read_csv('../data/steps.csv.gz')
    .drop_duplicates('uses')
    [lambda d: d.uses.notna()]
    # Docker images and local (./) actions have no Marketplace page.
    [lambda d: ~d.uses.str.startswith(('docker://', './'))]
    # Keep everything after the first '/' ...
    .assign(action=lambda d: d.uses.str.split('/', n=1).str[1])
    # ... and drop the '@<ref>' suffix.
    .assign(action=lambda d: d.action.str.split('@', n=1).str[0])
    .action
    # A `uses` value without any '/' yields NaN above; without this step the
    # NaN would be kept as a candidate and queried as the literal page "nan".
    .dropna()
    .drop_duplicates()
    .values
)

In [4]:
def extract_html(action, timeout=30):
    """Scrape the GitHub Marketplace page of ``action`` and return its metadata.

    Parameters
    ----------
    action : str
        Value substituted for the ``{action}`` placeholder of ``GH_URL``.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30). Prevents a single stalled connection from blocking
        the whole scraping run.

    Returns
    -------
    dict or None
        ``None`` if the page answers with HTTP 404. Otherwise a dict with the
        keys ``action``, ``name``, ``repository``, ``category``,
        ``category_secondary``, ``stars`` (int) and ``versions`` (list of str).
        ``category`` / ``category_secondary`` are ``None`` when the page has
        fewer than one / two category tags.

    Raises
    ------
    requests.HTTPError
        For any error status other than 404, so that an error page is never
        parsed as if it were an action page.
    IndexError, AttributeError
        If the page does not contain the expected name, star-count or
        "Links" elements.
    """
    response = requests.get(GH_URL.format(action=action), timeout=timeout)
    if response.status_code == 404:
        # No Marketplace page under that name.
        return None
    # Fail loudly on every other HTTP error instead of hitting an opaque
    # IndexError further down while parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    categories = [tag.text.strip() for tag in soup.select(CATEGORY_PATH)]

    # The count is read from the `title` attribute; keep only its digits so
    # that separators or surrounding words do not break the conversion.
    stars_title = soup.select(STARS_PATH)[0]['title']
    digits = ''.join(c for c in stars_title if c.isdigit())

    return {
        'action': action,
        # Stripped for consistency with `repository` and the categories.
        'name': soup.select(NAME_PATH)[0].text.strip(),
        'repository': REPO_PATH(soup),
        'category': categories[0] if categories else None,
        # `> 1` (not `== 2`): keep the second tag even if more than two match.
        'category_secondary': categories[1] if len(categories) > 1 else None,
        # Fall back to 0 when the title holds no digit at all.
        'stars': int(digits) if digits else 0,
        'versions': [e.text for e in soup.select(VERSION_PATH)],
    }

In [5]:
# Accumulators shared with the scraping loop: `output` collects the metadata
# dicts, `done` the action names already processed. Re-running this cell
# resets both, i.e. discards any scraping progress.
output, done = [], []

In [6]:
# Resumable scraping loop: only actions not yet listed in `done` are queried,
# so this cell can be re-run after an interruption without repeating requests.
# A set snapshot makes the membership test O(1) per candidate instead of
# scanning the whole `done` list each time.
already_done = set(done)
inputs = [a for a in actions if a not in already_done]

for action in tqdm(inputs):
    result = extract_html(action)

    # `None` means the action has no Marketplace page; nothing to store.
    if result is not None:
        output.append(result)

    # Recorded only once `extract_html` returned, so an action whose request
    # raised is retried on the next run of this cell.
    done.append(action)

100%|███████████████████████████████████████| 3828/3828 [16:59<00:00,  3.75it/s]


In [7]:
# Report how many of the candidate names had a Marketplace page.
summary = f'{len(output)} actions found out of {len(actions)}.'
print(summary)

1011 actions found out of 3828.


In [8]:
# Tabulate the scraped records: one row per action, indexed by action name,
# with the metadata columns in a fixed order.
metadata_columns = ['name', 'repository', 'category', 'category_secondary', 'stars', 'versions']
df_actions = pd.DataFrame(output).set_index('action')[metadata_columns]

In [9]:
# Persist the table as a gzip-compressed CSV (the index, i.e. the action
# name, is written as the first column).
actions_csv_path = '../data/actions.csv.gz'
df_actions.to_csv(actions_csv_path, compression='gzip')