# Release data from npm

This notebook requires the file produced by `Detect variants` (i.e., "data/variants.csv.gz") and downloads release information from the npm registry for every package listed in that file. The initial goal is to collect license information for these releases.

In [1]:
import pandas
import requests
import tqdm
import logging

from time import sleep

from multiprocessing import Pool

In [2]:
# URL template for a package's metadata document on the npm registry;
# `{package}` is substituted with the package name.
# Response format: https://github.com/npm/registry/blob/master/docs/responses/package-metadata.md
REGISTRY = 'https://registry.npmjs.org/{package}'

# Define how to retrieve the data, since its format has changed many times
def get_version(md):
    """Return the version string stored under the 'version' key of ``md``.

    Raises KeyError if the metadata has no 'version' entry.
    """
    version = md['version']
    return version

def _license_names(raw):
    """Return the license identifiers found in a raw license value.

    ``raw`` may be a string, an object with a 'type' key, or a list mixing
    both. Anything else (None, numbers, objects without a string 'type')
    is ignored.
    """
    entries = raw if isinstance(raw, (list, tuple)) else [raw]
    names = []
    for entry in entries:
        value = entry.get('type') if isinstance(entry, dict) else entry
        if isinstance(value, str):
            names.append(value)
    return names


def get_license(md):
    """Return the license of a release as a single string.

    The 'license' field is preferred; the older 'licenses' field is used
    only when 'license' yields nothing. Both fields are accepted as a
    string, an object with a 'type' key, or a list of either.

    Returns the identifier itself when there is exactly one, a
    parenthesised SPDX "OR" expression when there are several, and an
    empty string when no license could be found.
    """
    licenses = (
        _license_names(md.get('license'))
        or _license_names(md.get('licenses'))
    )

    # SPDX expression if more than one
    if len(licenses) > 1:
        return '(' + ' OR '.join(licenses) + ')'
    return ''.join(licenses)  # At most one item

def get_maintainers(md):
    """Return the maintainers of a release as a '; '-separated string.

    The 'maintainers' field may be a plain string (returned unchanged),
    a single object, or a list whose items are strings or objects with
    optional 'name' and 'email' keys. Objects are rendered as
    'name <email>', 'name' or '<email>' depending on which keys are set;
    items carrying neither are skipped.

    Returns None when the field is absent or null.
    """
    raw = md.get('maintainers')
    if raw is None:
        return None

    if isinstance(raw, str):
        return raw

    # A single maintainer object is treated like a one-item list
    if isinstance(raw, dict):
        raw = [raw]

    maintainers = []
    for item in raw:
        if isinstance(item, str):
            maintainers.append(item)
        elif isinstance(item, dict):
            name = item.get('name', None)
            email = item.get('email', None)
            if name is not None and email is not None:
                maintainers.append('{} <{}>'.format(name, email))
            elif name is not None:
                maintainers.append(name)
            elif email is not None:
                maintainers.append('<{}>'.format(email))

    return '; '.join(maintainers)

def get_repository(md):
    """Return the repository URL of a release, or None if there is none.

    The 'repository' field may be a URL string, an object with a 'url'
    key, or a list of either (only the first entry is used). A missing,
    null or empty value yields None.
    """
    repo = md.get('repository')
    if isinstance(repo, (list, tuple)):
        # Guard against an empty list before taking the first entry
        repo = repo[0] if repo else None

    if isinstance(repo, str):
        return repo
    if isinstance(repo, dict):
        return repo.get('url', None)
    return None
    

# Output column name -> extractor applied to each version's metadata.
# Insertion order matters: the column list of the result is built from these keys.
ACCESSORS = dict(
    version=get_version,
    license=get_license,
    maintainers=get_maintainers,
    repository=get_repository,
)

### Load data

In [3]:
# Load the variants file; its 'mainline' and 'variant' columns are used below
df_variants = pandas.read_csv('../data/variants.csv.gz')

In [4]:
# Every distinct name appearing in either the 'mainline' or the 'variant' column.
# pandas.concat replaces Series.append, which was removed in pandas 2.0.
packages = pandas.concat([df_variants['mainline'], df_variants['variant']]).drop_duplicates()

Load previously retrieved data from the output file, if it exists, so that an interrupted run can be resumed instead of starting over.

In [5]:
columns = ['package'] + list(ACCESSORS.keys())

try:
    df_data = pandas.read_csv('../data/releases_npm.csv.gz')
    # Compare as plain lists: `Index != list` is element-wise, and using that
    # array in an `if` always raises ValueError, which would silently discard
    # the cache on every run.
    if list(df_data.columns) != columns:
        raise ValueError('Cached file has unexpected columns')
except (FileNotFoundError, ValueError):
    # No cache, or a cache with a different layout: start from scratch
    df_data = pandas.DataFrame(columns=columns)

In [6]:
# Show the starting point: cached releases, or an empty frame if no usable cache was found
df_data

Unnamed: 0,package,version,license,maintainers,repository


### Retrieve data

In [7]:
def task(package, timeout=30):
    """Fetch the registry metadata of ``package`` and extract one row per version.

    The request is repeated every 5 seconds for as long as the registry
    answers 429. Any other non-200 status, and any network failure or
    timeout, is logged as an error and yields an empty result, so the
    package stays without rows in the collected data.

    Parameters:
        package: name of the package to look up.
        timeout: seconds to wait for the registry before giving up on
            the request; without it a stalled connection would block the
            worker forever.

    Returns:
        A list of dicts, one per version, holding the 'package' name and
        one entry for every key of ACCESSORS.
    """
    logging.info('Trying package {}'.format(package))

    data = []
    url = REGISTRY.format(package=package)

    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # Do not let one unreachable package abort the whole pool
            logging.error('Package {} raised {!r}.'.format(package, e))
            break

        if r.status_code == 429:
            logging.info('Retrying package {}.'.format(package))
            sleep(5)
            continue

        if r.status_code == 200:
            for md in r.json().get('versions', dict()).values():
                d = {k: v(md) for k, v in ACCESSORS.items()}
                d['package'] = package
                data.append(d)
        else:
            logging.error('Package {} lead to code {}.'.format(package, r.status_code))
        break

    return data

In [8]:
# Only query packages that have no row yet in the data loaded so far
remaining = list(set(packages).difference(df_data['package']))

print('Packages:', len(packages))
print('Retrieved:', len(df_data.drop_duplicates('package')))
print('Remaining:', len(remaining))

# Collect one frame per package and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing the frame inside
# the loop copies all rows on every iteration.
frames = [df_data]
try:
    with Pool() as pool:
        iterator = pool.imap_unordered(task, remaining, chunksize=10)

        for new_data in tqdm.tqdm(iterator, total=len(remaining), position=0):
            if new_data:  # Packages without any retrieved release add no rows
                frames.append(pandas.DataFrame(new_data, columns=columns))
finally:
    # Keep whatever was fetched, even if the loop is interrupted or fails
    df_data = pandas.concat(frames)


Packages: 23219
Retrieved: 0
Remaining: 23219


  7%|▋         | 1731/23219 [01:53<15:15, 23.47it/s] ERROR:root:Package @proux/node-updown lead to code 404.
  8%|▊         | 1961/23219 [02:07<13:23, 26.47it/s]ERROR:root:Package @dannyweapons/epy-scss lead to code 404.
 10%|█         | 2376/23219 [02:34<20:53, 16.63it/s]ERROR:root:Package @martinreiche/gatsby-source-firestore lead to code 404.
 15%|█▍        | 3471/23219 [03:44<20:40, 15.92it/s]ERROR:root:Package @metta/mt-v-menucircular-suspenso lead to code 404.
ERROR:root:Package @libertyware/model-form-builder lead to code 404.
 16%|█▌        | 3692/23219 [04:00<33:01,  9.85it/s]ERROR:root:Package @sethb0/koa-json lead to code 404.
 20%|██        | 4653/23219 [05:03<17:00, 18.19it/s]ERROR:root:Package @eeertekin/vue-cli-plugin-tailwind lead to code 404.
 24%|██▍       | 5521/23219 [06:05<15:48, 18.66it/s]ERROR:root:Package @zlook/vue-colorpicker lead to code 404.
 29%|██▊       | 6641/23219 [07:20<21:57, 12.59it/s]ERROR:root:Package @studiolabs/opentok-react-native lead to code 4

In [10]:
# Write the de-duplicated rows, without the index, to the gzip-compressed output file
df_data.drop_duplicates().to_csv(
    '../data/releases_npm.csv.gz',
    compression='gzip',
    index=False,
)

In [11]:
# Display the collected release data
df_data

Unnamed: 0,package,version,license,maintainers,repository
0,fetch-cookie,0.1.0,,valeriangalliat <val@codejam.info>,git+https://github.com/valeriangalliat/fetch-c...
1,fetch-cookie,0.1.1,Unlicense,valeriangalliat <val@codejam.info>,git+https://github.com/valeriangalliat/fetch-c...
2,fetch-cookie,0.1.2,Unlicense,valeriangalliat <val@codejam.info>,git+https://github.com/valeriangalliat/fetch-c...
3,fetch-cookie,0.1.3,Unlicense,valeriangalliat <val@codejam.info>,git+https://github.com/valeriangalliat/fetch-c...
4,fetch-cookie,0.1.4,Unlicense,valeriangalliat <val@codejam.info>,git+https://github.com/valeriangalliat/fetch-c...
...,...,...,...,...,...
11,webpack-watched-glob-entries-plugin,2.1.4,MIT,milanzor <milanvanas@gmail.com>,git+https://github.com/Milanzor/webpack-watche...
12,webpack-watched-glob-entries-plugin,2.1.5,MIT,milanzor <milanvanas@gmail.com>,git+https://github.com/Milanzor/webpack-watche...
0,@gouch/to-title-case,2.2.0,MIT,gouch <dgouch@gmail.com>,git+https://github.com/gouch/to-title-case.git
1,@gouch/to-title-case,2.2.1,MIT,gouch <dgouch@gmail.com>,git+https://github.com/gouch/to-title-case.git
