The purpose of this notebook is to extract package information (including releases) from PyPI. 

In [20]:
import polars as pl
import requests
import tqdm

from datetime import datetime
from packaging.version import Version

In [22]:
# How many of the top-ranked projects to keep, and the cutoff used further
# down in the notebook to drop older releases.
N_PACKAGES = 2000
EXCLUDE_BEFORE = pl.datetime(2020, 1, 1, time_zone='UTC')

# Project names of the N_PACKAGES most downloaded packages, highest
# download count first.
PACKAGES = (
    pl.read_csv('../data-raw/top-pypi-packages.csv')
    .sort('download_count', descending=True)
    .head(N_PACKAGES)['project']
    .to_list()
)

In [12]:
def _parse_upload_time(value):
    """Parse a PyPI ``upload_time_iso_8601`` string into an aware datetime."""
    text = str(value)
    # datetime.fromisoformat only accepts a trailing "Z" from Python 3.11 on;
    # spelling the UTC offset out keeps older interpreters working and yields
    # the same value on newer ones.
    if text.endswith('Z'):
        text = text[:-1] + '+00:00'
    return datetime.fromisoformat(text)


def get_releases_for(package, timeout=30):
    """Fetch every uploaded distribution file of a package from PyPI.

    Queries ``https://pypi.org/pypi/<package>/json`` and flattens the
    ``releases`` mapping of the response into one record per file.

    Parameters
    ----------
    package : str
        Project name to insert into the PyPI JSON API URL.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so an unresponsive server cannot stall
        the caller indefinitely. Defaults to 30 seconds.

    Returns
    -------
    list of dict
        One dict per distribution file with the keys ``package`` (name as
        reported by PyPI), ``release`` (version string), ``type``
        (``packagetype`` of the file), ``date`` (upload time as a
        ``datetime``) and ``size`` (int). Releases that have no files
        contribute no entries.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. a 404).
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    KeyError, ValueError
        If the payload lacks an expected field or a value cannot be parsed.
    """
    response = requests.get(
        'https://pypi.org/pypi/{package}/json'.format(package=package),
        timeout=timeout,
    )
    # Fail with an explicit HTTP error instead of a confusing KeyError on the
    # error payload.
    response.raise_for_status()
    payload = response.json()

    return [
        {
            'package': str(payload['info']['name']),
            'release': str(version),
            'type': str(distribution['packagetype']),
            'date': _parse_upload_time(distribution['upload_time_iso_8601']),
            'size': int(distribution['size']),
        }
        for version, distributions in payload['releases'].items()
        for distribution in distributions
    ]

In [13]:
# Collect the distribution records of every selected package. A package
# whose lookup fails is reported and skipped so the run can continue.
releases = []
for package in tqdm.tqdm(PACKAGES):
    try:
        releases.extend(get_releases_for(package))
    except Exception as error:
        print(package, error)

100%|███████████████████████████████████████| 2000/2000 [04:16<00:00,  7.79it/s]


In [14]:
# One row per (package, release): the release date is the latest upload time
# among its files, and the files themselves are kept as a list of structs.
df = (
    pl.from_dicts(releases)
    .select(
        pl.col('package'),
        pl.col('release'),
        pl.struct('type', 'date', 'size').alias('distribution'),
    )
    .group_by(['package', 'release'])
    .agg(
        pl.col('distribution').struct.field('date').max().alias('date'),
        pl.col('distribution').alias('distributions'),
    )
    .sort(['package', 'date'])
)

In [16]:
# Quick sanity check on how much data was collected.
print(f"{df.n_unique('package')} packages, {df.height} releases")

2000 packages, 163979 releases


In [17]:
# Persist every collected release; no date or pre-release filter has been
# applied to `df` at this point.
df.write_parquet('../data/releases.parquet')

In [26]:
# Step 1: keep only releases dated on or after the EXCLUDE_BEFORE cutoff.
df_selected = df.filter(pl.col('date') >= EXCLUDE_BEFORE)

print(f"{df_selected.n_unique('package')} packages, {df_selected.height} releases")

# Step 2: drop releases whose version string parses as a pre-release.
df_selected = df_selected.filter(
    ~pl.col('release').map_elements(
        lambda version: Version(version).is_prerelease,
        return_dtype=pl.Boolean,
    )
)

print(f"{df_selected.n_unique('package')} packages, {df_selected.height} releases")

1940 packages, 118777 releases
1900 packages, 96831 releases


In [28]:
# Persist the filtered subset: releases on or after EXCLUDE_BEFORE with
# pre-releases removed.
df_selected.write_parquet('../data/selected_releases.parquet')