The purpose of this notebook is to extract package information (including releases) from PyPI. 

In [1]:
import polars as pl
import requests
import tqdm

from datetime import datetime
from packaging.version import Version

In [2]:
# Tunable parameters of the extraction
N_PACKAGES = 3000  # how many of the most downloaded projects are fetched
EXCLUDE_BEFORE = pl.datetime(2023, 1, 1, time_zone='UTC')  # releases dated earlier are filtered out
EXCLUDE_PREPOST_RELEASES = True  # whether pre/post/dev releases are filtered out
MIN_RELEASES = 5  # minimum number of remaining releases a package must have
MIN_LIFETIME = pl.duration(days=365)  # minimum span between first and last remaining release

# Names of the N_PACKAGES projects with the highest download count
_packages_by_downloads = (
    pl.read_csv('../data-raw/top-pypi-packages.csv')
    .sort('download_count', descending=True)
    .head(N_PACKAGES)
)
PACKAGES = _packages_by_downloads['project'].to_list()

In [3]:
def get_releases_for(package, timeout=30):
    """Fetch every non-yanked distribution file of ``package`` from the PyPI JSON API.

    Args:
        package: Project name, inserted into ``https://pypi.org/pypi/<package>/json``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        A list with one dict per distribution file, with the keys ``package``
        (name reported in the response's ``info`` section), ``release``
        (version string), ``type`` (package type), ``date`` (upload time as a
        ``datetime``) and ``size`` (``int``).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (e.g. the project does not exist).
        requests.RequestException: On connection problems or timeouts.
        KeyError / ValueError: If the response lacks an expected field or a
            field cannot be converted.
    """
    url = 'https://pypi.org/pypi/{package}/json'.format(package=package)
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of an obscure JSON/KeyError later on.
    response.raise_for_status()
    payload = response.json()

    name = str(payload['info']['name'])
    releases = []
    for version, distributions in payload['releases'].items():
        for distribution in distributions:
            if distribution['yanked']:
                continue

            uploaded = str(distribution['upload_time_iso_8601'])
            # datetime.fromisoformat() only understands a trailing "Z" from
            # Python 3.11 on; the explicit UTC offset parses on older versions too.
            if uploaded.endswith('Z'):
                uploaded = uploaded[:-1] + '+00:00'

            releases.append({
                'package': name,
                'release': str(version),
                'type': str(distribution['packagetype']),
                'date': datetime.fromisoformat(uploaded),
                'size': int(distribution['size']),
            })
    return releases

In [4]:
# Collect the distributions of every package in PACKAGES. A package whose
# retrieval fails is reported (name and error) and skipped, so a single
# failure does not abort the whole run.
releases = []
for package in tqdm.tqdm(PACKAGES):
    try:
        releases += get_releases_for(package)
    except Exception as error:
        print(package, error)

100%|███████████████████████████████████████| 3000/3000 [05:11<00:00,  9.64it/s]


In [5]:
# One row per (package, release): the files of a release are gathered into a
# list of structs, and the release date is the latest upload date among them.
distribution_struct = pl.struct('type', 'date', 'size').alias('distribution')
release_date = pl.col('distribution').struct.field('date').max().alias('date')

df = (
    pl.from_dicts(releases)
    .select('package', 'release', distribution_struct)
    .group_by('package', 'release')
    .agg(release_date, pl.col('distribution').alias('distributions'))
    .sort('package', 'date')
)

In [6]:
# Size of the extracted dataset
n_packages = df.n_unique('package')
n_releases = len(df)
print(n_packages, 'packages,', n_releases, 'releases')

3000 packages, 232446 releases


In [7]:
# Persist the complete, unfiltered release table
releases_path = '../data/releases.parquet'
df.write_parquet(releases_path)

In [8]:
def _report(step, frame):
    """Print how many packages and releases remain in ``frame`` after ``step``."""
    print(step, frame.n_unique('package'), 'packages,', len(frame), 'releases')


def _is_final_release(version):
    """Tell whether ``version`` denotes a final release.

    Returns False for pre-releases, dev releases, post-releases and versions
    with a local segment. The decision is taken on the parsed version rather
    than by comparing strings, because ``Version`` normalises its input
    (e.g. "2023.01.05" becomes "2023.1.5"): a string comparison against
    ``base_version`` would wrongly discard final releases that are merely
    written in a non-canonical form.

    Version strings that cannot be parsed are treated as non-final instead of
    aborting the whole filter.
    """
    try:
        parsed = Version(version)
    except ValueError:  # packaging's InvalidVersion is a ValueError subclass
        return False
    # Version.is_prerelease is also True for dev releases.
    return not (parsed.is_prerelease or parsed.is_postrelease) and parsed.local is None


# Exclude older releases
df_selected = df.filter(pl.col('date') >= EXCLUDE_BEFORE)
_report('remove older:', df_selected)

# Exclude prereleases, postreleases, dev releases, etc.
# The per-row version parsing is skipped entirely when the option is disabled.
if EXCLUDE_PREPOST_RELEASES:
    df_selected = df_selected.filter(
        pl.col('release').map_elements(_is_final_release, return_dtype=pl.Boolean)
    )
_report('remove pre/post releases:', df_selected)

# Exclude packages whose number of releases is too low
df_selected = df_selected.filter(pl.count('release').over('package') >= MIN_RELEASES)
_report('remove few releases:', df_selected)

# Exclude packages whose (remaining) lifetime is too low
df_selected = df_selected.filter((pl.col('date').max() - pl.col('date').min()).over('package') >= MIN_LIFETIME)
_report('remove short lifetime:', df_selected)

remove older: 2624 packages, 94162 releases
remove pre/post releases: 2542 packages, 76534 releases
remove few releases: 1832 packages, 74835 releases
remove short lifetime: 1710 packages, 72471 releases


In [9]:
# Persist the filtered releases, ordered by package and release date
selected_sorted = df_selected.sort('package', 'date')
selected_sorted.write_parquet('../data/selected_releases.parquet')