The purpose of this notebook is to extract package information (including releases) from PyPI. We start from a list of the top-15K downloaded Python packages on PyPI during November 2025. For each package, we retrieve its metadata directly from PyPI and we store its list of releases. 

In [1]:
import polars as pl
import requests
import tqdm

from datetime import datetime
from packaging.version import Version

In [2]:
N_PACKAGES = 15_000

# Get most downloaded packages
PACKAGES = (
    pl.read_csv('../data-raw/top-pypi-packages.csv')
    .sort(pl.col('download_count'), descending=True)
    .limit(N_PACKAGES)
    .get_column('project')
    .to_list()
)

In [3]:
def get_releases_for(package):
    r = requests.get('https://pypi.org/pypi/{package}/json'.format(package=package)).json()

    releases = []
    for version, distributions in r['releases'].items():
        for distribution in distributions: 
            if not distribution['yanked']: 
                releases.append({
                    'package': str(r['info']['name']),
                    'release': str(version),
                    'type': str(distribution['packagetype']),
                    'date': datetime.fromisoformat(str(distribution['upload_time_iso_8601'])), 
                    'size': int(distribution['size']),
                })
    return releases

In [4]:
# Expect around 30 minutes for executing this cell for ~15K packages
# We do not parallelize this cell to avoid saturating PyPI API
releases = []
for package in tqdm.tqdm(PACKAGES):
    try:
        temp = get_releases_for(package)
        releases.extend(temp)
    except Exception as e:
        print(package, repr(e))

 20%|███████▌                              | 3002/15000 [05:55<30:01,  6.66it/s]

aaaaaaaaa KeyError('releases')


 38%|██████████████▍                       | 5712/15000 [12:09<22:58,  6.74it/s]

pycobaltix KeyError('releases')


 91%|█████████████████████████████████▌   | 13597/15000 [30:50<03:18,  7.08it/s]

umap KeyError('releases')


100%|█████████████████████████████████████| 15000/15000 [34:09<00:00,  7.32it/s]


In [5]:
df = (
    pl.from_dicts(releases)
    .select('package', 'release', distribution=pl.struct(type='type', date='date', size='size'))
    .group_by('package', 'release')
    .agg(
        date=pl.col('distribution').struct.field('date').max(), 
        distributions=pl.col('distribution'),
    )
    .sort('package', 'date')
)

In [6]:
print(df.n_unique('package'), 'packages,', len(df), 'releases')

14995 packages, 1160954 releases


In [7]:
df.write_parquet('../data/releases.parquet')