The purpose of this notebook is to extract package information (including releases) from PyPI. We start from a list of the top-15K downloaded Python packages on PyPI during November 2025. For each package, we retrieve its metadata directly from PyPI and we store its list of releases. 

In [1]:
import polars as pl
import requests
import tqdm

from datetime import datetime
from packaging.version import Version

In [15]:
N_PACKAGES = 10000

# Get most downloaded packages
PACKAGES = (
    pl.read_csv('../data-raw/top-pypi-packages.csv')
    .sort(pl.col('download_count'), descending=True)
    .limit(N_PACKAGES)
    .get_column('project')
    .to_list()
)

In [17]:
def get_releases_for(package):
    r = requests.get('https://pypi.org/pypi/{package}/json'.format(package=package)).json()

    releases = []
    for version, distributions in r['releases'].items():
        for distribution in distributions: 
            if not distribution['yanked']: 
                releases.append({
                    'package': str(r['info']['name']),
                    'release': str(version),
                    'type': str(distribution['packagetype']),
                    'date': datetime.fromisoformat(str(distribution['upload_time_iso_8601'])), 
                    'size': int(distribution['size']),
                })
    return releases

In [18]:
releases = []
for package in tqdm.tqdm(PACKAGES):
    try:
        temp = get_releases_for(package)
        releases.extend(temp)
    except Exception as e:
        print(package, e)

 30%|███████████▍                          | 3002/10000 [06:00<17:42,  6.59it/s]

aaaaaaaaa 'releases'


 57%|█████████████████████▋                | 5712/10000 [12:17<10:20,  6.91it/s]

pycobaltix 'releases'


100%|█████████████████████████████████████| 10000/10000 [22:20<00:00,  7.46it/s]


In [19]:
df = (
    pl.from_dicts(releases)
    .select('package', 'release', distribution=pl.struct(type='type', date='date', size='size'))
    .group_by('package', 'release')
    .agg(
        date=pl.col('distribution').struct.field('date').max(), 
        distributions=pl.col('distribution'),
    )
    .sort('package', 'date')
)

In [20]:
print(df.n_unique('package'), 'packages,', len(df), 'releases')

9997 packages, 781932 releases


In [21]:
df.write_parquet('../data/releases.parquet')