The purpose of this notebook is to extract package information (including releases and their distribution files) from PyPI for the most downloaded packages.

In [20]:
import polars as pl
import requests
import tqdm

from datetime import datetime
from packaging.version import Version

In [22]:
# Tunable parameters
N_PACKAGES = 2_000  # number of most downloaded projects to keep
EXCLUDE_BEFORE = pl.datetime(year=2020, month=1, day=1, time_zone='UTC')  # UTC cutoff expression

# Rank projects by download count and keep the names of the top N_PACKAGES
top_packages = (
    pl.read_csv('../data-raw/top-pypi-packages.csv')
    .sort('download_count', descending=True)
    .head(N_PACKAGES)
)
PACKAGES = top_packages['project'].to_list()

In [12]:
def get_releases_for(package, session=None, timeout=30):
    """Fetch every uploaded distribution file of ``package`` from the PyPI JSON API.

    Parameters
    ----------
    package : str
        Project name to look up on PyPI.
    session : object, optional
        Any object exposing a ``requests``-style ``get(url, timeout=...)``
        method, e.g. a ``requests.Session`` so that connections are reused
        across many calls. When omitted, the module-level ``requests`` API
        is used.
    timeout : float, optional
        Seconds to wait for PyPI before giving up (default: 30).

    Returns
    -------
    list of dict
        One dict per distribution file, with keys ``package``, ``release``,
        ``type``, ``date`` (timezone-aware ``datetime``) and ``size``.
        Releases without any distribution file contribute no rows.

    Raises
    ------
    Exception
        Whatever ``raise_for_status()`` raises for an unsuccessful HTTP
        response (``requests.HTTPError`` with the default client), plus
        ``KeyError``/``ValueError`` if the payload is malformed.
    """
    http = requests if session is None else session
    # A timeout prevents a single stalled connection from blocking the whole crawl.
    response = http.get(f'https://pypi.org/pypi/{package}/json', timeout=timeout)
    # Surface 404/5xx explicitly instead of failing later with an opaque KeyError.
    response.raise_for_status()
    payload = response.json()

    canonical_name = str(payload['info']['name'])
    return [
        {
            'package': canonical_name,
            'release': str(version),
            'type': str(distribution['packagetype']),
            # PyPI timestamps end in "Z", which datetime.fromisoformat only
            # accepts from Python 3.11 on; an explicit offset works everywhere.
            'date': datetime.fromisoformat(
                str(distribution['upload_time_iso_8601']).replace('Z', '+00:00')
            ),
            'size': int(distribution['size']),
        }
        for version, distributions in payload['releases'].items()
        for distribution in distributions
    ]

In [13]:
# Crawl PyPI package by package; a failing package is reported and skipped
# so that a single error does not abort the whole run.
releases = []
for package in tqdm.tqdm(PACKAGES):
    try:
        releases += get_releases_for(package)
    except Exception as error:
        print(package, error)

100%|███████████████████████████████████████| 2000/2000 [04:16<00:00,  7.79it/s]


In [14]:
# Collapse the per-file rows into one row per (package, release):
# `date` is the latest upload among the release's files and
# `distributions` lists every file as a {type, date, size} struct.
df = (
    pl.from_dicts(releases)
    .select(
        pl.col('package'),
        pl.col('release'),
        pl.struct('type', 'date', 'size').alias('distribution'),
    )
    .group_by('package', 'release')
    .agg(
        pl.col('distribution').struct.field('date').max().alias('date'),
        pl.col('distribution').alias('distributions'),
    )
    .sort('package', 'date')
)

In [16]:
# Quick sanity check on the size of the collected dataset
print(f"{df.n_unique('package')} packages, {df.height} releases")

2000 packages, 163979 releases


In [17]:
# Persist the complete, unfiltered release table
releases_path = '../data/releases.parquet'
df.write_parquet(releases_path)

In [30]:
# Step 1: keep only releases dated on or after the cutoff
is_recent = pl.col('date') >= EXCLUDE_BEFORE
df_selected = df.filter(is_recent)

print(f"{df_selected.n_unique('package')} packages, {df_selected.height} releases")

# Step 2: keep a release only when its version string equals the
# `base_version` that `packaging` derives from it; this excludes
# prereleases, postreleases, dev releases, etc.
base_version = pl.col('release').map_elements(
    lambda v: Version(v).base_version,
    return_dtype=pl.String(),
)
df_selected = df_selected.filter(pl.col('release') == base_version)

print(f"{df_selected.n_unique('package')} packages, {df_selected.height} releases")

1940 packages, 118777 releases
1900 packages, 95883 releases


In [29]:
# Display the selected releases (one row per package/release)
df_selected

package,release,date,distributions
str,str,"datetime[μs, UTC]",list[struct[3]]
"""APScheduler""","""3.7.0""",2021-01-19 14:35:17.646332 UTC,"[{""bdist_wheel"",2021-01-19 14:35:16.372004 UTC,59349}, {""sdist"",2021-01-19 14:35:17.646332 UTC,97826}]"
"""APScheduler""","""3.8.0""",2021-09-23 22:10:25.859304 UTC,"[{""bdist_wheel"",2021-09-23 22:10:24.219996 UTC,59449}, {""sdist"",2021-09-23 22:10:25.859304 UTC,100739}]"
"""APScheduler""","""3.8.1""",2021-10-24 20:57:39.730092 UTC,"[{""bdist_wheel"",2021-10-24 20:57:37.728242 UTC,59448}, {""sdist"",2021-10-24 20:57:39.730092 UTC,98699}]"
"""APScheduler""","""3.9.0""",2022-02-24 09:09:50.620827 UTC,"[{""bdist_wheel"",2022-02-24 09:09:48.628894 UTC,112751}, {""sdist"",2022-02-24 09:09:50.620827 UTC,100492}]"
"""APScheduler""","""3.9.0.post1""",2022-02-24 09:22:34.111065 UTC,"[{""bdist_wheel"",2022-02-24 09:22:32.768242 UTC,59679}, {""sdist"",2022-02-24 09:22:34.111065 UTC,100511}]"
…,…,…,…
"""zstd""","""1.5.6.7""",2025-04-02 19:49:27.514609 UTC,"[{""bdist_wheel"",2025-04-02 18:48:12.101631 UTC,258048}, {""bdist_wheel"",2025-04-02 18:52:03.034799 UTC,1330747}, … {""sdist"",2025-04-02 18:28:58.514022 UTC,649577}]"
"""zstd""","""1.5.6.8""",2025-05-01 09:24:08.719649 UTC,"[{""bdist_wheel"",2025-05-01 09:16:11.099027 UTC,1310574}, {""bdist_wheel"",2025-05-01 09:23:09.223904 UTC,241977}, … {""sdist"",2025-05-01 09:15:30.565191 UTC,652944}]"
"""zstd""","""1.5.7.0""",2025-05-05 16:48:38.784463 UTC,"[{""bdist_wheel"",2025-05-05 16:38:54.892474 UTC,1339331}, {""bdist_wheel"",2025-05-05 16:45:38.647686 UTC,247169}, … {""sdist"",2025-05-05 16:33:38.151699 UTC,666101}]"
"""zstd""","""1.5.7.1""",2025-06-07 16:34:35.144187 UTC,"[{""bdist_wheel"",2025-06-07 15:41:03.812965 UTC,270615}, {""bdist_wheel"",2025-06-07 16:10:30.983470 UTC,1490499}, … {""sdist"",2025-06-07 15:31:05.697350 UTC,670457}]"


In [28]:
# Persist the filtered release table
selected_releases_path = '../data/selected_releases.parquet'
df_selected.write_parquet(selected_releases_path)