This notebook simulates the installation of several packages through time. More specifically, for each "package of interest" (see the filters below), we simulate its installation in a virtual environment (using `uv`) at several dates (one per month, from `SNAPSHOT_DATE` to `END_DATE`). For each package and each date, two simulations are performed: one with a fixed release (the latest release of the package published before `SNAPSHOT_DATE`), and one with the latest release available at that date.

The output of each simulation is a dependency tree that we store locally. This dependency tree is obtained via `uv tree` and will be parsed in another notebook. Note that executing this notebook on a large collection of packages (as is the case by default) takes some time (expect a few hours on a very fast server). For convenience, a simulation that has already been performed (i.e., there is a corresponding `*.tree` or `*.err` file in `PATH`) is skipped. Do not forget to empty this directory when you want to run the simulations again from scratch!

In [2]:
import polars as pl 

import joblib
import tqdm

from packaging.version import Version, InvalidVersion
from pathlib import Path
import tempfile
import subprocess

In [3]:
# Reference point in time. The "fixed" release of each package is its latest
# release published up to this date, and the monthly simulation dates start here.
SNAPSHOT_DATE = pl.datetime(2023, 1, 1, time_zone='UTC')
# Upper bound of the range of simulation dates.
END_DATE = pl.datetime(2025, 12, 15, time_zone='UTC')

# Filters defining the "packages of interest".
# When True, only releases whose version string equals its own base version are kept.
EXCLUDE_PREPOST_RELEASES = True
# Minimum number of releases, and minimum time span between the first and the
# last of them, strictly before SNAPSHOT_DATE ...
MIN_RELEASES_BEFORE = 5
MIN_LIFETIME_BEFORE = pl.duration(days=180)
# ... and the same requirements for the releases published from SNAPSHOT_DATE onwards.
MIN_RELEASES_AFTER = 5
MIN_LIFETIME_AFTER = pl.duration(days=365)
 
# Python version passed to every uv command (via --python).
# 3.12 has backward incompatible changes in importlib
# 3.11 is supported from 2022-10-24 to 2027-10
# /!\ Make sure to "uv python install 3.11" before running this notebook!
PYTHON_VERSION = "3.11" 

# Packages that are skipped altogether, because their simulation is known to
# fail or to take ages.
BLACKLIST = ['apache-flink', 'Products.CMFPlone', 'Products.CMFCore', 'Products.CMFEditions', 'plone.app.dexterity', 'plone.protect', 'plone.app.z3cform', 'plone.app.layout', 'plone.app.portlets', 
            'cdk-certbot-dns-route53', 'cdk-events-notify', 'cdk-gitlab-runner', 'aws-parallelcluster', ]
# Timeout, in seconds, applied to each individual uv command.
TIMEOUT = 60
# Directory in which the *.tree and *.err files are written.
PATH = Path('../data-raw/uv-tree/')

In [46]:
# List of all extracted releases. The cells below rely on its 'package',
# 'release' and 'date' columns.
df_releases = pl.read_parquet('../data/releases.parquet')

In [47]:
def _f(v):
    """Return the base version of the version string ``v``.

    ``None`` is returned when ``v`` cannot be parsed as a version.
    """
    try:
        parsed = Version(v)
    except InvalidVersion:
        return None
    return parsed.base_version
        
# Release dates of a package, split around the snapshot date.
release_date = pl.col('date')
dates_before = release_date.filter(release_date < SNAPSHOT_DATE)
dates_after = release_date.filter(release_date >= SNAPSHOT_DATE)

# A release is kept when its version string is already its own base version
# (unparsable versions yield None and are therefore dropped as well).
is_base_release = pl.col('release') == pl.col('release').map_elements(_f, return_dtype=pl.String())

df_candidates = (
    df_releases
    .select('package', 'release', 'date')
    # The version check is bypassed when EXCLUDE_PREPOST_RELEASES is disabled
    .filter(pl.lit(not EXCLUDE_PREPOST_RELEASES) | is_base_release)
    # Per-package statistics, computed before and after the snapshot date
    .with_columns(
        first_release=release_date.min().over('package'),
        last_release=release_date.max().over('package'),
        rel_before=dates_before.count().over('package'),
        rel_after=dates_after.count().over('package'),
        lifetime_before=(dates_before.max() - dates_before.min()).over('package'),
        lifetime_after=(dates_after.max() - dates_after.min()).over('package'),
    )
    # Keep packages that are active enough on both sides of the snapshot date
    .filter(
        pl.col('rel_before') >= MIN_RELEASES_BEFORE,
        pl.col('rel_after') >= MIN_RELEASES_AFTER,
        pl.col('lifetime_before') >= MIN_LIFETIME_BEFORE,
        pl.col('lifetime_after') >= MIN_LIFETIME_AFTER,
    )
    # For each remaining package, keep a single row: its latest release up to
    # the snapshot date
    .filter(release_date <= SNAPSHOT_DATE)
    .group_by('package')
    .agg(pl.all().sort_by('date').last())
)

In [48]:
# Display the selected candidates: one row per package, holding its latest
# release up to SNAPSHOT_DATE along with the statistics used for filtering.
df_candidates

package,release,date,first_release,last_release,rel_before,rel_after,lifetime_before,lifetime_after
str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]",u32,u32,duration[μs],duration[μs]
"""daphne""","""4.0.0""",2022-10-07 13:17:22.244245 UTC,2016-02-09 20:54:37.443307 UTC,2025-07-02 12:57:04.935176 UTC,56,5,2431d 16h 22m 44s 800938µs,507d 22h 9m 57s 106834µs
"""paddleocr""","""2.6.1.2""",2022-12-14 02:58:13.461221 UTC,2020-08-22 07:18:18.836513 UTC,2025-11-13 14:46:38.722945 UTC,34,23,843d 19h 39m 54s 624708µs,1009d 6h 56m 57s 576511µs
"""types-aiobotocore-dms""","""2.4.2""",2022-12-23 01:24:28.046439 UTC,2022-02-07 05:21:18.142305 UTC,2025-12-11 01:57:14.143081 UTC,11,45,318d 20h 3m 9s 904134µs,1009d 16m 8s 759611µs
"""osc-placement""","""4.0.0""",2022-07-15 08:09:56.688886 UTC,2017-09-05 11:24:14.971756 UTC,2025-09-01 13:35:34.158312 UTC,19,7,1773d 20h 45m 41s 717130µs,927d 3h 10m 31s 686220µs
"""pyobjc-framework-CoreData""","""9.0.1""",2022-12-19 10:42:01.525491 UTC,2009-11-24 16:01:14.960480 UTC,2025-11-14 10:13:36.435834 UTC,44,13,4772d 18h 40m 46s 565011µs,943d 38m 59s 949086µs
…,…,…,…,…,…,…,…,…
"""rosbags""","""0.9.13""",2022-09-25 15:42:56.867341 UTC,2021-05-16 18:45:33.298420 UTC,2025-10-15 09:49:54.453807 UTC,14,23,496d 20h 57m 23s 568921µs,1006d 21h 39m 52s 681029µs
"""onnx2torch""","""1.5.4""",2022-12-26 10:01:11.348138 UTC,2021-12-14 11:04:58.570205 UTC,2024-08-07 14:04:09.985592 UTC,12,9,376d 22h 56m 12s 777933µs,544d 4h 22m 33s 926339µs
"""google-cloud-pipeline-componen…","""1.0.32""",2022-12-21 01:15:03.677826 UTC,2021-05-13 21:39:26.311358 UTC,2025-11-10 23:26:13.739718 UTC,52,43,586d 3h 35m 37s 366468µs,1033d 21h 53s 8709µs
"""huawei-solar""","""2.1.6""",2022-10-30 18:01:14.451174 UTC,2020-03-04 17:12:14.548279 UTC,2025-10-11 07:50:59.291817 UTC,25,16,970d 48m 59s 902895µs,995d 22h 35m 11s 435384µs


In [49]:
def simulate_installation(package, date, path):
    """Resolve ``package`` with uv as of ``date`` and store its dependency tree.

    A bare uv project is created in a temporary directory, ``package`` is
    added to it (without syncing an environment) ignoring everything
    published after ``date``, and the output of ``uv tree`` is saved.

    Parameters:
        package: requirement handed to ``uv add``, e.g. ``'name'`` or
            ``'name==1.2.3'``.
        date: cut-off date; ``str(date)`` is passed to ``--exclude-newer``.
        path: output path without extension. ``{path}.tree`` is written on
            success, ``{path}.err`` when a uv command exits with an error.

    Returns:
        True if the tree was written, False otherwise. When a command times
        out, a message is printed and no file is written.
    """
    exclude_newer = str(date)
    steps = (
        # Initialize a uv project
        ('uv init', ['uv', 'init', '.', '--bare', '--no-workspace', '--name', 'root', '--python', PYTHON_VERSION]),
        # Add dependency
        ('uv add', ['uv', 'add', package, '--exclude-newer', exclude_newer, '--no-sync', '-qq', '--color', 'never', '--python', PYTHON_VERSION]),
        # Export tree
        ('uv tree', ['uv', 'tree', '--exclude-newer', exclude_newer, '--no-dev', '--no-dedupe', '--no-progress', '--show-sizes', '--quiet', '--color', 'never', '--python', PYTHON_VERSION]),
    )

    with tempfile.TemporaryDirectory() as directory:
        for label, command in steps:
            try:
                # check=True is required for every step (including `uv tree`):
                # without it a failing command would go unnoticed and its
                # output would be stored as if it were a valid tree.
                completed = subprocess.run(
                    command,
                    timeout=TIMEOUT, cwd=directory, check=True, capture_output=True, text=True
                )
            except subprocess.CalledProcessError as e:
                # Keep what uv reported so that the failure can be inspected.
                with open(f'{path}.err', 'w') as fp:
                    fp.write(e.stdout)
                    fp.write('\n\n\n')
                    fp.write(e.stderr)
                return False
            except subprocess.TimeoutExpired:
                # No file is written on purpose: the job is not marked as
                # done on disk.
                print(f'Timeout: {path} ({label})')
                return False

        # `completed` is the result of the last step, i.e. `uv tree`.
        with open(f'{path}.tree', 'w') as fp:
            fp.write(completed.stdout)
        return True

In [57]:
# Create the output directory, including missing parents; no-op if it exists.
PATH.mkdir(parents=True, exist_ok=True)

# One simulation date per month between SNAPSHOT_DATE and END_DATE.
dates = pl.date_range(SNAPSHOT_DATE, END_DATE, interval='1mo', eager=True)

blacklisted = set(BLACKLIST)
candidates = df_candidates.select('package', 'release')

# Each job is a tuple of arguments for simulate_installation.
jobs = []

for package, release in tqdm.tqdm(candidates.iter_rows(), total=candidates.height):
    if package in blacklisted:
        continue

    for date in dates:
        # Two simulations per date: the release pinned at the snapshot date,
        # and whatever release is the latest one at that date.
        for label, requirement in ((release, f'{package}=={release}'), ('latest', package)):
            path = PATH / f'{package}#{label}#{date}'
            # Skip simulations that already produced a result (or an error).
            if not Path(f'{path}.tree').exists() and not Path(f'{path}.err').exists():
                jobs.append((requirement, date, path))

print(len(jobs), 'planned jobs')

# n_jobs must be given explicitly: when it is left unset, joblib runs the
# tasks one at a time. -1 uses all available CPUs.
tasks = joblib.Parallel(n_jobs=-1, return_as='generator_unordered')(
    joblib.delayed(simulate_installation)(*job) for job in jobs
)

5131it [00:04, 1133.40it/s]

49653 planned jobs





In [None]:
# Grab a coffee, this cell is going to run for a while :-)
# On a very very decent server, it took around 1h for 50K+ simulations.
# Do not trust the progress bar too early!
progress = tqdm.tqdm(tasks, total=len(jobs), smoothing=0)
for _outcome in progress:
    # Nothing to do with the outcome: consuming the generator is what makes
    # the progress bar advance, and results are stored on disk by the jobs.
    pass

# Consider executing this cell (and the one above) multiple times: jobs that
# timed out are simply ignored. They generate neither a .tree nor a .err
# file, and will be attempted again during the next run.

  0%|                                       | 157/49653 [00:06<34:24, 23.97it/s]

Let's see how many simulations were performed correctly.

In [4]:
# Rebuild the list of performed simulations from the names of the output
# files, which follow the pattern "<package>#<simulation>#<date>.<result>".
records = []
for entry in PATH.iterdir():
    # Ignore hidden files
    if entry.name.startswith('.'):
        continue

    # Split the extension from the right, so that dots occurring elsewhere in
    # the name (release numbers, or a date with fractional seconds) are left
    # untouched.
    stem, _, result = entry.name.rpartition('.')
    package, release, date = stem.split('#')
    records.append((package, release, date, result))

# "simulation" is either the pinned release or "latest"; "result" is the file
# extension ("tree" or "err").
results = pl.from_records(records, schema=['package', 'simulation', 'date', 'result'], orient='row')

In [5]:
# Here, "simulations" should be 72 per package
succeeded = results.filter(pl.col('result') == 'tree')
failed = results.filter(pl.col('result') != 'tree')

print(results.n_unique('package'), 'packages and', results.height, 'simulations')
print('success for', succeeded.n_unique('package'), 'packages and', succeeded.height, 'simulations')
print('failure for', failed.n_unique('package'), 'packages and', failed.height, 'simulations')

# Packages for which every single simulation produced a tree
fully_succeeded = (
    results
    .group_by('package')
    .agg(
        (pl.col('result') == 'tree').all(),
        pl.len().alias('simulations'),
    )
    .filter(pl.col('result'))
)
print('full success for', fully_succeeded.n_unique('package'), 'packages and', fully_succeeded.select('simulations').sum().item(), 'simulations')

5119 packages and 368500 simulations
success for 5108 packages and 360005 simulations
failure for 251 packages and 8495 simulations
full success for 4868 packages and 350428 simulations
