This notebook simulates the installation of several packages through time. More specifically, for each "package of interest" (see filters below), we simulate its installation in a virtual environment (using `uv`) at several dates. We do this simulation for 2 releases for each package: a first, fixed, release selected based on `SNAPSHOT_DATE`, and for each date, the latest release available at this point in time. 

The output of this simulation is a dependency tree that we store locally. This dependency tree is obtained via `uv tree` and will be parsed in another notebook.

In [1]:
import polars as pl 

import joblib
import tqdm

from packaging.version import Version, InvalidVersion
from pathlib import Path
import tempfile
import subprocess

In [2]:
SNAPSHOT_DATE = pl.datetime(2023, 1, 1, time_zone='UTC')
END_DATE = pl.datetime(2025, 12, 15, time_zone='UTC')

EXCLUDE_PREPOST_RELEASES = True
MIN_RELEASES_BEFORE = 5
MIN_LIFETIME_BEFORE = pl.duration(days=180)
MIN_RELEASES_AFTER = 5
MIN_LIFETIME_AFTER = pl.duration(days=365)
 
# 3.12 has backward incompatible changes in importlib
# 3.11 is supported from 2022-10-24 to 2027-10
# /!\ Make sure to "uv python install 3.11" before running this notebook!
PYTHON_VERSION = "3.11" 

# Blacklist packages that are known to be in error or to take ages
BLACKLIST = ['apache-flink']

PATH = Path('../data-raw/uv-tree/')

In [3]:
# List of all extracted releases
df_releases = pl.read_parquet('../data/releases.parquet')

In [4]:
def _f(v): 
    try:
        return Version(v).base_version
    except InvalidVersion:
        return None
        
df_candidates = (
    df_releases
    .select('package', 'release', 'date')
    .filter(
        pl.lit(not EXCLUDE_PREPOST_RELEASES) | (pl.col('release') == pl.col('release').map_elements(_f, return_dtype=pl.String()))
    )
    .with_columns(
        first_release=pl.col('date').min().over('package'), 
        last_release=pl.col('date').max().over('package'),
        rel_before=pl.col('date').filter(pl.col('date') < SNAPSHOT_DATE).count().over('package'),
        rel_after=pl.col('date').filter(pl.col('date') >= SNAPSHOT_DATE).count().over('package'),
        lifetime_before=(pl.col('date').filter(pl.col('date') < SNAPSHOT_DATE).max() - pl.col('date').filter(pl.col('date') < SNAPSHOT_DATE).min()).over('package'),
        lifetime_after=(pl.col('date').filter(pl.col('date') >= SNAPSHOT_DATE).max() - pl.col('date').filter(pl.col('date') >= SNAPSHOT_DATE).min()).over('package'),   
    )
    .filter(
        pl.col('rel_before') >= MIN_RELEASES_BEFORE, 
        pl.col('lifetime_before') >= MIN_LIFETIME_BEFORE, 
        pl.col('rel_after') >= MIN_RELEASES_AFTER, 
        pl.col('lifetime_after') >= MIN_LIFETIME_AFTER, 
    )
    # Select latest release before snapshot
    .filter(pl.col('date') <= SNAPSHOT_DATE)
    .group_by(pl.col('package'))
    .agg(pl.all().sort_by('date').last())
)

In [5]:
df_candidates

package,release,date,first_release,last_release,rel_before,rel_after,lifetime_before,lifetime_after
str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]",u32,u32,duration[μs],duration[μs]
"""uvicorn""","""0.20.0""",2022-11-20 12:34:04.947431 UTC,2017-06-05 11:32:15.069863 UTC,2025-10-18 13:46:44.630076 UTC,141,35,1994d 1h 1m 49s 877568µs,953d 14h 39m 18s 308506µs
"""djhtml""","""1.5.2""",2022-08-04 15:16:41.929712 UTC,2021-05-09 07:01:21.667366 UTC,2025-10-08 12:03:17.172011 UTC,27,12,452d 8h 15m 20s 262346µs,980d 14h 2m 3s 676256µs
"""soda-core""","""3.0.17""",2022-12-28 04:00:59.098450 UTC,2022-06-28 07:45:16.804888 UTC,2025-09-24 10:43:43.104230 UTC,18,79,182d 20h 15m 42s 293562µs,987d 3m 20s 166693µs
"""django-tables2""","""2.5.0""",2022-12-27 11:43:28.604688 UTC,2011-06-20 07:55:12.848444 UTC,2025-11-21 10:17:43.813956 UTC,109,11,4208d 3h 48m 15s 756244µs,1048d 20h 15m 43s 622129µs
"""cmaes""","""0.9.0""",2022-11-08 07:22:13.281561 UTC,2020-01-30 13:25:50.039875 UTC,2025-07-23 07:01:53.576152 UTC,16,5,1012d 17h 56m 23s 241686µs,929d 57m 9s 444682µs
…,…,…,…,…,…,…,…,…
"""rocketchat-API""","""1.28.1""",2022-10-30 20:41:18.227710 UTC,2017-03-13 13:32:10.079238 UTC,2025-12-07 21:02:11.959402 UTC,78,13,2057d 7h 9m 8s 148472µs,1010d 6h 3m 18s 763142µs
"""mypy-boto3-comprehend""","""1.26.21""",2022-12-01 20:25:50.854104 UTC,2019-11-09 01:42:28.043332 UTC,2025-12-04 20:58:52.367631 UTC,550,27,1118d 18h 43m 22s 810772µs,1010d 30m 38s 994800µs
"""myst-parser""","""0.18.1""",2022-09-27 09:57:45.183176 UTC,2020-06-22 15:46:52.823295 UTC,2025-02-12 10:53:03.833302 UTC,35,9,826d 18h 10m 52s 359881µs,714d 4h 5m 40s 228604µs
"""tdda""","""2.0.1""",2022-02-24 10:23:05.661097 UTC,2017-01-18 14:21:52.116003 UTC,2025-03-05 08:49:17.700252 UTC,41,23,1862d 20h 1m 13s 545094µs,753d 15h 47m 25s 979709µs


In [6]:
def simulate_installation(package, date, path):
    with tempfile.TemporaryDirectory() as directory:
        # Initialize a uv project
        try:
            subprocess.run(['uv', 'init', '.', '--bare', '--no-workspace', '--name', 'root', '--python', PYTHON_VERSION], cwd=directory, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            with open(f'{path}.err', 'w') as fp:
                fp.write(e.stdout)
                fp.write('\n\n\n')
                fp.write(e.stderr)
            return False
            
        # Add dependency
        try:
            r = subprocess.run(
                ['uv', 'add', package, '--exclude-newer', str(date), '--no-sync', '-qq', '--color', 'never', '--python', PYTHON_VERSION], 
                cwd=directory, check=True, capture_output=True, text=True
            )
        except subprocess.CalledProcessError as e:
            with open(f'{path}.err', 'w') as fp:
                fp.write(e.stdout)
                fp.write('\n\n\n')
                fp.write(e.stderr)
            return False

        # Export tree
        r = subprocess.run(
            ['uv', 'tree', '--no-dev', '--no-dedupe', '--no-progress', '--show-sizes', '--quiet', '--color', 'never'], 
            cwd=directory, capture_output=True, text=True
        )
        with open(f'{path}.tree', 'w') as fp:
            fp.write(r.stdout)
        return True

In [7]:
if not PATH.exists():
    PATH.mkdir()

dates = pl.date_range(SNAPSHOT_DATE, END_DATE, interval='1mo', eager=True)

jobs = []

for package, release in tqdm.tqdm(df_candidates.select('package', 'release').iter_rows()):
    if package in BLACKLIST:
        continue
        
    for date in dates: 
        path = PATH / f'{package}#{release}#{date}'
        if not Path(f'{path}.tree').exists():
            jobs.append((f'{package}=={release}', date, path))

        path = PATH / f'{package}#latest#{date}'
        if not Path(f'{path}.tree').exists():
            jobs.append((package, date, path))

print(len(jobs), 'planned jobs')

tasks = joblib.Parallel(return_as='generator_unordered')(joblib.delayed(simulate_installation)(*job) for job in jobs)

3746it [00:03, 1109.47it/s]

178434 planned jobs





In [None]:
# Take a coffee, this cell is going to run for a while :-)
for _ in tqdm.tqdm(tasks, total=len(jobs)):
    pass

  2%|▌                                | 3134/178434 [08:48<355:57:22,  7.31s/it]

Let's see how many simulations were performed correctly.

In [11]:
results = []
for filename in PATH.iterdir():
    if filename.name.startswith('.'):
        continue
        
    package, release, _ = filename.name.split('#')
    date, result = _.split('.')
    results.append((package, release, date, result))

results = pl.from_records(results, schema=['package', 'simulation', 'date', 'result'], orient='row')

In [12]:
# Here, "simulations" should be 72 per package
print(results.n_unique('package'), 'packages and', len(results), 'simulations')
print('success for', results.filter(pl.col('result') == pl.lit('tree')).n_unique('package'), 'packages and', len(results.filter(pl.col('result') == pl.lit('tree'))), 'simulations')
print('failure for', results.filter(pl.col('result') != pl.lit('tree')).n_unique('package'), 'packages and', len(results.filter(pl.col('result') != pl.lit('tree'))), 'simulations')

_ = results.group_by('package').agg((pl.col('result') == pl.lit('tree')).all(), pl.len().alias('simulations')).filter('result')
print('full success for', _.n_unique('package'), 'packages and', _.select('simulations').sum().item(), 'simulations')

3747 packages and 269746 simulations
success for 3746 packages and 264820 simulations
failure for 150 packages and 4926 simulations
full success for 3597 packages and 258946 simulations
