In [38]:
import polars as pl 

import joblib
import tqdm

from packaging.version import Version, InvalidVersion
from pathlib import Path
import tempfile
import subprocess

In [98]:
# Study window: releases are split into "before" and "after" SNAPSHOT_DATE,
# and installations are simulated monthly from SNAPSHOT_DATE up to END_DATE.
SNAPSHOT_DATE = pl.datetime(year=2023, month=1, day=1, time_zone='UTC')
END_DATE = pl.datetime(year=2025, month=12, day=15, time_zone='UTC')

# Candidate selection thresholds: when EXCLUDE_PREPOST_RELEASES is set, only
# releases whose version string equals its own base version are kept; a package
# must then have enough releases, spread over a long enough period, on each
# side of the snapshot.
EXCLUDE_PREPOST_RELEASES = True
MIN_RELEASES_BEFORE = 5
MIN_LIFETIME_BEFORE = pl.duration(days=365)
MIN_RELEASES_AFTER = 5
MIN_LIFETIME_AFTER = pl.duration(days=365)

# Python interpreter version handed to every uv command.
# 3.12 has backward incompatible changes in importlib
# 3.11 is supported from 2022-10-24 to 2027-10
# /!\ Make sure to "uv python install 3.11" before running this notebook!
PYTHON_VERSION = "3.11"

# Directory receiving one ".tree" or ".err" file per simulation.
PATH = Path('..') / 'data-raw' / 'uv-tree'

In [22]:
# List of all extracted releases.
# The candidate selection below reads its 'package', 'release' and 'date' columns.
df_releases = pl.read_parquet('../data/releases.parquet')

In [25]:
def _f(v):
    """Return the base version of the version string ``v``.

    The string is parsed with ``packaging.version.Version``; its
    ``base_version`` is returned, or ``None`` when the string cannot be
    parsed (``InvalidVersion``).
    """
    try:
        parsed = Version(v)
    except InvalidVersion:
        # Unparsable version strings have no base version.
        return None
    return parsed.base_version
        
# Release dates on each side of the snapshot. A release is "before" when it was
# published strictly earlier than SNAPSHOT_DATE and "after" otherwise; the same
# boundary is used for the counts, the lifetimes and the final selection.
release_date = pl.col('date')
dates_before = release_date.filter(release_date < SNAPSHOT_DATE)
dates_after = release_date.filter(release_date >= SNAPSHOT_DATE)

df_candidates = (
    df_releases
    .select('package', 'release', 'date')
    # Optionally keep only releases whose version string equals its own base
    # version, as computed by _f (unparsable versions map to null and are dropped).
    .filter(
        pl.lit(not EXCLUDE_PREPOST_RELEASES) | (pl.col('release') == pl.col('release').map_elements(_f, return_dtype=pl.String()))
    )
    # Per-package statistics, broadcast to every release row of the package.
    .with_columns(
        first_release=release_date.min().over('package'),
        last_release=release_date.max().over('package'),
        rel_before=dates_before.count().over('package'),
        rel_after=dates_after.count().over('package'),
        lifetime_before=(dates_before.max() - dates_before.min()).over('package'),
        lifetime_after=(dates_after.max() - dates_after.min()).over('package'),
    )
    # Keep packages that are active enough on both sides of the snapshot.
    .filter(
        pl.col('rel_before') >= MIN_RELEASES_BEFORE,
        pl.col('lifetime_before') >= MIN_LIFETIME_BEFORE,
        pl.col('rel_after') >= MIN_RELEASES_AFTER,
        pl.col('lifetime_after') >= MIN_LIFETIME_AFTER,
    )
    # Select latest release before snapshot (strictly before, so that a release
    # counted in rel_after can never be picked as the "before" release).
    .filter(release_date < SNAPSHOT_DATE)
    .group_by(pl.col('package'))
    .agg(pl.all().sort_by('date').last())
)

In [62]:
# Preview the candidates: one row per package, holding its latest release
# before the snapshot together with the per-package statistics.
df_candidates.head()

package,release,date,first_release,last_release,rel_before,rel_after,lifetime_before,lifetime_after
str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]",u32,u32,duration[μs],duration[μs]
"""rollbar""","""0.16.3""",2022-06-08 10:33:30.318731 UTC,2013-02-26 06:35:41.758426 UTC,2025-03-26 15:12:19.274683 UTC,90,6,3389d 3h 57m 48s 560305µs,496d 1h 12m 45s 970665µs
"""openapi-spec-validator""","""0.5.1""",2022-09-05 10:37:53.153567 UTC,2017-09-06 13:05:58.121463 UTC,2025-06-07 14:48:56.299247 UTC,23,10,1824d 21h 31m 55s 32104µs,873d 9h 17m 17s 699377µs
"""pytest-check""","""1.3.0""",2022-12-02 20:22:31.948949 UTC,2017-10-30 05:35:39.236507 UTC,2025-11-29 02:51:15.908509 UTC,32,25,1859d 14h 46m 52s 712442µs,1055d 9h 42m 28s 333050µs
"""geventhttpclient""","""2.0.8""",2022-10-15 08:37:34.423783 UTC,2014-04-09 18:00:39.473245 UTC,2025-10-26 10:33:56.475367 UTC,19,13,3110d 14h 36m 54s 950538µs,957d 12h 40m 43s 459688µs
"""django-stubs-ext""","""0.7.0""",2022-11-03 10:25:02.739789 UTC,2020-11-14 18:30:07.993948 UTC,2025-12-01 08:12:37.486674 UTC,7,23,718d 15h 54m 54s 745841µs,990d 18h 46s 707834µs


In [94]:
def _write_error_log(path, stdout, stderr):
    """Write the captured stdout and stderr of a failed command to ``{path}.err``."""
    with open(f'{path}.err', 'w') as fp:
        fp.write(stdout or '')
        fp.write('\n\n\n')
        fp.write(stderr or '')


def simulate_installation(package, date, path):
    """Resolve ``package`` with uv as of ``date`` and save its dependency tree.

    A throw-away uv project is created in a temporary directory, ``package``
    is added to it with ``--exclude-newer date`` (without syncing an
    environment), and the output of ``uv tree`` is written to ``{path}.tree``.

    Parameters
    ----------
    package : str
        Requirement passed to ``uv add`` (e.g. ``"name"`` or ``"name==1.0"``).
    date : object
        Cut-off passed to ``--exclude-newer`` after conversion with ``str()``.
    path : str or os.PathLike
        Output path without extension; ``.tree`` or ``.err`` is appended.

    Returns
    -------
    bool
        True if all uv commands succeeded and ``{path}.tree`` was written.
        False if any of them (including ``uv tree``) exited with an error, in
        which case its stdout/stderr are written to ``{path}.err`` instead.
    """
    init_cmd = ['uv', 'init', '.', '--bare', '--no-workspace', '--name', 'root', '--python', PYTHON_VERSION]
    add_cmd = ['uv', 'add', package, '--exclude-newer', str(date), '--no-sync', '-qq', '--color', 'never', '--python', PYTHON_VERSION]
    tree_cmd = ['uv', 'tree', '--no-dev', '--no-dedupe', '--no-progress', '--show-sizes', '--quiet', '--color', 'never']

    with tempfile.TemporaryDirectory() as directory:
        try:
            # Initialize a uv project, then add the dependency
            for command in (init_cmd, add_cmd):
                subprocess.run(command, cwd=directory, check=True, capture_output=True, text=True)
            # Export tree; check=True so that a failed export is reported as a
            # failure instead of silently producing an empty .tree file.
            tree = subprocess.run(tree_cmd, cwd=directory, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            _write_error_log(path, e.stdout, e.stderr)
            return False

    with open(f'{path}.tree', 'w') as fp:
        fp.write(tree.stdout)
    # A previous failed attempt may have left an error log behind; drop it so
    # that each simulation is represented by exactly one result file.
    Path(f'{path}.err').unlink(missing_ok=True)
    return True

In [95]:
# Create the output directory, including missing parents; no-op if it exists.
PATH.mkdir(parents=True, exist_ok=True)

# One simulation date per month between the snapshot and the end date.
dates = pl.date_range(SNAPSHOT_DATE, END_DATE, interval='1mo', eager=True)

jobs = []

candidates = df_candidates.select('package', 'release')
for package, release in tqdm.tqdm(candidates.iter_rows(), total=candidates.height):
    for date in dates:
        # Two simulations per (package, date): the release pinned at the
        # snapshot, and whatever uv resolves as the latest release at that date.
        for label, requirement in ((release, f'{package}=={release}'), ('latest', package)):
            path = PATH / f'{package}#{label}#{date}'
            # Only plan simulations that have not already produced a tree.
            if not Path(f'{path}.tree').exists():
                jobs.append((requirement, date, path))

print(len(jobs), 'planned jobs')

# n_jobs must be given explicitly: joblib's default runs the jobs one at a
# time. -1 uses all available cores; lower it if the package index throttles.
tasks = joblib.Parallel(n_jobs=-1, return_as='generator_unordered')(joblib.delayed(simulate_installation)(*job) for job in jobs)

1281it [00:01, 1236.04it/s]

4383 planned jobs





In [96]:
# Do the job: consume the lazy joblib generator so that every planned
# simulation runs. The boolean outcomes are discarded here.
progress = tqdm.tqdm(tasks, total=len(jobs))
for _outcome in progress:
    pass

100%|███████████████████████████████████████| 4383/4383 [05:53<00:00, 12.41it/s]


Let's see how many simulations were performed correctly.

In [99]:
# Rebuild the outcome of every simulation from the files written to PATH.
# File names follow "<package>#<release or 'latest'>#<date>.<tree or err>".
records = []
for filename in PATH.iterdir():
    # Skip hidden files.
    if filename.name.startswith('.'):
        continue

    package, simulation, remainder = filename.name.split('#')
    # Split on the last dot only: the extension is the result, whatever
    # characters the date part contains.
    date, result = remainder.rsplit('.', 1)
    records.append((package, simulation, date, result))

# Explicit string dtypes keep the schema stable even when PATH is empty.
results = pl.from_records(
    records,
    schema={'package': pl.String, 'simulation': pl.String, 'date': pl.String, 'result': pl.String},
    orient='row',
)

In [100]:
# Here, "simulations" should be 72 per package
# (36 monthly dates x 2 variants: pinned release and latest).
is_tree = pl.col('result') == pl.lit('tree')
succeeded = results.filter(is_tree)
failed = results.filter(pl.col('result') != pl.lit('tree'))

print(results.n_unique('package'), 'packages and', len(results), 'simulations')
print('success for', succeeded.n_unique('package'), 'packages and', len(succeeded), 'simulations')
print('failure for', failed.n_unique('package'), 'packages and', len(failed), 'simulations')

# Packages for which every single simulation produced a tree.
fully_successful = (
    results
    .group_by('package')
    .agg(is_tree.all(), pl.len().alias('simulations'))
    .filter('result')
)
print('full success for', fully_successful.n_unique('package'), 'packages and', fully_successful.select('simulations').sum().item(), 'simulations')

1281 packages and 92232 simulations
success for 1281 packages and 91312 simulations
failure for 30 packages and 920 simulations
full success for 1251 packages and 90072 simulations
