In [1]:
import polars as pl 

import joblib
import tqdm

from packaging.version import Version, InvalidVersion
from pathlib import Path
import tempfile
import subprocess

In [17]:
SNAPSHOT_DATE = pl.datetime(2023, 1, 1, time_zone='UTC')
END_DATE = pl.datetime(2025, 12, 15, time_zone='UTC')

EXCLUDE_PREPOST_RELEASES = True
MIN_RELEASES_BEFORE = 5
MIN_LIFETIME_BEFORE = pl.duration(days=180)
MIN_RELEASES_AFTER = 5
MIN_LIFETIME_AFTER = pl.duration(days=365)
 
# 3.12 has backward incompatible changes in importlib
# 3.11 is supported from 2022-10-24 to 2027-10
# /!\ Make sure to "uv python install 3.11" before running this notebook!
PYTHON_VERSION = "3.11" 

PATH = Path('../data-raw/uv-tree/')

In [18]:
# List of all extracted releases
df_releases = pl.read_parquet('../data/releases.parquet')

In [19]:
def _f(v): 
    try:
        return Version(v).base_version
    except InvalidVersion:
        return None
        
df_candidates = (
    df_releases
    .select('package', 'release', 'date')
    .filter(
        pl.lit(not EXCLUDE_PREPOST_RELEASES) | (pl.col('release') == pl.col('release').map_elements(_f, return_dtype=pl.String()))
    )
    .with_columns(
        first_release=pl.col('date').min().over('package'), 
        last_release=pl.col('date').max().over('package'),
        rel_before=pl.col('date').filter(pl.col('date') < SNAPSHOT_DATE).count().over('package'),
        rel_after=pl.col('date').filter(pl.col('date') >= SNAPSHOT_DATE).count().over('package'),
        lifetime_before=(pl.col('date').filter(pl.col('date') < SNAPSHOT_DATE).max() - pl.col('date').filter(pl.col('date') < SNAPSHOT_DATE).min()).over('package'),
        lifetime_after=(pl.col('date').filter(pl.col('date') >= SNAPSHOT_DATE).max() - pl.col('date').filter(pl.col('date') >= SNAPSHOT_DATE).min()).over('package'),   
    )
    .filter(
        pl.col('rel_before') >= MIN_RELEASES_BEFORE, 
        pl.col('lifetime_before') >= MIN_LIFETIME_BEFORE, 
        pl.col('rel_after') >= MIN_RELEASES_AFTER, 
        pl.col('lifetime_after') >= MIN_LIFETIME_AFTER, 
    )
    # Select latest release before snapshot
    .filter(pl.col('date') <= SNAPSHOT_DATE)
    .group_by(pl.col('package'))
    .agg(pl.all().sort_by('date').last())
)

In [20]:
df_candidates

package,release,date,first_release,last_release,rel_before,rel_after,lifetime_before,lifetime_after
str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]",u32,u32,duration[μs],duration[μs]
"""GitPython""","""3.1.30""",2022-12-29 07:16:38.435877 UTC,2010-03-21 02:56:30.287083 UTC,2025-07-24 03:45:54.871622 UTC,68,14,4666d 4h 20m 8s 148794µs,888d 11h 12m 38s 543870µs
"""cloud-sql-python-connector""","""1.0.0""",2022-12-06 21:41:25.183934 UTC,2021-05-04 22:26:45.354866 UTC,2025-10-09 22:30:04.133806 UTC,22,33,580d 23h 14m 39s 829068µs,1003d 4h 44m 14s 300959µs
"""pyhanko-certvalidator""","""0.19.8""",2022-12-19 23:15:39.929559 UTC,2020-12-05 12:53:31.953496 UTC,2025-09-12 22:05:37.207710 UTC,24,21,744d 10h 22m 7s 976063µs,955d 48s 495616µs
"""types-cachetools""","""5.2.1""",2022-06-26 09:17:01.769749 UTC,2021-02-02 20:27:35.392177 UTC,2025-10-22 03:03:58.160795 UTC,27,14,508d 12h 49m 26s 377572µs,991d 14h 32m 46s 286650µs
"""mpire""","""2.6.0""",2022-08-29 09:03:35.296883 UTC,2020-09-03 17:11:46.042786 UTC,2024-05-07 14:00:31.815336 UTC,18,8,724d 15h 51m 49s 254097µs,416d 23h 29m 2s 618397µs
…,…,…,…,…,…,…,…,…
"""cuda-python""","""12.0.0""",2022-12-08 19:24:55.458757 UTC,2021-10-21 01:48:28.309746 UTC,2025-12-04 23:03:37.916370 UTC,8,22,413d 17h 36m 27s 149011µs,1010d 38m 26s 52909µs
"""ansible-lint""","""6.10.1""",2022-12-31 15:27:07.171138 UTC,2014-09-18 02:22:52.107498 UTC,2025-12-02 15:47:18.115325 UTC,125,65,3026d 13h 4m 15s 63640µs,1065d 21h 29m 59s 265648µs
"""google-cloud-core""","""2.3.2""",2022-07-18 13:18:46.182761 UTC,2016-09-29 00:13:26.718430 UTC,2025-10-29 23:17:39.513293 UTC,43,6,2118d 13h 5m 19s 464331µs,847d 3h 33m 5s 357972µs
"""djangorestframework-simplejwt""","""5.2.2""",2022-10-20 16:58:54.009437 UTC,2017-05-09 05:47:32.987359 UTC,2025-07-21 16:52:25.026340 UTC,42,5,1990d 11h 11m 21s 22078µs,700d 3h 45m 30s 978512µs


In [6]:
def simulate_installation(package, date, path):
    with tempfile.TemporaryDirectory() as directory:
        # Initialize a uv project
        try:
            subprocess.run(['uv', 'init', '.', '--bare', '--no-workspace', '--name', 'root', '--python', PYTHON_VERSION], cwd=directory, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            with open(f'{path}.err', 'w') as fp:
                fp.write(e.stdout)
                fp.write('\n\n\n')
                fp.write(e.stderr)
            return False
            
        # Add dependency
        try:
            r = subprocess.run(
                ['uv', 'add', package, '--exclude-newer', str(date), '--no-sync', '-qq', '--color', 'never', '--python', PYTHON_VERSION], 
                cwd=directory, check=True, capture_output=True, text=True
            )
        except subprocess.CalledProcessError as e:
            with open(f'{path}.err', 'w') as fp:
                fp.write(e.stdout)
                fp.write('\n\n\n')
                fp.write(e.stderr)
            return False

        # Export tree
        r = subprocess.run(
            ['uv', 'tree', '--no-dev', '--no-dedupe', '--no-progress', '--show-sizes', '--quiet', '--color', 'never'], 
            cwd=directory, capture_output=True, text=True
        )
        with open(f'{path}.tree', 'w') as fp:
            fp.write(r.stdout)
        return True

In [7]:
if not PATH.exists():
    PATH.mkdir()

dates = pl.date_range(SNAPSHOT_DATE, END_DATE, interval='1mo', eager=True)

jobs = []

for package, release in tqdm.tqdm(df_candidates.select('package', 'release').iter_rows()):
    for date in dates: 
        path = PATH / f'{package}#{release}#{date}'
        if not Path(f'{path}.tree').exists():
            jobs.append((f'{package}=={release}', date, path))

        path = PATH / f'{package}#latest#{date}'
        if not Path(f'{path}.tree').exists():
            jobs.append((package, date, path))

print(len(jobs), 'planned jobs')

tasks = joblib.Parallel(return_as='generator_unordered')(joblib.delayed(simulate_installation)(*job) for job in jobs)

1281it [00:01, 1182.41it/s]

92232 planned jobs





In [8]:
# Do the job
for _ in tqdm.tqdm(tasks, total=len(jobs)):
    pass

100%|█████████████████████████████████████| 92232/92232 [50:36<00:00, 30.37it/s]


Let's see how many simulations were performed correctly.

In [9]:
results = []
for filename in PATH.iterdir():
    if filename.name.startswith('.'):
        continue
        
    package, release, _ = filename.name.split('#')
    date, result = _.split('.')
    results.append((package, release, date, result))

results = pl.from_records(results, schema=['package', 'simulation', 'date', 'result'], orient='row')

In [10]:
# Here, "simulations" should be 72 per package
print(results.n_unique('package'), 'packages and', len(results), 'simulations')
print('success for', results.filter(pl.col('result') == pl.lit('tree')).n_unique('package'), 'packages and', len(results.filter(pl.col('result') == pl.lit('tree'))), 'simulations')
print('failure for', results.filter(pl.col('result') != pl.lit('tree')).n_unique('package'), 'packages and', len(results.filter(pl.col('result') != pl.lit('tree'))), 'simulations')

_ = results.group_by('package').agg((pl.col('result') == pl.lit('tree')).all(), pl.len().alias('simulations')).filter('result')
print('full success for', _.n_unique('package'), 'packages and', _.select('simulations').sum().item(), 'simulations')

1281 packages and 92232 simulations
success for 1281 packages and 91312 simulations
failure for 30 packages and 920 simulations
full success for 1251 packages and 90072 simulations
