In this notebook, we go over all the dependency trees that were previously extracted (see *Extract dependency trees*): we parse them, run some sanity checks, and store the result in *../data/dependencies.parquet* (our future working file).

In [1]:
import polars as pl
import re
import tqdm 
import joblib

from pathlib import Path

In [3]:
# Directory containing the previously extracted tree files (one file per simulation)
PATH = Path('../data-raw/uv-tree/')

# Do not parse trees that have more (non-deduped) dependencies than:
# (compared against the number of lines of the tree file)
MAX_DEPS = 10_000

# Do not include packages having no dependency (only for "selected" releases)
# The parsed result also contains the package itself, so a tree with exactly
# MIN_DEPS entries is considered as having no dependency.
MIN_DEPS = 1

# Do not include dependency trees having a cycle
EXCLUDE_CYCLES = True

# Hardcoded number of expected simulations, to ease filtering
# This corresponds to 2 releases * 36 points in time per package.
SIMULATIONS = 72

# Matches one package line of a tree. Named groups:
#   prefix  - indentation made of spaces and box-drawing characters (its length gives the depth)
#   name    - package name
#   extra   - optional "[...]" suffix directly attached to the name
#   version - version string following the 'v'
#   size    - optional content of a parenthesised group following the version
# Notes on the pattern (re.VERBOSE ignores whitespace outside character classes):
#   - the space in "extra: " is ignored, but "(extra: x)" still matches because
#     [^)]+ also consumes the space;
#   - NOTE(review): a line such as "pkg v1.0 (*)" (footnote marker without a size)
#     would be captured with size == '*', since the size group accepts any
#     parenthesised text -- confirm such lines cannot occur in the input.
TREE_RE = re.compile(
    r"""
    ^(?P<prefix>[ \u2502\u251c\u2514\u2500]*)   # leading spaces and tree chars
    (?P<name>[A-Za-z0-9_\.\-]+)                 # package name
    (?P<extra>\[[^\]]+\])?                      # optional extra
    \s+v(?P<version>[^\s(]+)                    # version after 'v'
    (?:\s+\(extra: [^)]+\))?                    # optional extra
    (?:\s+\((?P<size>[^)]+)\))?                 # optional size in parentheses
    (\s+\(\*\))?                                # optional marker/footnote
    \s*$                                        # optional trailing space
    """,
    re.VERBOSE,
)

In [4]:
def parse_tree(tree):
    """Parse the text of a dependency tree into a flat list of nodes.

    The first line (the root of the tree) is skipped. Every other line is
    matched against ``TREE_RE`` and its depth is derived from the length of
    its indentation prefix (4 characters per level).

    Parameters
    ----------
    tree : str
        Full text of a dependency tree, one package per line.

    Returns
    -------
    list of (path, version, size) tuples, or None
        One tuple per package line, in the order of the input. ``path`` is the
        list of package names leading to (and including) the package,
        ``version`` and ``size`` are the strings captured by ``TREE_RE``
        (``size`` is None when absent). None is returned when the tree
        contains the "cycle" footnote and ``EXCLUDE_CYCLES`` is set.

    Raises
    ------
    IndexError
        If a package line has no indentation (package at top-level).
    ValueError
        If a line is not recognised by ``TREE_RE``.
    """
    nodes = []
    path = []

    # Skip first line as it is of no interest in our case
    for line in tree.splitlines()[1:]:
        if line == '(*) Package tree is a cycle and cannot be shown':
            if EXCLUDE_CYCLES:
                return None
            continue

        m = TREE_RE.match(line)
        if m is None:
            # Fail with the offending line rather than an opaque AttributeError
            raise ValueError(f'Unexpected line in dependency tree: {line!r}')

        depth = len(m.group('prefix')) // 4
        if depth == 0:
            # Callers rely on IndexError to detect packages listed at top-level
            raise IndexError(f'Package found at top-level: {line!r}')

        # Keep only the ancestors of the current package, then add it
        del path[depth - 1:]
        path.append(m.group('name'))

        # Copy the path, as it keeps being mutated while iterating
        nodes.append((list(path), m.group('version'), m.group('size')))

    return nodes

# Example: the first line (root) is skipped, then each package line yields
# a (path, version, size) tuple where path lists the package and its ancestors.
parse_tree("""\
root v0.1.0
└── pandas v2.3.3 (11.0MiB)
    ├── numpy v2.3.5 (16.2MiB)
    ├── python-dateutil v2.9.0.post0 (224.5KiB)
    │   └── six v1.17.0 (10.8KiB)
    ├── pytz v2025.2 (497.3KiB)
    └── tzdata v2025.3 (340.4KiB)
""")

[(['pandas'], '2.3.3', '11.0MiB'),
 (['pandas', 'numpy'], '2.3.5', '16.2MiB'),
 (['pandas', 'python-dateutil'], '2.9.0.post0', '224.5KiB'),
 (['pandas', 'python-dateutil', 'six'], '1.17.0', '10.8KiB'),
 (['pandas', 'pytz'], '2025.2', '497.3KiB'),
 (['pandas', 'tzdata'], '2025.3', '340.4KiB')]

In [5]:
def task(filename):
    """Read a tree file and parse it.

    Parameters
    ----------
    filename : path-like
        Path of the tree file to read.

    Returns
    -------
    tuple
        ``(filename, nodes)`` where ``nodes`` is the output of ``parse_tree``,
        or ``(filename, None)`` when the file has more than ``MAX_DEPS`` lines
        or when a package is found at top-level of the tree.
    """
    # Trees contain non-ASCII box-drawing characters: do not depend on the
    # platform's default encoding to decode them.
    with open(filename, encoding='utf-8') as fp:
        content = fp.read()

    if len(content.splitlines()) > MAX_DEPS:
        return filename, None

    try:
        return filename, parse_tree(content)
    except IndexError:
        # A package has been added at top-level, this shouldn't happen!!
        # See https://github.com/astral-sh/uv/issues/17160
        return filename, None

In [6]:
# Create tasks
# Files are expected to be named '<package>#<release>#<date>.<result>';
# only the 'tree' results are parsed.
jobs = []
for filename in PATH.iterdir():
    if filename.name.startswith('.'):
        # Ignore hidden files
        continue
    # Strict unpacking, so that an unexpectedly named file fails loudly here.
    # (A real name is used instead of `_`, which IPython reserves for the last output.)
    package, release, dated_result = filename.name.split('#')
    date, result = dated_result.split('.')
    if result == 'tree':
        jobs.append(filename)

print(len(jobs), 'planned jobs')

# Lazy generator: files are processed while iterating on `tasks`, in no particular order.
# NOTE(review): n_jobs is not set; per joblib's documentation this means a single
# worker unless a parallel_config context says otherwise -- confirm this is intended.
tasks = joblib.Parallel(return_as='generator_unordered')(joblib.delayed(task)(job) for job in jobs)

264820 planned jobs


In [7]:
# Flatten all parsed trees into rows of
# (package, release, date, path, version, size)
results = []

for filename, result in tqdm.tqdm(tasks, total=len(jobs)):
    if result is None:
        # Tree that was not parsed (see task)
        continue

    # A real name is used instead of `_`, which IPython reserves for the last output
    package, release, dated_result = filename.name.split('#')
    if len(result) <= MIN_DEPS and release != 'latest':  # result always include current package, hence <=
        continue

    date, _extension = dated_result.split('.')

    # Any release that is not 'latest' is labelled 'selected'
    release_kind = 'selected' if release != 'latest' else release
    results.extend((package, release_kind, date, *row) for row in result)

100%|█████████████████████████████████| 264820/264820 [03:01<00:00, 1456.95it/s]


In [8]:
# Size expressed in KiB: the numeric part of e.g. '11.0MiB' multiplied by a
# factor depending on its 3-character unit (KiB -> 1, MiB -> 1024).
# NOTE(review): replace_strict is expected to fail on any other unit, which acts
# as a sanity check on the input -- confirm against polars' documentation.
size_number = pl.col('size').str.head(-3).str.to_decimal(scale=2)
size_factor = pl.col('size').str.tail(3).replace_strict(['KiB', 'MiB'], [1, 1024])

df_deps = (
    pl.from_records(results, schema=['package', 'release', 'date', 'path', 'version', 'size'], orient='row')
    # Exclude packages not reaching the expected number of simulations
    .filter(
        SIMULATIONS == pl.struct('release', 'date').n_unique().over('package'),
    )
    # Convert to more appropriate dtypes
    .with_columns(
        pl.col('release').cast(pl.Enum(['selected', 'latest'])),
        pl.col('date').str.to_date(),
        size_number * size_factor,
    )
    # Sort, mostly for convenience
    .sort('package', 'release', 'date', pl.col('path').list.len())
)

# Summary figures, computed once (also avoids nesting quotes inside the f-string,
# which is only valid from Python 3.12 onwards)
n_packages = df_deps.n_unique('package')
n_simulations = df_deps.n_unique(['package', 'release', 'date'])
n_dependencies = len(df_deps)
# Guard against an empty frame
avg_dependencies = n_dependencies / n_simulations if n_simulations else float('nan')

print(f'{n_packages} packages, {n_simulations} simulations and {n_dependencies} dependencies (avg: {avg_dependencies:.2f})')
df_deps

2839 packages, 204408 simulations and 8177328 dependencies (avg: 40.00)


package,release,date,path,version,size
str,enum,date,list[str],str,"decimal[38,2]"
"""2captcha-python""","""selected""",2023-01-01,"[""2captcha-python""]","""1.1.3""",8.60
"""2captcha-python""","""selected""",2023-01-01,"[""2captcha-python"", ""requests""]","""2.32.5""",63.20
"""2captcha-python""","""selected""",2023-01-01,"[""2captcha-python"", ""requests"", ""urllib3""]","""2.6.2""",128.10
"""2captcha-python""","""selected""",2023-01-01,"[""2captcha-python"", ""requests"", ""idna""]","""3.11""",69.30
"""2captcha-python""","""selected""",2023-01-01,"[""2captcha-python"", ""requests"", ""charset-normalizer""]","""3.4.4""",202.10
…,…,…,…,…,…
"""zyte-api""","""latest""",2025-12-01,"[""zyte-api"", ""aiohttp"", ""aiohappyeyeballs""]","""2.6.1""",14.90
"""zyte-api""","""latest""",2025-12-01,"[""zyte-api"", ""aiohttp"", … ""multidict""]","""6.7.0""",74.80
"""zyte-api""","""latest""",2025-12-01,"[""zyte-api"", ""aiohttp"", … ""propcache""]","""0.4.1""",78.30
"""zyte-api""","""latest""",2025-12-01,"[""zyte-api"", ""aiohttp"", … ""frozenlist""]","""1.8.0""",84.90


In [9]:
# Store the parsed dependencies; this file is the working file of the subsequent analyses
df_deps.write_parquet('../data/dependencies.parquet')