This notebook computes several diffs between simulations. Only unique dependencies are considered (i.e., if a package requires another one more than once, only the dependency at the lowest depth is considered). The diffs identify added, removed, common and updated dependencies (1) between consecutive simulations (i.e., through time); (2) between a simulation and the initial simulation (i.e., accumulated delta); (3) between a "latest" release and the "selected" release for each simulation.

In [15]:
import polars as pl

In [16]:
df_all_deps = pl.read_parquet('../data/dependencies.parquet')

print(f'{df_all_deps.n_unique('package')} packages, {df_all_deps.n_unique(['package', 'release', 'date'])} simulations and {len(df_all_deps)} dependencies (avg: {len(df_all_deps) / df_all_deps.n_unique(['package', 'release', 'date']):.2f})')
df_all_deps.sample(10)

3852 packages, 277344 simulations and 11465904 dependencies (avg: 41.34)


package,release,date,path,version,size
str,enum,date,list[str],str,"decimal[38,2]"
"""dbt-redshift""","""latest""",2023-04-01,"[""dbt-redshift"", ""boto3"", … ""botocore""]","""1.29.104""",10342.4
"""nuclio-jupyter""","""selected""",2025-07-01,"[""nuclio-jupyter"", ""notebook"", … ""jupyter-client""]","""8.6.3""",103.6
"""ara""","""selected""",2025-04-01,"[""ara"", ""requests"", ""idna""]","""3.10""",68.8
"""octavia""","""selected""",2024-05-01,"[""octavia"", ""python-barbicanclient"", … ""prettytable""]","""3.10.0""",27.5
"""json-schema-for-humans""","""latest""",2025-02-01,"[""json-schema-for-humans"", ""dataclasses-json"", … ""mypy-extensions""]","""1.0.0""",4.6
"""nbformat""","""latest""",2024-09-01,"[""nbformat"", ""jsonschema"", … ""rpds-py""]","""0.20.0""",310.9
"""zope.i18n""","""selected""",2023-03-01,"[""zope-i18n"", ""zope-schema"", ""setuptools""]","""67.4.0""",1024.0
"""azureml-designer-serving""","""latest""",2025-07-01,"[""azureml-designer-serving"", ""azureml-defaults"", … ""pysocks""]","""1.7.1""",16.3
"""jupyterlab-git""","""selected""",2024-01-01,"[""jupyterlab-git"", ""jupyter-server"", ""anyio""]","""4.2.0""",83.5
"""aws-solutions-constructs.core""","""latest""",2024-07-01,"[""aws-solutions-constructs-core"", ""aws-cdk-integ-tests-alpha"", … ""six""]","""1.16.0""",10.8


Be careful, some packages have their `package` field **distinct from** the first component of their `path` field!

Hereafter, we slightly simplify the dataset, focusing on *unique* dependencies (taking the less nested ones first). 
Note that `depth` starts from `0`. A depth of 0 *always* corresponds to the current package.

In [19]:
df_deps = (
    df_all_deps.lazy()
    # Cache some computation to ease writing polars code
    .with_columns(
        name=pl.col('path').list.get(-1),
        depth=pl.col('path').list.len() - 1,
    )
    # Remove duplicated dependencies, keep the one with the lowest depth
    .group_by('package', 'release', 'date', 'name')
    .agg(
        pl.all().sort_by(pl.col('depth')).first(),
    )
    .select('package', 'date', 'release', pl.col('version').filter(pl.col('depth') == pl.lit(0)).first().over('package', 'release', 'date').alias('release_name'), 'name', 'version', 'depth', 'size')
    .collect()
)

df_deps.write_parquet('../data/dependencies_unique.parquet')

print(f'{df_deps.n_unique('package')} packages, {df_deps.n_unique(['package', 'release', 'date'])} simulations and {len(df_deps)} dependencies (avg: {len(df_deps) / df_deps.n_unique(['package', 'release', 'date']):.2f})')
df_deps.sample(10)

3852 packages, 277344 simulations and 3767879 dependencies (avg: 13.59)


package,date,release,release_name,name,version,depth,size
str,date,enum,str,str,str,u32,"decimal[38,2]"
"""mkdocs-material""",2024-10-01,"""selected""","""8.5.11""","""jinja2""","""3.1.4""",1,130.1
"""evo""",2023-11-01,"""latest""","""1.25.1""","""contourpy""","""1.1.1""",2,242.8
"""jupyterlab-code-formatter""",2023-10-01,"""selected""","""1.5.3""","""jupyter-server-fileid""","""0.9.0""",3,15.3
"""DataRecorder""",2023-10-01,"""latest""","""3.4.8""","""datarecorder""","""3.4.8""",0,33.2
"""azureml-core""",2025-07-01,"""selected""","""1.48.0""","""pytz""","""2025.2""",1,497.3
"""types-aiobotocore-braket""",2023-03-01,"""latest""","""2.4.2.post2""","""types-aiobotocore-braket""","""2.4.2.post2""",0,25.1
"""dagster-cloud-cli""",2023-08-01,"""latest""","""1.4.3""","""packaging""","""23.1""",1,47.8
"""giddy""",2024-04-01,"""selected""","""2.3.3""","""libpysal""","""4.10""",1,2764.8
"""lalsuite""",2024-02-01,"""selected""","""7.11""","""astropy-iers-data""","""0.2024.1.29.0.30.37""",2,1843.2
"""scatterd""",2025-02-01,"""latest""","""1.3.7""","""tzdata""","""2025.1""",3,338.6


In [24]:
df_diffs = (
    df_deps
    # Get previous simulation
    .pipe(lambda df: df.join(
        other=df.unique('date').sort('date').select(pl.col('date'), pl.col('date').shift(1).alias('prev_date')),
        on='date', 
        how='left', 
    ))
    # Identify added, removed, common and changed dependencies
    .pipe(lambda df: df.join(
            other=df,
            left_on=['package', 'prev_date', 'release', 'name'],
            right_on=['package', 'date', 'release', 'name'],
            how='full',
        )
        .select(
            *[pl.coalesce(col, f'{col}_right') for col in ['package', 'date', 'release', 'name', 'depth']],
            pl.col('version'),
            pl.col('version_right').alias('other_version'),
            status=pl.when(pl.col('version').is_null()).then(pl.lit('removed'))
                .when(pl.col('version_right').is_null()).then(pl.lit('added'))
                .when(pl.col('version') == pl.col('version_right')).then(pl.lit('common'))
                .otherwise(pl.lit('updated')).cast(pl.Enum(['common', 'updated', 'added', 'removed'])),
        )  
    )
    .sort('package', 'release', 'date', 'depth', 'status')
    # Remove first snapshot, since everything is "added" there!
    .filter(pl.col('date') != pl.col('date').min())
)
df_diffs

package,date,release,name,depth,version,other_version,status
str,date,enum,str,u32,str,str,enum
"""2captcha-python""",2023-02-01,"""selected""","""2captcha-python""",0,"""1.1.3""","""1.1.3""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""requests""",1,"""2.28.2""","""2.28.1""","""updated"""
"""2captcha-python""",2023-02-01,"""selected""","""idna""",2,"""3.4""","""3.4""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""certifi""",2,"""2022.12.7""","""2022.12.7""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""urllib3""",2,"""1.26.14""","""1.26.13""","""updated"""
…,…,…,…,…,…,…,…
"""zyte-api""",2025-12-01,"""latest""","""multidict""",2,,"""6.7.0""","""removed"""
"""zyte-api""",2025-12-01,"""latest""","""idna""",3,"""3.11""","""3.11""","""common"""
"""zyte-api""",2025-12-01,"""latest""","""typing-extensions""",3,"""4.15.0""","""4.15.0""","""common"""
"""zyte-api""",2025-12-01,"""latest""","""idna""",3,,"""3.11""","""removed"""


In [25]:
# TODO: Check accuracy

In [26]:
df_diffs.write_parquet('../data/deps_diffs.parquet')

Now we do the same, but this time, we compare the "current" simulation with the very first one. 

In [28]:
df_diffs_initial= (
    df_deps
    # Get date of first simulation
    .with_columns(
        prev_date=pl.col('date').min().over('package', 'release')
    )
    .join(
        other=df_deps,
        left_on=['package', 'prev_date', 'release', 'name'],
        right_on=['package', 'date', 'release', 'name'],
        how='full',
    )
    .select(
        *[pl.coalesce(col, f'{col}_right') for col in ['package', 'date', 'release', 'name', 'depth']],
        pl.col('version'),
        pl.col('version_right').alias('other_version'),
        status=pl.when(pl.col('version').is_null()).then(pl.lit('removed'))
            .when(pl.col('version_right').is_null()).then(pl.lit('added'))
            .when(pl.col('version') == pl.col('version_right')).then(pl.lit('common'))
            .otherwise(pl.lit('updated')).cast(pl.Enum(['common', 'updated', 'added', 'removed'])),
    )  
    .sort('package', 'release', 'date', 'depth', 'status')
    # Remove first snapshot, since everything is "common" there!
    .filter(pl.col('date') != pl.col('date').min())
)
df_diffs_initial

package,date,release,name,depth,version,other_version,status
str,date,enum,str,u32,str,str,enum
"""2captcha-python""",2023-02-01,"""selected""","""2captcha-python""",0,"""1.1.3""","""1.1.3""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""2captcha-python""",0,,"""1.1.3""","""removed"""
"""2captcha-python""",2023-02-01,"""selected""","""requests""",1,"""2.28.2""","""2.28.1""","""updated"""
"""2captcha-python""",2023-02-01,"""selected""","""requests""",1,,"""2.28.2""","""removed"""
"""2captcha-python""",2023-02-01,"""selected""","""idna""",2,"""3.4""","""3.4""","""common"""
…,…,…,…,…,…,…,…
"""zyte-api""",2025-12-01,"""latest""","""multidict""",2,,"""6.7.0""","""removed"""
"""zyte-api""",2025-12-01,"""latest""","""idna""",3,"""3.11""","""3.4""","""updated"""
"""zyte-api""",2025-12-01,"""latest""","""typing-extensions""",3,"""4.15.0""",,"""added"""
"""zyte-api""",2025-12-01,"""latest""","""idna""",3,,"""3.11""","""removed"""


In [32]:
df_diffs_initial.write_parquet('../data/deps_diffs_origin.parquet')

"More of the same": let's do this diff again, but this time we'll compare each `latest` release to its corresponding `selected` release.

In the resulting dataframe, **it should be assumed that the missing `release` column always equal to `latest`**. 

In [33]:
df_diffs_latest = (
    df_deps
    .filter(pl.col('release') == 'latest')
    .join(
        other=df_deps.filter(pl.col('release') == 'selected'),
        left_on=['package', 'date', 'name'],
        right_on=['package', 'date', 'name'],
        how='full',
    )
    .select(
        *[pl.coalesce(col, f'{col}_right') for col in ['package', 'date', 'name', 'depth']],
        pl.col('version'),
        pl.col('version_right').alias('other_version'),
        status=pl.when(pl.col('version').is_null()).then(pl.lit('removed'))
            .when(pl.col('version_right').is_null()).then(pl.lit('added'))
            .when(pl.col('version') == pl.col('version_right')).then(pl.lit('common'))
            .otherwise(pl.lit('updated')).cast(pl.Enum(['common', 'updated', 'added', 'removed'])),
    )  
    .sort('package', 'date', 'depth', 'status')
)
df_diffs_latest

package,date,name,depth,version,other_version,status
str,date,str,u32,str,str,enum
"""2captcha-python""",2023-01-01,"""2captcha-python""",0,"""1.1.3""","""1.1.3""","""common"""
"""2captcha-python""",2023-01-01,"""requests""",1,"""2.28.1""","""2.28.1""","""common"""
"""2captcha-python""",2023-01-01,"""urllib3""",2,"""1.26.13""","""1.26.13""","""common"""
"""2captcha-python""",2023-01-01,"""idna""",2,"""3.4""","""3.4""","""common"""
"""2captcha-python""",2023-01-01,"""certifi""",2,"""2022.12.7""","""2022.12.7""","""common"""
…,…,…,…,…,…,…
"""zyte-api""",2025-12-01,"""certifi""",2,,"""2025.11.12""","""removed"""
"""zyte-api""",2025-12-01,"""charset-normalizer""",2,,"""3.4.4""","""removed"""
"""zyte-api""",2025-12-01,"""urllib3""",2,,"""2.5.0""","""removed"""
"""zyte-api""",2025-12-01,"""typing-extensions""",3,"""4.15.0""","""4.15.0""","""common"""


In [34]:
df_diffs_latest.write_parquet('../data/deps_diffs_latest.parquet')