This notebook computes several diffs between simulations. Only unique dependencies are considered (i.e., if a package requires another one more than once, only the dependency at the lowest depth is considered). The diffs identify added, removed, common and updated dependencies (1) between consecutive simulations (i.e., through time); (2) between a simulation and the initial simulation (i.e., accumulated delta); (3) between a "latest" release and the "selected" release for each simulation.

In [1]:
import polars as pl

In [2]:
df_all_deps = pl.read_parquet('../data/dependencies.parquet')

# A "simulation" is one (package, release, date) combination; count them once and reuse.
n_all_simulations = df_all_deps.n_unique(['package', 'release', 'date'])

# Double-quoted f-strings: reusing the enclosing quote character inside a replacement
# field is only valid from Python 3.12 onwards (SyntaxError on earlier versions).
print(
    f"{df_all_deps.n_unique('package')} packages, "
    f"{n_all_simulations} simulations and "
    f"{len(df_all_deps)} dependencies "
    f"(avg: {len(df_all_deps) / n_all_simulations:.2f})"
)
# Fixed seed so that the preview is the same after "Restart & Run All".
df_all_deps.sample(10, seed=0)

3852 packages, 277344 simulations and 11465904 dependencies (avg: 41.34)


package,release,date,path,version,size
str,enum,date,list[str],str,"decimal[38,2]"
"""titiler-application""","""selected""",2025-10-01,"[""titiler-application"", ""titiler-mosaic"", … ""annotated-types""]","""0.7.0""",13.3
"""aws-sam-cli""","""latest""",2023-12-01,"[""aws-sam-cli"", ""rich"", ""markdown-it-py""]","""3.0.0""",85.5
"""qcodes""","""latest""",2025-07-01,"[""qcodes"", ""ipywidgets"", … ""jedi""]","""0.19.2""",1536.0
"""seleniumbase""","""selected""",2024-06-01,"[""seleniumbase"", ""pytest-html"", … ""pluggy""]","""1.0.0""",13.3
"""titiler-mosaic""","""selected""",2024-06-01,"[""titiler-mosaic"", ""titiler-core"", … ""jmespath""]","""1.0.1""",19.8
"""hdijupyterutils""","""latest""",2025-05-01,"[""hdijupyterutils"", ""jupyter"", … ""referencing""]","""0.36.2""",26.1
"""neutron-lib""","""selected""",2023-05-01,"[""neutron-lib"", ""oslo-messaging"", … ""urllib3""]","""1.26.15""",137.6
"""lumigo-opentelemetry""","""selected""",2024-05-01,"[""lumigo-opentelemetry"", ""opentelemetry-instrumentation-mysql"", … ""opentelemetry-api""]","""1.9.1""",46.7
"""miniwdl""","""latest""",2024-11-01,"[""miniwdl"", ""importlib-metadata""]","""8.5.0""",25.9
"""octavia""","""latest""",2024-08-01,"[""octavia"", ""oslo-messaging"", ""webob""]","""1.8.7""",112.3


Be careful, some packages have their `package` field **distinct from** the first component of their `path` field!

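Hereafter, we slightly simplify the dataset by keeping only *unique* dependencies: when a dependency appears several times in a simulation, only its least nested occurrence (i.e., the one with the lowest depth) is kept.  
Note that `depth` starts from `0`. A depth of 0 *always* corresponds to the current package.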

In [3]:
df_deps = (
    df_all_deps.lazy()
    # Derive once the dependency name (last component of `path`) and its nesting
    # depth (0 = the package itself), so that later steps can refer to them by name.
    .with_columns(
        name=pl.col('path').list.get(-1),
        depth=pl.col('path').list.len() - 1,
    )
    # Remove duplicated dependencies, keep the one with the lowest depth
    .group_by('package', 'release', 'date', 'name')
    .agg(
        pl.all().sort_by(pl.col('depth')).first(),
    )
    # The depth-0 row is the package itself: broadcast its version to every row of
    # the same simulation as `release_name`.
    .with_columns(
        release_name=pl.col('version')
        .filter(pl.col('depth') == pl.lit(0))
        .first()
        .over('package', 'release', 'date'),
    )
    # `path` is no longer needed once `name` and `depth` have been derived from it.
    .select('package', 'release', 'date', 'release_name', 'name', 'version', 'depth', 'size')
    .sort('package', 'release', 'date', 'name')
    .collect()
)

df_deps.write_parquet('../data/dependencies_unique.parquet')

# A "simulation" is one (package, release, date) combination; count them once and reuse.
n_simulations = df_deps.n_unique(['package', 'release', 'date'])

# Double-quoted f-strings: reusing the enclosing quote character inside a replacement
# field is only valid from Python 3.12 onwards (SyntaxError on earlier versions).
print(
    f"{df_deps.n_unique('package')} packages, "
    f"{n_simulations} simulations and "
    f"{len(df_deps)} dependencies "
    f"(avg: {len(df_deps) / n_simulations:.2f})"
)
# Fixed seed so that the preview is the same after "Restart & Run All".
df_deps.sample(10, seed=0)

3852 packages, 277344 simulations and 3767879 dependencies (avg: 13.59)


package,release,date,release_name,name,version,depth,size
str,enum,date,str,str,str,u32,"decimal[38,2]"
"""fugue""","""latest""",2025-07-01,"""0.9.1""","""pyarrow""","""20.0.0""",2,30105.6
"""celery""","""selected""",2025-07-01,"""5.2.7""","""kombu""","""5.5.4""",1,205.1
"""pynautobot""","""latest""",2025-10-01,"""2.6.6""","""charset-normalizer""","""3.4.3""",2,199.7
"""hana-ml""","""selected""",2025-09-01,"""2.15.22122300""","""hdbcli""","""2.25.31""",1,5529.6
"""pulpcore""","""latest""",2025-04-01,"""3.74.1""","""whitenoise""","""6.9.0""",1,19.7
"""adlfs""","""selected""",2023-07-01,"""2022.11.2""","""azure-core""","""1.27.1""",1,170.5
"""mlserver-mlflow""","""selected""",2025-02-01,"""1.2.1""","""asgiref""","""3.8.1""",4,23.3
"""smartystreets-python-sdk""","""latest""",2024-11-01,"""4.16.1""","""smartystreets-python-sdk""","""4.16.1""",0,
"""rio-tiler""","""selected""",2025-11-01,"""4.1.5""","""numpy""","""2.3.4""",1,20787.2
"""dagster-azure""","""latest""",2023-06-01,"""0.19.7""","""pycparser""","""2.21""",4,115.9


In [4]:
df_diffs = (
    df_deps
    # Attach to every row the dates of the previous and next snapshots.
    # The calendar is built from the distinct dates only, instead of de-duplicating
    # whole rows of the (large) dependency frame.
    .pipe(lambda df: df.join(
        other=df.select('date').unique().sort('date').select(
            pl.col('date'),
            pl.col('date').shift(1).alias('prev_date'),
            pl.col('date').shift(-1).alias('next_date'),
        ),
        on='date',
        how='left',
    ))
    # Identify added, removed, common and changed dependencies.
    # Self-join: the left side is the snapshot at `date`, the right side is the same
    # (package, release, name) in the snapshot at `prev_date`.
    .pipe(lambda df: df.join(
            other=df,
            left_on=['package', 'prev_date', 'release', 'name'],
            right_on=['package', 'date', 'release', 'name'],
            how='full',
            coalesce=True,
        )
        .select(
            'package',
            # Rows found only on the right side have no `date`: they are reported at the
            # snapshot that follows the one they come from.
            pl.coalesce('date', 'next_date_right'),
            'release', 'name',
            pl.coalesce('depth', 'depth_right'),
            pl.col('version'),
            pl.col('version_right').alias('other_version'),
            # `version` is null when the dependency only exists in the previous snapshot,
            # `version_right` is null when it only exists in the current one.
            status=pl.when(pl.col('version').is_null()).then(pl.lit('removed'))
                .when(pl.col('version_right').is_null()).then(pl.lit('added'))
                .when(pl.col('version') == pl.col('version_right')).then(pl.lit('common'))
                .otherwise(pl.lit('updated')).cast(pl.Enum(['common', 'updated', 'added', 'removed'])),
        )
    )
    # Remove first snapshot, since everything is "added" there!
    # Rows with a null date (dependencies of the last snapshot, which has no following
    # snapshot) are discarded here as well, because the comparison evaluates to null.
    .filter(pl.col('date') != pl.col('date').min())
    # Remove current package as a dependency
    .filter(pl.col('depth') > 0)
    .sort('package', 'release', 'date', 'name')
)
df_diffs

package,date,release,name,depth,version,other_version,status
str,date,enum,str,u32,str,str,enum
"""2captcha-python""",2023-02-01,"""selected""","""certifi""",2,"""2022.12.7""","""2022.12.7""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""charset-normalizer""",2,"""3.0.1""","""2.1.1""","""updated"""
"""2captcha-python""",2023-02-01,"""selected""","""idna""",2,"""3.4""","""3.4""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""requests""",1,"""2.28.2""","""2.28.1""","""updated"""
"""2captcha-python""",2023-02-01,"""selected""","""urllib3""",2,"""1.26.14""","""1.26.13""","""updated"""
…,…,…,…,…,…,…,…
"""zyte-api""",2025-12-01,"""latest""","""tenacity""",1,"""9.1.2""","""9.1.2""","""common"""
"""zyte-api""",2025-12-01,"""latest""","""tqdm""",1,"""4.67.1""","""4.67.1""","""common"""
"""zyte-api""",2025-12-01,"""latest""","""typing-extensions""",3,"""4.15.0""","""4.15.0""","""common"""
"""zyte-api""",2025-12-01,"""latest""","""w3lib""",1,"""2.3.1""","""2.3.1""","""common"""


In [5]:
# Persist the diffs between consecutive simulations as a Parquet file.
df_diffs.write_parquet('../data/deps_diffs.parquet')

Now we do the same, but this time, we compare the "current" simulation with the very first one.

In [6]:
df_diffs_initial = (
    df_deps
    # Attach to every row the dates of the previous and next snapshots.
    # The calendar is built from the distinct dates only, instead of de-duplicating
    # whole rows of the (large) dependency frame.
    .pipe(lambda df: df.join(
        other=df.select('date').unique().sort('date').select(
            pl.col('date'),
            pl.col('date').shift(1).alias('prev_date'),
            pl.col('date').shift(-1).alias('next_date'),
        ),
        on='date',
        how='left',
    ))
    # Identify added, removed, common and changed dependencies w.r.t. the first snapshot.
    .pipe(lambda df: df.join(
            other=(
                # Let's pretend that all simulations correspond to the first one:
                # the first snapshot is replicated once per date of the calendar, so
                # that it can play the role of the "previous" snapshot of every date.
                df
                .filter(pl.col('date') == pl.col('date').min())
                .select(pl.exclude('date', 'prev_date', 'next_date'))
                .join(
                    other=df.select('date').unique().sort('date').select(
                        pl.col('date'),
                        pl.col('date').shift(1).alias('prev_date'),
                        pl.col('date').shift(-1).alias('next_date'),
                    ),
                    how='cross',
                )
            ),
            left_on=['package', 'prev_date', 'release', 'name'],
            right_on=['package', 'date', 'release', 'name'],
            how='full',
            coalesce=True,
        )
        .select(
            'package',
            # Rows found only on the right side have no `date`: they are reported at the
            # date that follows the one their replica was assigned to.
            pl.coalesce('date', 'next_date_right'),
            'release', 'name',
            pl.coalesce('depth', 'depth_right'),
            pl.col('version'),
            pl.col('version_right').alias('other_version'),
            # `version` is null when the dependency only exists in the first snapshot,
            # `version_right` is null when it only exists in the current one.
            status=pl.when(pl.col('version').is_null()).then(pl.lit('removed'))
                .when(pl.col('version_right').is_null()).then(pl.lit('added'))
                .when(pl.col('version') == pl.col('version_right')).then(pl.lit('common'))
                .otherwise(pl.lit('updated')).cast(pl.Enum(['common', 'updated', 'added', 'removed'])),
        )
    )
    # Remove first snapshot, since everything is "added" there!
    # Rows with a null date (replicas assigned to the last date, which has no following
    # date) are discarded here as well, because the comparison evaluates to null.
    .filter(pl.col('date') != pl.col('date').min())
    # Remove current package as a dependency
    .filter(pl.col('depth') > 0)
    .sort('package', 'release', 'date', 'name')
)
df_diffs_initial

package,date,release,name,depth,version,other_version,status
str,date,enum,str,u32,str,str,enum
"""2captcha-python""",2023-02-01,"""selected""","""certifi""",2,"""2022.12.7""","""2022.12.7""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""charset-normalizer""",2,"""3.0.1""","""2.1.1""","""updated"""
"""2captcha-python""",2023-02-01,"""selected""","""idna""",2,"""3.4""","""3.4""","""common"""
"""2captcha-python""",2023-02-01,"""selected""","""requests""",1,"""2.28.2""","""2.28.1""","""updated"""
"""2captcha-python""",2023-02-01,"""selected""","""urllib3""",2,"""1.26.14""","""1.26.13""","""updated"""
…,…,…,…,…,…,…,…
"""zyte-api""",2025-12-01,"""latest""","""tqdm""",1,"""4.67.1""","""4.64.1""","""updated"""
"""zyte-api""",2025-12-01,"""latest""","""typing-extensions""",3,"""4.15.0""",,"""added"""
"""zyte-api""",2025-12-01,"""latest""","""urllib3""",2,,"""1.26.13""","""removed"""
"""zyte-api""",2025-12-01,"""latest""","""w3lib""",1,"""2.3.1""","""2.1.1""","""updated"""


In [7]:
# Persist the diffs against the first simulation as a Parquet file.
df_diffs_initial.write_parquet('../data/deps_diffs_origin.parquet')

"More of the same": let's do this diff again, but this time we'll compare each `latest` release to its corresponding `selected` release.

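In the resulting dataframe, there is no `release` column: **it should be assumed that `release` is always equal to `latest`**, i.e., `version` refers to the `latest` release and `other_version` to the `selected` release. 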

In [8]:
df_diffs_latest = (
    # Left side: dependencies of the "latest" release of each simulation...
    df_deps.filter(pl.col('release').eq('latest'))
    # ... matched by (package, date, name) with those of the "selected" release.
    .join(
        other=df_deps.filter(pl.col('release').eq('selected')),
        on=['package', 'date', 'name'],
        how='full',
        coalesce=True,
    )
    .select(
        'package',
        'date',
        'name',
        depth=pl.coalesce('depth', 'depth_right'),
        version=pl.col('version'),
        other_version=pl.col('version_right'),
        # A null `version` means the dependency is only present in "selected";
        # a null `version_right` means it is only present in "latest".
        status=(
            pl.when(pl.col('version').is_null())
            .then(pl.lit('removed'))
            .when(pl.col('version_right').is_null())
            .then(pl.lit('added'))
            .when(pl.col('version').eq(pl.col('version_right')))
            .then(pl.lit('common'))
            .otherwise(pl.lit('updated'))
            .cast(pl.Enum(['common', 'updated', 'added', 'removed']))
        ),
    )
    # Depth 0 is the package itself, which is not a dependency.
    .filter(pl.col('depth').gt(0))
    .sort(['package', 'date', 'name'])
)
df_diffs_latest

package,date,name,depth,version,other_version,status
str,date,str,u32,str,str,enum
"""2captcha-python""",2023-01-01,"""certifi""",2,"""2022.12.7""","""2022.12.7""","""common"""
"""2captcha-python""",2023-01-01,"""charset-normalizer""",2,"""2.1.1""","""2.1.1""","""common"""
"""2captcha-python""",2023-01-01,"""idna""",2,"""3.4""","""3.4""","""common"""
"""2captcha-python""",2023-01-01,"""requests""",1,"""2.28.1""","""2.28.1""","""common"""
"""2captcha-python""",2023-01-01,"""urllib3""",2,"""1.26.13""","""1.26.13""","""common"""
…,…,…,…,…,…,…
"""zyte-api""",2025-12-01,"""tqdm""",1,"""4.67.1""","""4.67.1""","""common"""
"""zyte-api""",2025-12-01,"""typing-extensions""",3,"""4.15.0""","""4.15.0""","""common"""
"""zyte-api""",2025-12-01,"""urllib3""",2,,"""2.5.0""","""removed"""
"""zyte-api""",2025-12-01,"""w3lib""",1,"""2.3.1""","""2.3.1""","""common"""


In [9]:
# Persist the diffs between the "latest" and "selected" releases as a Parquet file.
df_diffs_latest.write_parquet('../data/deps_diffs_latest.parquet')

Let's do some checks... (Only one example is shown in this notebook, but we tried many others!)

In [10]:
# Candidate dependencies that exhibited all 4 statuses (common, updated, added, removed)
# over time. After the aggregation, `status` holds the number of distinct statuses.
(
    df_diffs
    .group_by(['package', 'release', 'name'])
    .agg(pl.col('status').n_unique())
    .filter(pl.col('status').eq(4))
)

package,release,name,status
str,enum,str,u32
"""django-app-helper""","""selected""","""typing-extensions""",4
"""types-aiobotocore-discovery""","""latest""","""typing-extensions""",4
"""etos-test-runner""","""latest""","""deprecated""",4
"""dagster-cloud-cli""","""latest""","""tqdm""",4
"""fastapi-injector""","""selected""","""email-validator""",4
…,…,…,…
"""yamlfix""","""selected""","""annotated-types""",4
"""rasa-sdk""","""latest""","""opentelemetry-exporter-jaeger""",4
"""great-expectations-experimenta…","""selected""","""jupyter-lsp""",4
"""subliminal""","""latest""","""setuptools""",4


In [11]:
# Example used by the following checks: one dependency of one release of one package.
package, release, name = ('pytest-mypy-plugins', 'latest', 'attrs')

# Version of that dependency in each simulation where it is present.
(
    df_deps
    .filter(package=package, release=release, name=name)
    .select('date', 'version')
    .sort('date')
    .rows()
)

[(datetime.date(2023, 1, 1), '22.2.0'),
 (datetime.date(2023, 2, 1), '22.2.0'),
 (datetime.date(2023, 3, 1), '22.2.0'),
 (datetime.date(2023, 4, 1), '22.2.0'),
 (datetime.date(2024, 3, 1), '23.2.0'),
 (datetime.date(2024, 4, 1), '23.2.0'),
 (datetime.date(2024, 5, 1), '23.2.0'),
 (datetime.date(2024, 6, 1), '23.2.0'),
 (datetime.date(2024, 7, 1), '23.2.0'),
 (datetime.date(2024, 8, 1), '23.2.0'),
 (datetime.date(2024, 9, 1), '24.2.0'),
 (datetime.date(2024, 10, 1), '24.2.0'),
 (datetime.date(2024, 11, 1), '24.2.0'),
 (datetime.date(2024, 12, 1), '24.2.0'),
 (datetime.date(2025, 1, 1), '24.3.0'),
 (datetime.date(2025, 2, 1), '25.1.0'),
 (datetime.date(2025, 3, 1), '25.1.0'),
 (datetime.date(2025, 4, 1), '25.3.0'),
 (datetime.date(2025, 5, 1), '25.3.0'),
 (datetime.date(2025, 6, 1), '25.3.0'),
 (datetime.date(2025, 7, 1), '25.3.0'),
 (datetime.date(2025, 8, 1), '25.3.0'),
 (datetime.date(2025, 9, 1), '25.3.0'),
 (datetime.date(2025, 10, 1), '25.3.0'),
 (datetime.date(2025, 11, 1), '25.4.

In [12]:
# Same dependency, as seen in the diffs between consecutive simulations.
(
    df_diffs
    .filter(package=package, release=release, name=name)
    .select('date', 'version', 'other_version', 'status')
    .rows()
)

[(datetime.date(2023, 2, 1), '22.2.0', '22.2.0', 'common'),
 (datetime.date(2023, 3, 1), '22.2.0', '22.2.0', 'common'),
 (datetime.date(2023, 4, 1), '22.2.0', '22.2.0', 'common'),
 (datetime.date(2023, 5, 1), None, '22.2.0', 'removed'),
 (datetime.date(2024, 3, 1), '23.2.0', None, 'added'),
 (datetime.date(2024, 4, 1), '23.2.0', '23.2.0', 'common'),
 (datetime.date(2024, 5, 1), '23.2.0', '23.2.0', 'common'),
 (datetime.date(2024, 6, 1), '23.2.0', '23.2.0', 'common'),
 (datetime.date(2024, 7, 1), '23.2.0', '23.2.0', 'common'),
 (datetime.date(2024, 8, 1), '23.2.0', '23.2.0', 'common'),
 (datetime.date(2024, 9, 1), '24.2.0', '23.2.0', 'updated'),
 (datetime.date(2024, 10, 1), '24.2.0', '24.2.0', 'common'),
 (datetime.date(2024, 11, 1), '24.2.0', '24.2.0', 'common'),
 (datetime.date(2024, 12, 1), '24.2.0', '24.2.0', 'common'),
 (datetime.date(2025, 1, 1), '24.3.0', '24.2.0', 'updated'),
 (datetime.date(2025, 2, 1), '25.1.0', '24.3.0', 'updated'),
 (datetime.date(2025, 3, 1), '25.1.0', '25

In [13]:
# Same dependency, as seen in the diffs against the first simulation.
(
    df_diffs_initial
    .filter(package=package, release=release, name=name)
    .select('date', 'version', 'other_version', 'status')
    .rows()
)

[(datetime.date(2023, 2, 1), '22.2.0', '22.2.0', 'common'),
 (datetime.date(2023, 3, 1), '22.2.0', '22.2.0', 'common'),
 (datetime.date(2023, 4, 1), '22.2.0', '22.2.0', 'common'),
 (datetime.date(2023, 5, 1), None, '22.2.0', 'removed'),
 (datetime.date(2023, 6, 1), None, '22.2.0', 'removed'),
 (datetime.date(2023, 7, 1), None, '22.2.0', 'removed'),
 (datetime.date(2023, 8, 1), None, '22.2.0', 'removed'),
 (datetime.date(2023, 9, 1), None, '22.2.0', 'removed'),
 (datetime.date(2023, 10, 1), None, '22.2.0', 'removed'),
 (datetime.date(2023, 11, 1), None, '22.2.0', 'removed'),
 (datetime.date(2023, 12, 1), None, '22.2.0', 'removed'),
 (datetime.date(2024, 1, 1), None, '22.2.0', 'removed'),
 (datetime.date(2024, 2, 1), None, '22.2.0', 'removed'),
 (datetime.date(2024, 3, 1), '23.2.0', '22.2.0', 'updated'),
 (datetime.date(2024, 4, 1), '23.2.0', '22.2.0', 'updated'),
 (datetime.date(2024, 5, 1), '23.2.0', '22.2.0', 'updated'),
 (datetime.date(2024, 6, 1), '23.2.0', '22.2.0', 'updated'),
 (d

In [14]:
# All dependencies of the package (both releases) at the most recent date of the dataset.
display(
    list(
        df_deps
        .filter(package=package, date=pl.col('date').max())
        # Every remaining row has the same date, so sorting by date cannot order them;
        # sort by release and name to make the displayed order explicit.
        .sort('release', 'name')
        .select('date', 'release', 'name', 'version')
        .iter_rows()
    )
)

[(datetime.date(2025, 12, 1), 'selected', 'chevron', '0.14.0'),
 (datetime.date(2025, 12, 1), 'selected', 'decorator', '5.2.1'),
 (datetime.date(2025, 12, 1), 'selected', 'iniconfig', '2.3.0'),
 (datetime.date(2025, 12, 1), 'selected', 'librt', '0.6.3'),
 (datetime.date(2025, 12, 1), 'selected', 'mypy', '1.19.0'),
 (datetime.date(2025, 12, 1), 'selected', 'mypy-extensions', '1.1.0'),
 (datetime.date(2025, 12, 1), 'selected', 'packaging', '25.0'),
 (datetime.date(2025, 12, 1), 'selected', 'pathspec', '0.12.1'),
 (datetime.date(2025, 12, 1), 'selected', 'pluggy', '1.6.0'),
 (datetime.date(2025, 12, 1), 'selected', 'pygments', '2.19.2'),
 (datetime.date(2025, 12, 1), 'selected', 'pytest', '9.0.1'),
 (datetime.date(2025, 12, 1), 'selected', 'pytest-mypy-plugins', '1.10.1'),
 (datetime.date(2025, 12, 1), 'selected', 'pyyaml', '6.0.3'),
 (datetime.date(2025, 12, 1), 'selected', 'regex', '2025.11.3'),
 (datetime.date(2025, 12, 1), 'selected', 'typing-extensions', '4.15.0'),
 (datetime.date(20

In [15]:
# "latest" vs "selected" diff of the package at the most recent date, grouped by status.
(
    df_diffs_latest
    .filter(package=package, date=pl.col('date').max())
    .select('date', 'name', 'version', 'other_version', 'status')
    .sort('status')
    .rows()
)

[(datetime.date(2025, 12, 1), 'decorator', '5.2.1', '5.2.1', 'common'),
 (datetime.date(2025, 12, 1), 'iniconfig', '2.3.0', '2.3.0', 'common'),
 (datetime.date(2025, 12, 1), 'librt', '0.6.3', '0.6.3', 'common'),
 (datetime.date(2025, 12, 1), 'mypy', '1.19.0', '1.19.0', 'common'),
 (datetime.date(2025, 12, 1), 'mypy-extensions', '1.1.0', '1.1.0', 'common'),
 (datetime.date(2025, 12, 1), 'packaging', '25.0', '25.0', 'common'),
 (datetime.date(2025, 12, 1), 'pathspec', '0.12.1', '0.12.1', 'common'),
 (datetime.date(2025, 12, 1), 'pluggy', '1.6.0', '1.6.0', 'common'),
 (datetime.date(2025, 12, 1), 'pygments', '2.19.2', '2.19.2', 'common'),
 (datetime.date(2025, 12, 1), 'pytest', '9.0.1', '9.0.1', 'common'),
 (datetime.date(2025, 12, 1), 'pyyaml', '6.0.3', '6.0.3', 'common'),
 (datetime.date(2025, 12, 1), 'regex', '2025.11.3', '2025.11.3', 'common'),
 (datetime.date(2025, 12, 1),
  'typing-extensions',
  '4.15.0',
  '4.15.0',
  'common'),
 (datetime.date(2025, 12, 1), 'attrs', '25.4.0', Non