This notebook aims to find good candidate packages whose maintainers we can ask about their perception of 0.y.z versus >=1.0.0 versions.

In [1]:
import pandas

In [3]:
# Package ecosystems to analyse; each name is also the prefix of the CSV
# files read from ../data/ by the loading cell.
ECOSYSTEMS = [
    'Cargo',
    'NPM',
    'Packagist',
    'Rubygems',
]

# Reference date from which the CREATED_SINCE / REACHED_SINCE windows of the
# selection cell are counted backwards.
CENSOR_DATE = pandas.to_datetime('2020-01-12')

In [4]:
# Per-ecosystem tables, keyed by ecosystem name:
#   df_releases[ecosystem]     -- rows of ../data/<ecosystem>-releases.csv.gz
#   df_dependencies[ecosystem] -- rows of ../data/<ecosystem>-dependencies.csv.gz
df_releases = dict()
df_dependencies = dict()

for ecosystem in ECOSYSTEMS:
    print('Loading', ecosystem)
    print('.. releases')
    df_releases[ecosystem] = (
        pandas.read_csv(
            '../data/{}-releases.csv.gz'.format(ecosystem),
            # `infer_datetime_format=True` is deliberately not passed: the
            # keyword is deprecated since pandas 2.0 (a strict version of it
            # is the default there) and only affected parsing speed before.
            parse_dates=['date'],
        )
        # Keep only releases dated 1990-01-01 or later.
        # NOTE(review): presumably this discards bogus/placeholder
        # timestamps in the raw data -- confirm against the dataset.
        [lambda d: d['date'] >= pandas.to_datetime('1990-01-01')]
    )

    print('.. dependencies')
    df_dependencies[ecosystem] = (
        pandas.read_csv(
            '../data/{}-dependencies.csv.gz'.format(ecosystem),
        )
    )
print('Done!')

Loading Cargo
.. releases
.. dependencies
Loading NPM
.. releases
.. dependencies
Loading Packagist
.. releases
.. dependencies
Loading Rubygems
.. releases
.. dependencies
Done!


## Selection criteria

In [56]:
def _candidate_table(ecosystem, releases, dependencies):
    """Build the per-package summary table for one ecosystem.

    Parameters
    ----------
    ecosystem : str
        Ecosystem name, stored verbatim in the ``ecosystem`` column.
    releases : pandas.DataFrame
        Release table; the columns ``package``, ``major`` and ``date`` are read.
    dependencies : pandas.DataFrame
        Dependency table; the columns ``source``, ``target`` and ``i_dev``
        are read.

    Returns
    -------
    pandas.DataFrame
        One row per package found in ``releases`` with the columns
        ``ecosystem, package, created, reached, pre1, post1, dependents,
        pre1deps, post1deps``. The three dependency-based columns are NaN for
        packages that have no matching row in ``dependencies`` (left merges).
    """
    # --- On releases -------------------------------------------------------
    release_stats = (
        releases
        .assign(pre1=lambda d: d['major'] == 0)
        .assign(post1=lambda d: d['major'] > 0)
        # Date of each 0.y.z release, missing for >=1.0.0 releases.
        # `Series.where` fills with NaN/NaT by default, so no explicit
        # `pandas.np.nan` is needed (`pandas.np` was removed in pandas 2.0).
        # NOTE(review): aggregated with 'max' below, so `reached` is the date
        # of the *latest 0.y.z release*, not of the first >=1.0.0 release --
        # confirm this proxy is the intended meaning of "reached".
        .assign(reached=lambda d: d['date'].where(d['pre1']))
        .assign(created=lambda d: d['date'])
        .groupby('package', sort=False, as_index=False)
        .agg({
            'pre1': 'sum',      # number of releases with major == 0
            'post1': 'sum',     # number of releases with major > 0
            'reached': 'max',   # latest release date with major == 0
            'created': 'min',   # earliest release date
        })
    )

    # --- On dependencies ---------------------------------------------------
    # One row per (source, target) pair, keeping the last occurrence. Computed
    # once and shared by both aggregations below.
    unique_edges = dependencies.drop_duplicates(['source', 'target'], keep='last')

    # Number of distinct packages that depend on each target.
    dependents = (
        unique_edges
        .groupby('target', sort=False, as_index=False)
        .agg({'source': 'count'})
        .rename(columns={'source': 'dependents'})
        [['target', 'dependents']]
    )

    # For each source package: how many of its distinct dependencies have
    # `i_dev` set (pre1deps) and how many have it unset (post1deps).
    # NOTE(review): assumes `i_dev` is a boolean column flagging constraints
    # on 0.y.z versions -- the schema is not visible here, confirm.
    dependency_kinds = (
        unique_edges
        .assign(pre1deps=lambda d: d['i_dev'])
        .assign(post1deps=lambda d: ~d['i_dev'])
        .groupby('source', as_index=False, sort=False)
        .agg({
            'pre1deps': 'sum',
            'post1deps': 'sum',
        })
        [['source', 'pre1deps', 'post1deps']]
    )

    return (
        release_stats
        .merge(dependents, how='left', left_on='package', right_on='target')
        .merge(dependency_kinds, how='left', left_on='package', right_on='source')
        .assign(ecosystem=ecosystem)
        [['ecosystem', 'package', 'created', 'reached', 'pre1', 'post1', 'dependents', 'pre1deps', 'post1deps',]]
    )


# One summary row per (ecosystem, package). The per-ecosystem tables are
# stacked as-is, so index labels repeat across ecosystems.
df_candidates = pandas.concat([
    _candidate_table(ecosystem, df_releases[ecosystem], df_dependencies[ecosystem])
    for ecosystem in ECOSYSTEMS
])

In [62]:
# Selection thresholds consumed by `mask`.

# Latest allowed date of a package's first release (compared with
# `created <= CREATED_SINCE`): 365 days before the censor date.
CREATED_SINCE = CENSOR_DATE - pandas.to_timedelta('365 days')

# (lower, upper) bounds for the `reached` date, passed to `Series.between`:
# from 364 down to 15 days before the censor date.
REACHED_SINCE = tuple(
    CENSOR_DATE - pandas.to_timedelta(offset)
    for offset in ('364 days', '15 days')
)

# Minimum number of releases with major == 0 / major > 0.
PRE1_RELEASES = 2
POST1_RELEASES = 2

# Minimum number of dependent packages.
DEPENDENTS = 5

# Minimum values for the `pre1deps` / `post1deps` dependency counts.
PRE1_DEPS = 1
POST1_DEPS = 1

def mask(d):
    """Return a boolean Series flagging the rows of ``d`` that are candidates.

    ``d`` must provide the columns ``dependents``, ``pre1deps``, ``post1deps``,
    ``created``, ``reached``, ``pre1`` and ``post1``. A row is selected only
    when every threshold defined in this cell is met; rows holding a missing
    value in any compared column are not selected, since comparisons with
    NaN/NaT evaluate to False.
    """
    enough_dependents = d['dependents'] >= DEPENDENTS
    enough_pre1_deps = d['pre1deps'] >= PRE1_DEPS
    enough_post1_deps = d['post1deps'] >= POST1_DEPS
    old_enough = d['created'] <= CREATED_SINCE
    reached_in_window = d['reached'].between(*REACHED_SINCE)
    enough_pre1_releases = d['pre1'] >= PRE1_RELEASES
    enough_post1_releases = d['post1'] >= POST1_RELEASES

    return (
        enough_dependents
        & enough_pre1_deps
        & enough_post1_deps
        & old_enough
        & reached_in_window
        & enough_pre1_releases
        & enough_post1_releases
    )

# Report, per ecosystem, how many packages satisfy `mask`.
for ecosystem in ECOSYSTEMS:
    # A named frame is used instead of `_`: assigning to `_` would shadow
    # IPython's "last output" variable for the rest of the session.
    df_ecosystem = df_candidates[lambda d: d['ecosystem'] == ecosystem]
    n_packages = len(df_ecosystem)
    # Evaluate the selection once and reuse the count for both printed fields.
    n_selected = len(df_ecosystem[mask])
    # Guard against an ecosystem without any package (avoids ZeroDivisionError).
    share = n_selected / n_packages if n_packages else 0.0
    print(ecosystem, 'has', n_selected, 'candidates out of', n_packages, '({:.2%})'.format(share))

Cargo has 9 candidates out of 34769 (0.03%)
NPM has 309 candidates out of 1217677 (0.03%)
Packagist has 9 candidates out of 180093 (0.00%)
Rubygems has 8 candidates out of 154997 (0.01%)
