This notebook aims to find good candidate packages whose maintainers can be asked how they perceive 0.y.z versus >=1.0.0 releases.

In [34]:
import pandas

In [35]:
# Package ecosystems to analyse; each one has its own pair of CSV files
# (releases and dependencies) under ../data/.
ECOSYSTEMS = [
    'Cargo',
    'NPM',
    'Packagist',
    'Rubygems',
]

# Reference date from which every selection window (creation, activity,
# date of reaching 1.0.0) is computed further down in this notebook.
CENSOR_DATE = pandas.to_datetime('2020-01-12')

In [36]:
# Per-ecosystem frames, keyed by ecosystem name.
df_releases = {}
df_dependencies = {}

for ecosystem in ECOSYSTEMS:
    print('Loading', ecosystem)

    # Releases: parse the 'date' column, discard rows dated before 1990
    # (rows whose date is missing fail the comparison and are dropped too),
    # then order chronologically. Later cells rely on this ordering to pick
    # the last release of each package.
    # NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0
    # and only triggers a warning there; kept so older pandas behaves the same.
    print('.. releases')
    df_releases[ecosystem] = (
        pandas.read_csv(
            '../data/{}-releases.csv.gz'.format(ecosystem),
            parse_dates=['date'],
            infer_datetime_format=True,
        )
        .loc[lambda rel: rel['date'] >= pandas.to_datetime('1990-01-01')]
        .sort_values(by='date')
    )

    # Dependencies: loaded as-is, without filtering or sorting.
    print('.. dependencies')
    df_dependencies[ecosystem] = pandas.read_csv(
        '../data/{}-dependencies.csv.gz'.format(ecosystem),
    )

print('Done!')

Loading Cargo
.. releases
.. dependencies
Loading NPM
.. releases
.. dependencies
Loading Packagist
.. releases
.. dependencies
Loading Rubygems
.. releases
.. dependencies
Done!


## Selection criteria

In [37]:
# One row per package and ecosystem, with the metrics used for selection:
#   created       date of the package's earliest release
#   reached       date of its earliest release with major > 0 (NaT if none)
#   last_release  date of its most recent release
#   pre1 / post1  number of releases with major == 0 / major > 0
#   dependents    number of distinct packages depending on it
#   pre1deps      number of its distinct dependencies with `i_dev` set
#   post1deps     number of its distinct dependencies with `i_dev` unset
# NOTE(review): `i_dev` presumably flags a dependency on a 0.y.z version and is
# assumed to be a boolean column (required by `~`) - confirm against the dataset.
# The per-ecosystem frames are concatenated without resetting the index, so
# row labels repeat across ecosystems.
df_candidates = pandas.concat([
    (
        # On releases
        df_releases[ecosystem]
        .assign(pre1=lambda d: d['major'] == 0)
        .assign(post1=lambda d: d['major'] > 0)
        # Keep the date only for >=1.0.0 releases; the others become missing,
        # so the 'min' aggregation below yields the date of the earliest
        # >=1.0.0 release. `Series.where` already fills with a missing value
        # by default, which avoids the `pandas.np` alias (removed in pandas 2.0).
        .assign(reached=lambda d: d['date'].where(d['post1']))
        .assign(created=lambda d: d['date'])
        .groupby('package', sort=False, as_index=False)
        .agg({
            'pre1': 'sum',      # booleans summed -> number of 0.y.z releases
            'post1': 'sum',     # booleans summed -> number of >=1.0.0 releases
            'reached': 'min',
            'created': 'min',
        })
        # Last release: df_releases is sorted by date when loaded, so keeping
        # the last row of each package gives its most recent release.
        .merge(
            df_releases[ecosystem]
            .drop_duplicates('package', keep='last')
            [['package', 'date']]
            .rename(columns={'date': 'last_release'}),
            how='left',
            on='package',
        )
        # On dependencies: count each (source, target) pair once, whatever the
        # number of rows it appears in. Left joins leave NaN for packages
        # that have no dependents / no dependencies.
        .merge(
            (
                df_dependencies[ecosystem]
                .drop_duplicates(['source', 'target'], keep='last')
                .groupby('target', sort=False, as_index=False)
                .agg({'source': 'count'})
                .rename(columns={'source': 'dependents'})
                [['target', 'dependents']]
            ),
            how='left',
            left_on='package',
            right_on='target',
        )
        .merge(
            (
                df_dependencies[ecosystem]
                # keep='last': the `i_dev` value of the last row of each pair
                # (in file order) is the one that gets counted.
                .drop_duplicates(['source', 'target'], keep='last')
                .assign(pre1deps=lambda d: d['i_dev'])
                .assign(post1deps=lambda d: ~d['i_dev'])
                .groupby('source', as_index=False, sort=False)
                .agg({
                    'pre1deps': 'sum',
                    'post1deps': 'sum',
                })
                [['source', 'pre1deps', 'post1deps']]
            ),
            how='left',
            left_on='package',
            right_on='source',
        )

        .assign(ecosystem=ecosystem)
        # Drop the helper join keys ('target', 'source') and fix column order.
        [['ecosystem', 'package', 'created', 'reached', 'last_release', 'pre1', 'post1', 'dependents', 'pre1deps', 'post1deps']]
    )
    for ecosystem in ECOSYSTEMS]
)

## Packages having reached 1.0.0

In [38]:
# Date windows, all relative to CENSOR_DATE.
# Despite its name, CREATED_SINCE is an upper bound: the first release must be
# on or before this date, i.e. the package is at least 365 days old.
CREATED_SINCE = CENSOR_DATE - pandas.to_timedelta('365 days')
# The first >=1.0.0 release must fall within this (inclusive) interval:
# between 364 and 15 days before CENSOR_DATE.
REACHED_SINCE = (
    CENSOR_DATE - pandas.to_timedelta('364 days'),
    CENSOR_DATE - pandas.to_timedelta('15 days'),
)
# The most recent release must be on or after this date.
ACTIVE_SINCE = CENSOR_DATE - pandas.to_timedelta('365 days')

# Minimum counts a package must reach to be selected.
PRE1_RELEASES = 2   # releases with major == 0
POST1_RELEASES = 2  # releases with major > 0
DEPENDENTS = 5      # distinct dependent packages
PRE1_DEPS = 1       # value of the 'pre1deps' column
POST1_DEPS = 1      # value of the 'post1deps' column


def mask(d):
    """Return a boolean Series selecting packages that reached 1.0.0 recently.

    `d` is a frame with the columns of `df_candidates`. A row is selected when
    every criterion holds; comparisons against missing values are False, so
    packages lacking dependents or dependencies are excluded.

    The thresholds are module-level constants looked up when the mask is
    called, so re-assigning them changes the result of later calls.
    """
    return (
        (d['last_release'] >= ACTIVE_SINCE) &
        (d['dependents'] >= DEPENDENTS) &
        (d['pre1deps'] >= PRE1_DEPS) &
        (d['post1deps'] >= POST1_DEPS) &
        (d['created'] <= CREATED_SINCE) &
        (d['reached'].between(*REACHED_SINCE)) &
        (d['pre1'] >= PRE1_RELEASES) &
        (d['post1'] >= POST1_RELEASES)
    )


# Report how many packages of each ecosystem satisfy the criteria.
# The frame gets a real name instead of `_`, which is IPython's last-output
# variable, and the mask is evaluated only once per ecosystem.
for ecosystem in ECOSYSTEMS:
    df_ecosystem = df_candidates[df_candidates['ecosystem'] == ecosystem]
    n_total = len(df_ecosystem)
    n_selected = len(df_ecosystem[mask])
    print(ecosystem, 'has', n_selected, 'candidates out of', n_total, '({:.2%})'.format(n_selected / n_total))

Cargo has 18 candidates out of 34769 (0.05%)
NPM has 406 candidates out of 1217677 (0.03%)
Packagist has 11 candidates out of 180093 (0.01%)
Rubygems has 12 candidates out of 154997 (0.01%)


In [49]:
# Show, for each ecosystem, the five selected packages with the most
# dependents; the final sort groups the rows of an ecosystem together.
(
    df_candidates
    .loc[mask]
    .sort_values(by='dependents', ascending=False)
    .groupby('ecosystem')
    .head(5)
    .sort_values(by='ecosystem')
)

Unnamed: 0,ecosystem,package,created,reached,last_release,pre1,post1,dependents,pre1deps,post1deps
5846,Cargo,syn,2016-09-07 15:22:40,2019-08-13 16:07:32,2020-01-03 17:15:12,122.0,14.0,1339.0,3.0,3.0
1679,Cargo,smallvec,2015-04-06 06:21:18,2019-11-03 22:53:23,2019-12-20 22:18:52,37.0,2.0,311.0,1.0,3.0
11603,Cargo,actix-web,2017-10-23 23:08:29,2019-06-05 03:07:24,2019-12-25 16:30:24,63.0,11.0,169.0,63.0,21.0
4410,Cargo,tera,2016-04-04 16:03:33,2019-12-07 16:26:41,2019-12-18 08:55:53,51.0,2.0,68.0,9.0,10.0
18498,Cargo,zeroize,2018-10-03 09:33:21,2019-10-13 22:22:56,2019-12-03 00:53:53,20.0,2.0,71.0,1.0,1.0
7740,NPM,style-loader,2012-04-07 01:04:00,2019-08-06 09:51:57,2019-12-25 12:59:56,54.0,6.0,9640.0,1.0,2.0
817046,NPM,react-app-polyfill,2018-09-25 18:43:25,2019-04-22 18:50:35,2019-10-03 04:18:28,8.0,5.0,1665.0,1.0,5.0
473675,NPM,fork-ts-checker-webpack-plugin,2017-05-05 16:10:15,2019-03-06 22:17:51,2019-11-25 15:08:52,37.0,28.0,631.0,1.0,17.0
61,NPM,forever,2010-12-25 05:01:45,2019-04-03 22:09:00,2020-01-05 19:00:17,63.0,3.0,483.0,21.0,8.0
6815,NPM,tracer,2012-03-02 05:37:59,2019-08-07 13:18:06,2020-01-04 09:13:32,42.0,3.0,435.0,3.0,3.0


## Packages not having reached 1.0.0

In [None]:
# Thresholds for packages still in 0.y.z. These assignments override the
# values set in the previous section; ACTIVE_SINCE, CREATED_SINCE and
# PRE1_DEPS are reused from that section unchanged.
PRE1_RELEASES = 20  # releases with major == 0
POST1_RELEASES = 0  # releases with major > 0 (makes the last criterion always true for valid counts)
DEPENDENTS = 50     # distinct dependent packages


def mask(d):
    """Return a boolean Series selecting active packages that never reached 1.0.0.

    `d` is a frame with the columns of `df_candidates`. A row is selected when
    every criterion holds, in particular when 'reached' is missing, i.e. the
    package has no release with major > 0. Comparisons against missing values
    are False, so packages lacking dependents or dependencies are excluded.

    This definition replaces the `mask` of the previous section. The
    thresholds are module-level constants looked up when the mask is called.
    """
    return (
        (d['last_release'] >= ACTIVE_SINCE) &
        (d['dependents'] >= DEPENDENTS) &
        (d['pre1deps'] >= PRE1_DEPS) &
        (d['created'] <= CREATED_SINCE) &
        (d['reached'].isnull()) &
        (d['pre1'] >= PRE1_RELEASES) &
        (d['post1'] >= POST1_RELEASES)
    )


# Report how many packages of each ecosystem satisfy the criteria.
# The frame gets a real name instead of `_`, which is IPython's last-output
# variable, and the mask is evaluated only once per ecosystem.
for ecosystem in ECOSYSTEMS:
    df_ecosystem = df_candidates[df_candidates['ecosystem'] == ecosystem]
    n_total = len(df_ecosystem)
    n_selected = len(df_ecosystem[mask])
    print(ecosystem, 'has', n_selected, 'candidates out of', n_total, '({:.2%})'.format(n_selected / n_total))

In [None]:
# Show, for each ecosystem, the five selected packages with the most
# dependents; the final sort groups the rows of an ecosystem together.
(
    df_candidates
    .loc[mask]
    .sort_values(by='dependents', ascending=False)
    .groupby('ecosystem')
    .head(5)
    .sort_values(by='ecosystem')
)