The goal being to study backports in an ecosystem, we will focus on:

 - Packages being required by other packages (otherwise, there is no need to backport updates!);
 - Packages being sufficiently required by other packages (see above);
 - Required packages being active (abandoned packages are unlikely to deploy backports);
 - Dependent packages being active (one can expect packages abandoned for years to still rely on very old versions of their dependencies).
 
This notebook aims to select required and dependent packages based on some thresholds.

In [None]:
import pandas
import matplotlib
import seaborn
import tqdm

from version import Version
from parsers import parse_or_empty
from parsers import NPMParser

%matplotlib inline

In [None]:
# Figure size, in inches (width, height), used when sizing the plots of this notebook.
FIG_SIZE = (6, 4)
# Color palette returned by seaborn.color_palette() when called without arguments.
PALETTE = seaborn.color_palette()

In [None]:
# Load the raw releases, parse their release date, and discard every row
# that has a missing value in any column.
# NOTE: `infer_datetime_format` is deprecated since pandas 2.0, where the
# format is inferred by default; it is kept here for older pandas versions.
df_releases = pandas.read_csv('../data-raw/releases.csv.gz')
df_releases['date'] = pandas.to_datetime(df_releases['date'], infer_datetime_format=True)
df_releases = df_releases.dropna()

In [None]:
# Load the raw dependencies and discard every row that has a missing value
# in any column.
df_dependencies = pandas.read_csv('../data-raw/dependencies.csv.gz').dropna()

Let's have a quick look at the data:

In [None]:
# Preview the releases table as loaded above.
df_releases

In [None]:
# Preview the dependencies table as loaded above.
df_dependencies

We convert versions to semver syntax, and remove those that cannot be converted.

In [None]:
# Split each version string into its components with the project's
# Version.RE pattern (assumed to expose exactly four groups: major, minor,
# patch and a trailing "misc" part -- TODO confirm against version.py).
df_releases[['major', 'minor', 'patch', 'misc']] = df_releases['version'].str.extract(Version.RE, expand=True)
# Numeric components are stored as floats so that unparsable versions stay NaN.
df_releases = df_releases.astype({'major': float, 'minor': float, 'patch': float})

# Discard versions for which at least one numeric component is missing.
n = len(df_releases)
df_releases = df_releases[df_releases[['major', 'minor', 'patch']].notnull().all(axis=1)]
print(n - len(df_releases), 'non-compliant versions dropped (total was {})'.format(n))

# Discard prereleases (rows with a non-null "misc" part), then deduplicate
# on (package, major, minor, patch), keeping the most recent entry by date.
# NOTE(review): the previous comment said "keep first" while the code keeps
# the last one; confirm which of the two is intended.
n = len(df_releases)
df_releases = (
    df_releases
    .loc[df_releases['misc'].isnull()]
    .sort_values(['package', 'date'])
    .drop_duplicates(['package', 'major', 'minor', 'patch'], keep='last')
    .drop(columns=['misc'])
)
print(n - len(df_releases), 'prerelease and duplicated versions dropped (total was {})'.format(n))

To select a set of required and dependent packages, we remove packages that were not active for some time.

In [None]:
# Activity threshold: 365 days before the most recent release date in the
# dataset. Releases on or after this date count as "active" below.
LAST_ACTIVITY = df_releases['date'].max() - pandas.to_timedelta('365 days')
LAST_ACTIVITY

In [None]:
# Number of distinct packages, followed by the total number of releases.
print(
    'Packages and releases:',
    len(df_releases.drop_duplicates(subset='package')),
    len(df_releases.index),
)
# Number of distinct packages having at least one release on or after LAST_ACTIVITY.
print(
    'Active packages:',
    len(df_releases.loc[df_releases['date'] >= LAST_ACTIVITY].drop_duplicates(subset='package')),
)

Then we consider only dependencies from the latest version of each package. This will allow us to quantify the number of dependents for each required package.

In [None]:
# Keep only the dependencies declared by the most recent release (by date)
# of each active package, i.e. of each package with a release on or after
# LAST_ACTIVITY. The inner merge attaches that release's date and drops
# dependencies declared by any other release. Finally, dependencies whose
# target is not a known package are discarded.
df_dependencies = (
    df_dependencies
    .merge(
        df_releases
        .loc[df_releases['date'] >= LAST_ACTIVITY, ['package', 'version', 'date']]
        .sort_values('date')
        .drop_duplicates('package', keep='last'),
        how='inner',
        left_on=['source', 'version'],
        right_on=['package', 'version'],
    )
    .drop(columns=['package'])
    .loc[lambda d: d['target'].isin(df_releases['package'])]
)

In [None]:
# Preview the dependencies remaining after the filtering above.
df_dependencies

In [None]:
# Plot the cumulative proportion of dependencies covered by required
# packages, the latter being ordered by decreasing number of dependents.
fig, ax = matplotlib.pyplot.subplots()
fig.set_size_inches(*FIG_SIZE)

# Number of dependents ("source" count) per required package ("target"),
# most required packages first.
data = (
    df_dependencies
    .groupby(['target'], sort=False)
    .agg({'source': 'count'})
    .sort_values('source', ascending=False)
)
# Running totals: dependents covered so far, and required packages seen so far.
data['cum_source'] = data['source'].cumsum()
data['cum_target'] = range(1, len(data) + 1)
# Turn both running totals into proportions of their final value.
data['cum_source'] = data['cum_source'] / data['cum_source'].iloc[-1]
data['cum_target'] = data['cum_target'] / data['cum_target'].iloc[-1]

data.set_index('cum_target')[['cum_source']].plot(ax=ax)

ax.legend().remove()
ax.set_xlabel('cumulative proportion of required packages')
ax.set_xlim(0, 1)
ax.set_ylabel('cumulative proportion of dependent packages')
ax.set_ylim(0, 1)

# Dashed guides at 80% of the dependents and 20% of the required packages.
ax.hlines(0.8, 0, 1, color='r', alpha=0.5, linestyles='dashed')
ax.vlines(0.2, 0, 1, color='r', alpha=0.5, linestyles='dashed')

We must decide on how to select packages that will be kept as "required". 

In [None]:
# First required package (by decreasing number of dependents) at which at
# least 80% of all dependencies are covered.
data.query('cum_source >= 0.8').iloc[0]

In [None]:
# Row reached once 20% of the required packages (most required first) are kept.
data.query('cum_target >= 0.2').iloc[0]

In [None]:
# Row reached once 10% of the required packages (most required first) are kept.
data.query('cum_target >= 0.1').iloc[0]

In [None]:
# First required package (by decreasing number of dependents) having at most
# 20 dependents; its cum_target is the share of required packages above it.
data.query('source <= 20').iloc[0]

Keeping 10% of all required packages means that each kept package has at least 10 dependent packages. Requiring at least 20 dependent packages implies ignoring nearly 95% of all required packages.

In [None]:
# Minimum number of dependents a package needs to be kept as a "required" package.
MIN_REQ = 20

Now we retrieve the list of these required packages, and we identify for all their releases their order and type.
This will be our dataset of "required packages". Based on these packages, we will then create a dataset of "dependent packages".

In [None]:
# Names of the packages having at least MIN_REQ dependents.
required = data.loc[data['source'] >= MIN_REQ].index

In [None]:
# Build the dataset of "required packages": for every release of a required
# package, compute its rank by version, its rank by date, whether it is a
# backport, and the rank of the release it was presumably backported from.
data = []
for name, group in tqdm.tqdm(df_releases[lambda d: d['package'].isin(required)].groupby('package', sort=False)):
    group = (
        group
        # Rank by version
        .sort_values(['major', 'minor', 'patch'])
        .assign(
            rank=lambda d: d.assign(N=1).N.cumsum(),
            # The first release has no predecessor, hence a null shifted value.
            kinitial=lambda d: d['major'].shift(1).isnull(),
            # A component is flagged when it increased with respect to the
            # previous release. The NaN difference of the first row casts to
            # True, but 'kinitial' comes first and wins in idxmax below.
            kmajor=lambda d: (d['major'] - d['major'].shift(1)).clip(0, 1).astype(bool),
            kminor=lambda d: (d['minor'] - d['minor'].shift(1)).clip(0, 1).astype(bool),
            kpatch=lambda d: (d['patch'] - d['patch'].shift(1)).clip(0, 1).astype(bool),
        )
        # idxmax returns the first True flag, i.e. the most significant change.
        # NOTE: 'kind' is not among the columns kept in df_required below.
        .assign(kind=lambda d: d[['kinitial', 'kmajor', 'kminor', 'kpatch']].idxmax(axis=1))
        .replace({'kind': {'kinitial': 'initial', 'kmajor': 'major', 'kminor': 'minor', 'kpatch': 'patch'}})
        .drop(columns=['kinitial', 'kmajor', 'kminor', 'kpatch'])

        # Rank by date
        .sort_values(['date', 'rank'])  # Use rank if versions are distributed on the same day (e.g. imports)
        .assign(rank_date=lambda d: d.assign(N=1).N.cumsum())

        # Detect backported releases: highest rank and highest major seen so
        # far, in chronological order.
        .assign(hrank=lambda d: d['rank'].expanding().max())
        .assign(hmajor=lambda d: d['major'].expanding().max())
        # A release is a backport when its major is lower than the highest
        # major seen so far (a lower rank alone is not sufficient). The value
        # corresponds to the highest rank seen so far, NaN otherwise...
        .assign(backported=lambda d: d['hrank'].where(d['major'] < d['hmajor']))
        .drop(columns=['hrank', 'hmajor'])
        # ... but it could be the case that the backport is released before its "origin", so we check
        # the date of rank + 1 as well, and take the closest date.
        .pipe(lambda df:
            df.merge(
                df[['date', 'rank']],
                how='left',
                left_on=['backported'],
                right_on=['rank'],
                suffixes=('', '_previous'),
            )
            .merge(
                # Shift ranks by one so that 'backported' matches release rank + 1.
                df[['date', 'rank']].assign(rank=lambda d: d['rank'] - 1),
                how='left',
                left_on=['backported'],
                right_on=['rank'],
                suffixes=('', '_next'),
            )
            .assign(rank_next=lambda d: d['rank_next'] + 1)
            # Take closest date. When there is no release of rank + 1, its
            # date is NaT and the comparison is False: fall back explicitly
            # to the previous candidate instead of yielding NaN.
            .assign(backported_from=lambda d:
                d['rank_previous'].where(
                    d['date_next'].isnull()
                    | (abs(d['date'] - d['date_previous']) <= abs(d['date'] - d['date_next'])),
                    d['rank_next'],
                )
            )
        )
        .drop(columns=['date_previous', 'date_next', 'rank_previous', 'rank_next'])
        # Turn 'backported' into a boolean flag.
        .assign(backported=lambda d: ~d['backported'].isnull())
    )

    data.append(group)

df_required = (
    pandas.concat(data)
    .sort_values(['package', 'rank'])
    [['package', 'version', 'major', 'minor', 'patch', 'rank', 'date', 'rank_date', 'backported', 'backported_from']]
)

Let's have a look at the data.

In [None]:
# Inspect a slice of the releases of one required package as an example.
df_required.query('package == "vue-awesome"').iloc[25:35]

In [None]:
# Save data
# Save data: gzip-compressed CSV without the index column.
# NOTE(review): presumably requires the ../data directory to exist -- confirm.
df_required.to_csv('../data/required.csv.gz', index=False, compression='gzip')

Now let's collect data for dependent packages. We'll convert each dependency constraint to an interval, and then look for the latest and highest versions accepted by that constraint.

In [None]:
# Keep only the dependencies that target one of the selected required packages.
df_dependents = df_dependencies.loc[df_dependencies['target'].isin(required)]

# Map each distinct constraint to its parsed interval and a few flags
# describing what this interval accepts.
intervals = {}
parser = NPMParser()

for constraint in tqdm.tqdm(df_dependents['constraint'].drop_duplicates()):
    interval = parse_or_empty(parser, constraint)

    if interval.is_empty():
        # Nothing is accepted by an empty interval: all flags are False.
        d = {
            'interval': interval,
            'empty': True,
            'major': False,
            'minor': False,
            'patch': False,
            'dev': False,
        }
    else:
        base = interval.lower
        d = {
            'interval': interval,
            'empty': False,
            # Does the interval accept an arbitrarily high major / minor / patch
            # (the latter two relative to its lower bound)?
            'major': Version(float('inf'), 0, 0) in interval,
            'minor': Version(base.major, float('inf'), 0) in interval,
            'patch': Version(base.major, base.minor, float('inf')) in interval,
            # Presumably True when the whole interval lies below 1.0.0 -- TODO
            # confirm the semantics of comparing a Version with an interval.
            'dev': Version(1, 0, 0) > interval,
        }

    intervals[constraint] = d

# Are all intervals equal to their enclosure? (i.e. are there "gaps"?)
sum(1 for i in intervals.values() if i['interval'] != i['interval'].enclosure())

In [None]:
# Identify highest accepted releases
# Identify highest accepted releases: for every (target, constraint) pair,
# find the highest-ranked release of the target accepted by the constraint.
data = []

for target, group in tqdm.tqdm(df_dependents.groupby('target', as_index=False, sort=False)):
    # Releases of the target, highest rank first, with parsed versions.
    releases = (
        df_required[lambda d: d['package'] == target]
        .assign(version=lambda d: d['version'].apply(lambda v: Version(v)))
        .sort_values('rank', ascending=False)
    )

    for constraint, dependents in group.groupby('constraint', as_index=False, sort=False):
        d = intervals[constraint]
        interval = d['interval']

        # Releases are sorted by descending rank, so the first accepted one
        # is the highest; NaN when no release satisfies the constraint.
        selected = next(
            (release.rank for release in releases.itertuples() if release.version in interval),
            float('nan'),
        )

        data.append(
            dependents.assign(
                interval=str(interval),
                selected=selected,
                c_empty=d['empty'],
                c_dev=d['dev'],
                c_major=d['major'],
                c_minor=d['minor'],
                c_patch=d['patch'],
            )
        )

df_dependents = (
    pandas.concat(data)
    .sort_values(['source', 'target'])
    [['source', 'version', 'date', 'target', 'constraint', 'interval', 'selected', 'c_empty', 'c_dev', 'c_major', 'c_minor', 'c_patch']]
)

In [None]:
# Preview the resulting dataset of dependent packages.
df_dependents

In [None]:
# Save data: gzip-compressed CSV without the index column.
# NOTE(review): presumably requires the ../data directory to exist -- confirm.
df_dependents.to_csv('../data/dependents.csv.gz', index=False, compression='gzip')