In [1]:
import pandas
import sys
import tqdm

sys.path.append('..')
from helpers import RE_SEMVER, semver

In [2]:
# Load the package releases table from the gzip-compressed CSV.
df_releases = pandas.read_csv('../data/releases.csv.gz')

In [83]:
# Load the raw dependencies table from the gzip-compressed CSV.
df_dependencies = pandas.read_csv('../data-raw/libio-dependencies.csv.gz')

In [3]:
def version_distance(base, target, releases):
    """
    Count the patch, minor and major releases separating rank `base` from
    rank `target`, and return them as a `(major, minor, patch)` tuple.

    Only rows whose `RankByVersion` lies in the interval `(base, target]`
    are considered. Counting proceeds in stages:

     - patches ('Patch' or 'Misc') are counted until the first release of
       another type shows up;
     - if that release is a 'Minor', minors are counted until the first 'Major';
     - from the first 'Major' onwards, only majors are counted.

    The release that triggers a stage switch is itself counted in the new
    stage. If the patch stage is left because of a release type other than
    'Minor' or 'Major', nothing is counted afterwards.

    `releases` (a subset of `df_releases`) is assumed to be sorted by
    `RankByVersion`; iteration stops at the first rank above `target`.
    """
    counts = {'Major': 0, 'Minor': 0, 'Patch': 0}
    stage = 'Patch'

    for release in releases.itertuples():
        rank = release.RankByVersion
        if rank <= base:
            # Not newer than the base release: ignored
            continue
        if rank > target:
            # Rows are sorted, so everything from here on is beyond the target
            break

        kind = release.ReleaseType

        if stage == 'Patch':
            if kind in ('Patch', 'Misc'):
                counts['Patch'] += 1
                continue
            # Leave the patch stage; the current release is handled below
            stage = kind

        if stage == 'Minor':
            if kind == 'Minor':
                counts['Minor'] += 1
                continue
            if kind == 'Major':
                # Leave the minor stage; the current release is handled below
                stage = 'Major'

        if stage == 'Major' and kind == 'Major':
            counts['Major'] += 1

    return (counts['Major'], counts['Minor'], counts['Patch'])

In [41]:
def compute_lags(releases, time, next_time, constraint):
    """
    Compute lag at `time` and `next_time`, assuming that given `constraint`
    should be evaluated wrt. to given set of package `releases` (a DataFrame that's a
    subset of `df_releases`).

    Parameters:
     - `releases`: DataFrame with (at least) the columns `Release`, `ReleaseDate`,
       `RankByVersion` and `RankByDate`. The "highest" release is taken as the last
       row, so it is presumably expected to be sorted by `RankByVersion` (this is
       what `version_distance` assumes) -- TODO confirm against callers;
     - `time`, `next_time`: the two dates at which the lag is evaluated. They must be
       comparable with `ReleaseDate`, and `ReleaseDate` must be subtractable from them;
     - `constraint`: dependency constraint, handed to `semver` together with the
       `Release` column to determine which releases are installable.

    Return a list of 10 items:
     - highest release installable at `time` (None if there is none);
     - highest release missed at `time` (None if there is none);
     - oldest release missed at `time` (None if there is none);
     - version lag at `time`, as a (major, minor, patch) tuple;
     - temporal lag at `time` (0 if no release is missed);
     - All the items above but at `next_time`.

    If no available release satisfies the constraint, every available release is
    considered as missed, and the version lag is computed over all the available
    releases up to the highest one.
    """

    releases = (
        # Restrict releases to the ones that were available at next_time
        releases[lambda d: d['ReleaseDate'] <= next_time]
        .assign(
            # Tag them depending on their availability at both times
            AvailableAtTime=lambda d: d['ReleaseDate'] <= time,
            AvailableAtNextTime=True,
            # Tag the ones that are installable (ie. accepted by the constraint)
            Installable=lambda d: d['Release'].isin(semver(constraint, d['Release']))
        )
    )

    results = []

    # Handle computation at two times points
    for is_next in [False, True]:
        available_label = 'AvailableAtNextTime' if is_next else 'AvailableAtTime'
        current_time = next_time if is_next else time
        available = releases[available_label]

        # Find highest installable and annotate missed releases
        installable = releases[releases['Installable'] & available]
        if installable.empty:
            # Nothing satisfies the constraint: every available release is missed.
            # Use a base rank lower than any actual rank, so that the version lag
            # below is well-defined (this name used to be left unbound in that
            # case, which raised an UnboundLocalError).
            base_rank = float('-inf')
            releases['Missed'] = available
            results.append(None)
        else:
            highest_installable = installable.iloc[-1]
            base_rank = highest_installable['RankByVersion']
            releases['Missed'] = available & (releases['RankByVersion'] > base_rank)
            results.append(highest_installable['Release'])

        # Find highest missed and oldest missed (required to compute version and temporal lags)
        missed = releases[releases['Missed']]
        if missed.empty:
            # Up-to-date: no version lag, no temporal lag
            results.extend([None, None, (0, 0, 0), 0])
            continue

        highest_missed = missed.iloc[-1]
        first_missed = missed.sort_values('RankByDate').iloc[0]
        version_lag = version_distance(
            base_rank,
            highest_missed['RankByVersion'],
            releases[available],
        )
        temporal_lag = current_time - first_missed['ReleaseDate']

        results.extend([
            highest_missed['Release'],
            first_missed['Release'],
            version_lag,
            temporal_lag,
        ])

    return results

In [None]:
# Proposed filters:
# - Packages updated in 2017
# - Packages having at least 2 updates

# Results in 188,400 packages and 9,922,958 dependencies