In [1]:
import pandas
import sys
import tqdm

sys.path.append('..')
from helpers import RE_SEMVER, semver

In [2]:
# Package releases; the functions below operate on subsets of this frame.
df_releases = pandas.read_csv('../data/releases.csv.gz')

# Raw dependency data (presumably the Libraries.io dump, judging by the
# file name -- confirm against the data pipeline).
df_dependencies = pandas.read_csv('../data-raw/libio-dependencies.csv.gz')

In [25]:
def version_distance(base, target, releases):
    """
    Measure how far release rank `target` is ahead of release rank `base`.

    Releases whose `RankByVersion` lies in the interval (base, target] are
    walked in order and tallied with an escalating rule:

     - 'Patch' and 'Misc' releases count as patch updates until the first
       release of any other type shows up;
     - from then on, 'Minor' releases count as minor updates until the first
       'Major' release shows up;
     - from then on, only 'Major' releases count, as major updates.

    Once the level has been raised, releases of a lower level are ignored.
    If the first non-patch release has a type other than 'Minor' or 'Major',
    nothing is counted from that point on.

    `releases` (a subset of `df_releases`) must be sorted by `RankByVersion`,
    because the walk stops at the first rank greater than `target`.

    Return a `(major, minor, patch)` tuple of counts.
    """
    counts = {'Major': 0, 'Minor': 0, 'Patch': 0}
    level = 'Patch'

    for release in releases.itertuples():
        rank = release.RankByVersion
        if rank <= base:
            # Not newer than the base release.
            continue
        if rank > target:
            # Sorted input: everything from here on is beyond the target.
            break

        kind = release.ReleaseType

        if level == 'Patch':
            if kind in ('Patch', 'Misc'):
                counts['Patch'] += 1
                continue
            # First non-patch release: its type becomes the level to track.
            level = kind

        if level == 'Minor':
            if kind == 'Minor':
                counts['Minor'] += 1
                continue
            if kind == 'Major':
                level = 'Major'

        if level == 'Major' and kind == 'Major':
            counts['Major'] += 1

    return (counts['Major'], counts['Minor'], counts['Patch'])

In [None]:
def compute_lags(releases, time, next_time, constraint):
    """
    Compute the lag of `constraint` at `time` and at `next_time`, evaluated
    against the given package `releases` (a DataFrame that is a subset of
    `df_releases`).

    Intended result, once implemented -- an iterable with:
     - highest release installable at `time`;
     - oldest release missed at `time`;
     - highest release missed at `time`;
     - version lag at `time`;
     - temporal lag at `time`;
     - all the items above, but at `next_time`.

    NOTE: this function is unfinished. It currently only tags the releases
    (availability at both times, installability) and then returns None.
    """
    # Only releases already published at `next_time` are relevant.
    published = releases[releases['ReleaseDate'] <= next_time]

    # Availability at the earlier point in time.
    available_at_time = published['ReleaseDate'] <= time

    # A release is installable when the constraint accepts it.
    accepted = semver(constraint, published['Release'])
    installable = published['Release'].isin(accepted)

    releases = published.assign(
        AvailableAtTime=available_at_time,
        # True by construction: the frame was filtered on `next_time` above.
        AvailableAtNextTime=True,
        Installable=installable,
    )

    # TODO: identify the missed releases at both times. Cases to handle:
    #  - no release available
    #  - no release installable
    #  - no release missed
    #  - possibly others

    # Placeholders for the values still to be computed at `time`.
    highest_installable_at_time = None
    highest_missed_at_time = None
    first_missed_at_time = None
    version_lag_at_time = None
    temporal_lag_at_time = None

    return None