In [1]:
import sys

import pandas
import tqdm
import tqdm.auto

sys.path.append('..')
from helpers import RE_SEMVER, semver

In [2]:
# Load the raw release list, parse the release dates and rename the columns
# to the names used in the rest of this notebook.
df_releases = (
    pandas.read_csv('../data-raw/libio-versions.csv.gz')
    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0 (format inference is the default) and only triggers a
    # warning there.
    .assign(Date=lambda d: pandas.to_datetime(d['Date']))
    .rename(columns={'Project': 'Package', 'Date': 'ReleaseDate'})
)

In [3]:
# Split every release string with the RE_SEMVER pattern and store the
# captured groups as the major, minor, patch and misc columns.
semver_columns = ['VMajor', 'VMinor', 'VPatch', 'VMisc']
semver_parts = df_releases['Release'].str.extract(RE_SEMVER, expand=True)
df_releases[semver_columns] = semver_parts

In [4]:
# Cast the numeric components to float rather than int: some packages use
# versions such as 9999999999999999999, which does not fit in a 64-bit integer.
numeric_parts = ['VMajor', 'VMinor', 'VPatch']
df_releases[numeric_parts] = df_releases[numeric_parts].astype(float)

In [13]:
# Keep only the releases whose misc component is the empty string (rows with
# a missing VMisc compare unequal to '' and are therefore dropped as well),
# then keep the first row for each package and (major, minor, patch) triple.
has_no_misc = df_releases['VMisc'] == ''
df_releases = (
    df_releases
    .loc[has_no_misc]
    .drop_duplicates(subset=['Package', 'VMajor', 'VMinor', 'VPatch'])
)

In [14]:
# We're not using groupby(..).apply() because using a loop is more than
# 2 times faster in our case.
#
# For every package, the loop adds:
#   - RankByDate: 1-based position of the release in chronological order;
#   - NextReleaseDateByDate: date of the chronologically next release;
#   - RankByVersion: 1-based position in (major, minor, patch, date) order;
#   - ReleaseType: first of 'Initial', 'Major', 'Minor', 'Patch', 'Misc' whose
#     flag is set when comparing a release with its predecessor in version
#     order.
# The helper flags and the numeric version components are dropped afterwards.

data = []
# tqdm.auto.tqdm replaces the deprecated tqdm.tqdm_notebook; it picks the
# notebook widget when available and a text progress bar otherwise.
for name, group in tqdm.auto.tqdm(df_releases.groupby('Package', sort=False)):
    group = (
        group
        .sort_values('ReleaseDate')
        .assign(
            # Plain 1..n numbering; avoids copying the frame just to cumsum ones.
            RankByDate=lambda d: range(1, len(d) + 1),
            NextReleaseDateByDate=lambda d: d['ReleaseDate'].shift(-1),
        )

        .sort_values(['VMajor', 'VMinor', 'VPatch', 'ReleaseDate'])
        .assign(
            RankByVersion=lambda d: range(1, len(d) + 1),
            # True when there is no previous release in version order.
            Initial=lambda d: d['VMajor'].shift(1).isnull(),
            # A component "increased" when its difference with the previous
            # row is positive; decreases are clipped to 0 (False).
            # NOTE: a NaN difference (e.g. on the first row) becomes True after
            # astype(bool); on the first row this is masked by 'Initial',
            # which takes precedence below.
            Major=lambda d: d['VMajor'].diff().clip(0, 1).astype(bool),
            Minor=lambda d: d['VMinor'].diff().clip(0, 1).astype(bool),
            Patch=lambda d: d['VPatch'].diff().clip(0, 1).astype(bool),
            # Fallback so that every row has at least one flag set.
            Misc=True,
        )
        # Columns are listed by precedence: idxmax returns the first True one.
        .assign(ReleaseType=lambda d: d[['Initial', 'Major', 'Minor', 'Patch', 'Misc']].idxmax(axis=1))

        .drop(columns=[
            'Initial', 'Major', 'Minor', 'Patch', 'Misc',
            'VMajor', 'VMinor', 'VPatch', 'VMisc',
        ])
    )

    data.append(group)




In [15]:
# Stack the per-package frames collected in `data` into a single table.
df_semver = pandas.concat(objs=data)

In [16]:
# Write the result as a gzip-compressed CSV file, without the index column.
output_options = dict(index=False, compression='gzip')
df_semver.to_csv('../data/releases.csv.gz', **output_options)