In [43]:
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import os
import subprocess
import json as js
import concurrent.futures
import time
import semver

# load installed packages

In [2]:
packages = pd.read_csv('../data/cleaned_installed_packages.csv', dtype=str)
packages.package.fillna('nan', inplace=True)

In [4]:
packages.sample()

Unnamed: 0,image,package,version,base,operating,popularity,last_updated,core,version_release,version_today
481741,ccarney16/pterodactyl-daemon,get-stream,3.0.0,node,Alpine,62339,2019-04-03,True,3.0.0,3.0.0


## load available packages

In [6]:
available = pd.concat([
    pd.read_csv('../data/'+x, dtype=str) 
    for x in ['npm_versions.csv','pypi_versions.csv','ruby_versions.csv']])
available.package.fillna('nan', inplace=True)

In [9]:
available = available.query('date < "2019-12-12"')

In [None]:
available['version'] = available['version'].apply(lambda x: x+'.0' if x.count('.')==1
                                                 else x)

In [13]:
available = pd.concat([available,available.version.str.extract(RE_SEMVER, expand=True)],axis=1)
available.fillna('undefined', inplace=True)

In [14]:
# versions that we cannot use MAJOR.MINOR.PATCH with them
for x in ['node', 'python','ruby']:
    print(len(available.query('base == "'+x+'"').query('Major == "undefined"'))/len(available.query('base == "'+x+'"')))

0.0
0.018337030437470453
0.0039921348984091045


In [16]:
# Filter those packages OUT
p = available.query('Major == "undefined"').package
available = available[~available.package.isin(p)]

In [17]:
for col in ['Major','Minor','Patch']:
    available[col] = available[col].apply(int)

In [18]:
available.sort_values(['base','package','Major','Minor','Patch'], inplace=True)

In [19]:
available['Major_previous'] = available['Major'].shift(1)
available['Minor_previous'] = available['Minor'].shift(1)
available['package_previous'] = available['package'].shift(1)

In [20]:
available ['release_type'] = available.apply(lambda d: 
                                                  release_type(d['package'],
                                                               d['Major'],
                                                               d['Minor'],
                                                               d['package_previous'],
                                                               d['Major_previous'],
                                                               d['Minor_previous']
                                                              ), axis=1)

In [21]:
l = len(packages)
packages = packages[packages.package.isin(available.package)]
len(packages)*100/l

98.43791146129023

In [24]:
available.drop(['Major','Minor','Patch','v_misc','package_previous',
                'Major_previous','Minor_previous'], axis=1, inplace=True )
available.rename(columns = {'version':'version_compare'}, inplace=True)

In [25]:
available.head(2)

Unnamed: 0,package,version_compare,date,base,release_type
65393,7zip-bin,0.0.1,2016-04-08T10:17:02.879Z,node,initial
65394,7zip-bin,0.0.2,2016-04-08T10:56:33.502Z,node,patch


In [30]:
packages.head(2)

Unnamed: 0,image,package,version,base,operating,popularity,last_updated,core,version_release,version_today
0,centralci/alpine-node,bower,1.8.8,node,Alpine,406310,2019-10-28,False,1.8.8,1.8.8
1,centralci/alpine-node,gulp,4.0.2,node,Alpine,406310,2019-10-28,False,4.0.2,4.0.2


## Compute the technical lag at the analysis date

In [32]:
unique_packages = packages[['package','version_today','base']].drop_duplicates()

In [33]:
len(available), len(unique_packages), len(packages)

(376604, 29932, 983126)

In [34]:
unique_packages = (available
                   .merge(unique_packages,
                          on = ['base','package'],
                          how = 'left'
                         )
                   )

In [47]:
def compare_semver(version, compare):
    version = version.split('-')[0]
    compare = compare.split('-')[0]
    try:
        return semver.compare(version, compare) < 0
    except:
        try:
            version2 = version.split('.')
            compare2 = compare.split('.')
            if int(version2[0]) < int(compare2[0]):
                return True
            elif int(version2[0]) > int(compare2[0]):
                return False
            else:
                if int(version2[1]) < int(compare2[1]):
                    return True
                elif int(version2[1]) > int(compare2[1]):
                    return False
                else:
                    if int(version2[2]) < int(compare2[2]):
                        return True
                    elif int(version2[2]) > int(compare2[2]):
                        return False
                    else:
                        return  version < compare
        except:
            return version < compare

In [56]:
unique_packages.head(2)

Unnamed: 0,package,version_compare,date,base,release_type,version_today,missed
0,7zip-bin,0.0.1,2016-04-08T10:17:02.879Z,node,initial,4.1.0,False
1,7zip-bin,0.0.2,2016-04-08T10:56:33.502Z,node,patch,4.1.0,False


In [51]:
unique_packages['missed'] = unique_packages.apply(lambda d: compare_semver(d['version_today'], d['version_compare']), axis=1)

In [57]:
lag_today = (unique_packages
             .query('missed == True')
             .groupby(['package','version_today','release_type','base'])
             .count()[['date']]
             .rename(columns={'date':'lag'})
             .reset_index()
            )

In [59]:
lag_today.head(2)

Unnamed: 0,package,version_today,release_type,base,lag
0,7zip-bin,4.1.0,major,node,1
1,7zip-bin,4.1.0,patch,node,3


In [60]:
lag_today = lag_today.pivot_table(index=['package','version_today','base'], columns='release_type', values='lag').reset_index()

In [66]:
lag_today = lag_today.drop('initial', axis=1).fillna(0)

In [67]:
lag_today.head(2)

release_type,package,version_today,base,major,minor,patch
0,7zip-bin,4.1.0,node,1.0,0.0,3.0
1,@adonisjs/ace,5.0.8,node,1.0,7.0,20.0


In [None]:
len(lag_today), len(packages)

In [70]:
lag_today = (packages
             .merge(lag_today,
                    on = ['base','package','version_today'],
                    how = 'left'
                   )
            )
lag_today.shape

(983126, 13)

In [71]:
lag_today.fillna(0, inplace=True)

In [74]:
lag_today.drop(['version','version_release'], axis=1, inplace=True)

In [76]:
lag_today.to_csv('../data/lag_today.csv', index=False)

## Compute the technical lag at the release date

In [77]:
packages.sample()

Unnamed: 0,image,package,version,base,operating,popularity,last_updated,core,version_release,version_today
427613,jiphex/homebridge,assert-plus,1.0.0,node,Debian,15773,2019-08-31,True,1.0.0,1.0.0


In [89]:
unique_packages = packages[['package','version_release','base','last_updated']].drop_duplicates()

In [90]:
len(unique_packages)

383472

In [91]:
unique_packages = (available
                   .merge(unique_packages,
                          on = ['base','package'],
                          how = 'left'
                         )
                   )
len(unique_packages)

12657107

In [92]:
unique_packages['date'] = unique_packages['date'].apply(lambda x: x.split('T')[0])

In [93]:
unique_packages = unique_packages.query('date <= last_updated')

In [94]:
len(unique_packages)

11295373

In [95]:
unique_packages = unique_packages.query('release_type != "initial"')

In [97]:
unique_packages['missed'] = unique_packages.apply(lambda d: compare_semver(d['version_release'], d['version_compare']), axis=1)

In [98]:
lag_release = (unique_packages
             .query('missed == True')
             .groupby(['package','version_release','release_type','base','last_updated'])
             .count()[['date']]
             .rename(columns={'date':'lag'})
             .reset_index()
            )

In [99]:
lag_release = lag_release.pivot_table(index=['package','version_release','base','last_updated'], columns='release_type', values='lag').reset_index()

In [100]:
lag_release = (packages
             .merge(lag_release,
                    on = ['base','package','version_release','last_updated'],
                    how = 'left'
                   )
            )
lag_release.fillna(0, inplace=True)
lag_release.shape

(983126, 13)

In [101]:
lag_release.head(2)

Unnamed: 0,image,package,version,base,operating,popularity,last_updated,core,version_release,version_today,major,minor,patch
0,centralci/alpine-node,bower,1.8.8,node,Alpine,406310,2019-10-28,False,1.8.8,1.8.8,0.0,0.0,0.0
1,centralci/alpine-node,gulp,4.0.2,node,Alpine,406310,2019-10-28,False,4.0.2,4.0.2,0.0,0.0,0.0


In [102]:
lag_release.drop(['version','version_today'], axis=1, inplace=True)

In [103]:
lag_release.to_csv('../data/lag_release.csv', index=False)

## Helpers

In [12]:

RE_SEMVER = r'^(?:v|V)?(?P<Major>\d+)\.(?P<Minor>\d+)\.(?P<Patch>\d+)(?P<v_misc>.*)$'

# Version lag function

def semver(constraint, versions):
    """
    Return the versions in ``versions'' that satisfy given constraint.
    Semantic is provided by the semver tool of nodejs.
    """
    args = ['semver', '-r', constraint] + list(versions)
    
    completed = subprocess.run(args, stdout=subprocess.PIPE)
    if completed.returncode == 0:
        return completed.stdout.decode().strip().split('\n')
    else:
        return []
    
def comply_semver(series_of_releases):
    extracted = series_of_releases.str.extract(RE_SEMVER, expand=True)
    return (
        extracted
        [['v_major', 'v_minor', 'v_patch']]
        .astype('str')
        .assign(v_major= lambda d: d['v_major'])
        .assign(v_major= lambda d: d['v_minor'])
        .assign(v_major= lambda d: d['v_patch'])
        #.assign(Semver=lambda d: d['v_major'] + '.' + d['v_minor'] + '.' + d['v_patch'])
        #['Semver']
    )

def release_type(package, major, minor, package_previous, major_previous,minor_previous):
    if package != package_previous:
        return 'initial'
    if major != major_previous:
        return 'major'
    if minor != minor_previous:
        return 'minor'
    return 'patch'
        
def count_versionlag(package,date,date_latest):
    if date==date_latest:
        return '0,0,0'
    tmp=versions.query('package=="'+package+'" and date>"'+date+'"').query('date<="'+date_latest+'"')

    tmp=tmp.groupby('release_type').count()
    try:
        major=str(tmp.package['major'])
    except:
        major="0"
        
    try:
        minor=str(tmp.package['minor'])
    except:
        minor="0"
        
    try:
        patch=str(tmp.package['patch'])
    except:
        patch="0"
        
    return major+','+minor+','+patch