In [65]:
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import os

### Let's start by parsing the operating system

In [66]:
# Load the list of images; every column is read as a plain Python object,
# then the 'base' column is removed and exact duplicate rows are collapsed.
images = (
    pd.read_csv('../data/to_pull.csv', dtype=object)
    .drop(columns='base')
    .drop_duplicates()
)
# Keep only the text before the first 'T' of each timestamp.
# str() is applied first, so a missing value becomes the literal text 'nan'.
images['last_updated'] = images['last_updated'].map(
    lambda stamp: str(stamp).partition('T')[0]
)
images.shape

(4376, 3)

In [67]:
# Show the last five rows as a quick sanity check of the loaded table.
images.tail(5)

Unnamed: 0,image,popularity,last_updated
4496,dgarros/gitlab-getcitoken,83,2016-11-09
4497,greyblake/sinatra,82,2016-07-06
4498,maicatus/loghouse,82,2018-04-03
4499,sider/goodcheck,82,2019-11-19
4500,library/node,1000000,2019-11-19


In [68]:
# Detect the operating system of every pulled image.
#
# Result: `operating`, a DataFrame with the columns
#   image      -- '<owner>/<repo>', built from the first two ':'-separated
#                 fields of the file name
#   operating  -- 'Alpine', 'Debian', 'Ubuntu', or None when not identified


def _first_line_os(file_path):
    """Return the OS named on the first line of `file_path`, or None.

    Only the first line of the file is inspected, exactly as before; matching
    is case-insensitive. Keywords are tried in the order alpine, debian,
    ubuntu and the first one found wins. An empty file or an unrecognised
    first line yields None.
    """
    with open(file_path) as handle:
        first_line = handle.readline().lower()
    for keyword, label in (('alpine', 'Alpine'),
                           ('debian', 'Debian'),
                           ('ubuntu', 'Ubuntu')):
        if keyword in first_line:
            return label
    return None


detected = {}

# parse operating system
path = '../data/pulled/os/r1/'
for file in os.listdir(path):
    detected[file] = _first_line_os(path + file)

# parse operating system in case it couldn't be identified with the first command
path = '../data/pulled/os/r2/'
for file in os.listdir(path):
    # Bug fix: the previous test was `file not in operating`, but the r1 loop
    # registers every file (even unidentified ones), so r2 was never consulted
    # for files present in both directories. Fall back whenever r1 gave nothing.
    if detected.get(file) is None:
        detected[file] = _first_line_os(path + file)

# Build the frame directly from the mapping; this also works when no image at
# all could be identified (from_dict with `columns=` failed in that case).
operating = pd.DataFrame({
    'image': list(detected.keys()),
    'operating': list(detected.values()),
})
# File names look like '<owner>:<repo>:<tag>'; keep '<owner>/<repo>'.
operating['image'] = operating['image'].apply(
    lambda name: name.split(':')[0] + '/' + name.split(':')[1]
)

In [69]:
# parse python packages
#
# Every non-blank line of every file becomes one row:
#   package -- text before the first '='
#   version -- text after the last '='
# (presumably the files hold `pip freeze`-style 'name==version' lines -- TODO confirm)
path = '../data/pulled/packages/python/'
packages = []
versions = []
files = []
for file in tqdm(os.listdir(path)):
    with open(path + file) as lines:
        for line in lines:
            # Strip first: previously a line without '=' kept its trailing
            # newline inside the package name, and blank lines produced rows
            # with '\n' as the package and an empty version.
            entry = line.strip()
            if not entry:
                continue
            packages.append(entry.split('=')[0])
            versions.append(entry.split('=')[-1])
            files.append(file)

python = pd.DataFrame({'image': files, 'package': packages, 'version': versions})
# The last character of the file name is appended to 'python' to form `base`
# (e.g. 'python2'); the last two characters are then removed from `image`.
python['base'] = python['image'].apply(lambda name: 'python' + name[-1])
python['image'] = python['image'].apply(lambda name: name[0:-2])
print(python.shape)
python.head(2)

HBox(children=(IntProgress(value=0, max=2934), HTML(value='')))


(60396, 4)


Unnamed: 0,image,package,version,base
0,naorlivne:mesos-cloudwatch-autoscale:latest,boto3,1.7.26,python2
1,naorlivne:mesos-cloudwatch-autoscale:latest,botocore,1.10.26,python2


In [70]:
# parse node packages
#
# Every whitespace-separated token containing '@' is examined:
#   exactly one '@'  -> 'name@version'
#   exactly two '@'  -> package is '@' + the middle piece, version is the last
#   any other count  -> token is ignored
path = '../data/pulled/packages/node/'
packages, versions, files = [], [], []
for file in tqdm(os.listdir(path)):
    with open(path + file) as handle:
        for raw_line in handle:
            for token in raw_line.split():
                pieces = token.split('@')
                at_count = len(pieces) - 1
                if at_count == 1:
                    name, version = pieces
                elif at_count == 2:
                    name, version = '@' + pieces[1], pieces[2]
                else:
                    continue
                packages.append(name)
                versions.append(version)
                files.append(file)

node = pd.DataFrame({'image': files, 'package': packages, 'version': versions, 'base': 'node'})
print(node.shape)
node.head(2)

HBox(children=(IntProgress(value=0, max=1429), HTML(value='')))


(846851, 4)


Unnamed: 0,image,package,version,base
0,centralci:alpine-node:latest,bower,1.8.8,node
1,centralci:alpine-node:latest,gulp,4.0.2,node


In [74]:
# Inspect the packages recorded for the file named 'library:node:latest'.
library_node_rows = node.query('image == "library:node:latest"')
library_node_rows

Unnamed: 0,image,package,version,base
279124,library:node:latest,npm,6.13.1,node
279125,library:node:latest,abbrev,1.1.1,node
279126,library:node:latest,ansicolors,0.3.2,node
279127,library:node:latest,ansistyles,0.1.3,node
279128,library:node:latest,aproba,2.0.0,node
...,...,...,...,...
279929,library:node:latest,prr,1.0.1,node
279930,library:node:latest,write-file-atomic,2.4.3,node
279931,library:node:latest,graceful-fs,4.2.3,node
279932,library:node:latest,imurmurhash,0.1.4,node


In [75]:
# parse ruby packages
#
# Every non-blank line becomes one row:
#   package -- first whitespace-separated field
#   version -- text between the first '(' and the following ')'
# NOTE(review): the parenthesised text is stored verbatim, so values such as
# 'default: 1.2.8' keep their prefix -- confirm downstream code expects that.
path = '../data/pulled/packages/ruby/'
packages = []
versions = []
files = []
for file in tqdm(os.listdir(path)):
    with open(path + file) as lines:
        for line in lines:
            fields = line.split()
            if not fields:
                # Blank line: `line.split()[0]` used to raise IndexError here.
                continue
            start = line.find('(')
            end = line.find(')', start + 1) if start != -1 else -1
            packages.append(fields[0])
            # Without a '(...)' group the old slice returned the whole line
            # minus its last character; record an empty version instead.
            versions.append(line[start + 1:end] if end != -1 else '')
            files.append(file)

ruby = pd.DataFrame({'image': files, 'package': packages, 'version': versions, 'base': 'ruby'})
print(ruby.shape)
ruby.head(2)

HBox(children=(IntProgress(value=0, max=1482), HTML(value='')))


(90241, 4)


Unnamed: 0,image,package,version,base
0,icalialabs:belugas-python:latest,bigdecimal,default: 1.2.8,ruby
1,icalialabs:belugas-python:latest,bundler,1.14.6,ruby


In [76]:
# Stack the node, python and ruby package tables into one frame
# (row labels of the three inputs are kept as they are).
all_images = pd.concat([node, python, ruby])
# Turn '<owner>:<repo>:<tag>' into '<owner>/<repo>'.
all_images['image'] = all_images['image'].map(
    lambda name: '/'.join([name.split(':')[0], name.split(':')[1]])
)
all_images.shape

(997488, 4)

In [77]:
# Attach the detected operating system, then the image metadata.
# Both joins are left joins on 'image', so every package row is kept even
# when no match exists (the added columns are then missing).
with_operating = pd.merge(all_images, operating, how='left', on='image')
all_images = pd.merge(with_operating, images, how='left', on='image')
all_images.shape

(997488, 7)

In [85]:
# Discard every row that has a missing value in any column.
all_images = all_images.dropna(axis=0, how='any')

In [86]:
# Count the distinct (image, base, operating) combinations per operating system.
distinct_combos = all_images.loc[:, ['image', 'base', 'operating']].drop_duplicates()
distinct_combos.groupby('operating').count()

Unnamed: 0_level_0,image,base
operating,Unnamed: 1_level_1,Unnamed: 2_level_1
Alpine,1434,1434
Debian,2540,2540
Ubuntu,9,9


In [87]:
# Remove rows whose last_updated is the literal text "nan"; such values are
# strings, so dropna() does not treat them as missing.
has_update_date = 'last_updated != "nan"'
all_images = all_images.query(has_update_date)

In [88]:
# Write the final table to disk without the row index.
output_csv = '../data/installed_packages.csv'
all_images.to_csv(output_csv, index=False)

In [27]:
# Preview the first five rows of the exported table.
all_images.head(5)

Unnamed: 0,image,package,version,base,operating,popularity,last_updated
0,centralci/alpine-node,bower,1.8.8,node,Alpine,406310,2019-10-28
1,centralci/alpine-node,gulp,4.0.2,node,Alpine,406310,2019-10-28
2,centralci/alpine-node,glob-watcher,5.0.3,node,Alpine,406310,2019-10-28
3,centralci/alpine-node,anymatch,2.0.0,node,Alpine,406310,2019-10-28
4,centralci/alpine-node,micromatch,3.1.10,node,Alpine,406310,2019-10-28
