In [1]:
# DOWNLOAD AND EXTRACT INSTALLED PACKAGES WITH THEIR VERSION NUMBERS AND DEBIAN VERSION
import os
def download(image, removed):
    """Dump the OS identification files and package list of a Docker image.

    Three ``docker run`` invocations (entrypoint ``/bin/bash``) are executed and
    their standard output is stored in ``<dir><file>_issue``,
    ``<dir><file>_debian`` and ``<dir><file>_dpkg``.

    NOTE: ``dir`` and ``file`` are NOT parameters; they are read from the
    notebook's global namespace (set in the configuration cell), so that cell
    must have been executed before this function is called.

    Parameters
    ----------
    image : str
        Image reference passed to ``docker run`` (e.g. ``"debian:stretch-slim"``).
    removed : bool or str
        When true, the containers created from ``image`` are stopped and
        removed and the image itself is deleted afterwards.  Textual flags are
        accepted: only ``"1"``, ``"true"``, ``"yes"`` and ``"y"`` (any case)
        count as true, so ``"False"`` no longer triggers the clean-up.
    """
    import subprocess  # local import: this cell only imports ``os``

    # A non-empty string such as "False" is truthy in Python; interpret it.
    if isinstance(removed, str):
        removed = removed.strip().lower() in ('1', 'true', 'yes', 'y')

    def docker_ids(arguments):
        # Run a docker listing command and return the IDs it prints.
        listing = subprocess.Popen(['docker'] + arguments, stdout=subprocess.PIPE)
        output, _ = listing.communicate()
        return output.decode('utf-8').split()

    extractions = (
        ('cat /etc/issue', '_issue'),
        ('cat /etc/debian_version', '_debian'),
        ('dpkg -l', '_dpkg'),
    )
    for command, suffix in extractions:
        # Argument lists (no shell) keep the image name from being
        # interpreted by a shell; stdout goes straight into the file.
        with open(dir + file + suffix, 'wb') as target:
            subprocess.call(
                ['docker', 'run', '--entrypoint', '/bin/bash', image, '-c', command],
                stdout=target)

    if removed:
        containers = docker_ids(['ps', '-a', '-q', '--filter', 'ancestor=' + image])
        if containers:
            subprocess.call(['docker', 'stop'] + containers)
            # Exited containers still reference the image and would block
            # ``docker rmi``; the original ran ``docker stop`` twice instead.
            subprocess.call(['docker', 'rm'] + containers)
        # Remove the analysed image by reference rather than feeding whole
        # ``docker images`` table rows to ``docker rmi``.
        subprocess.call(['docker', 'rmi', image])

In [2]:
# PARSE INSTALLED PACKAGES WITH THEIR VERSION NUMBERS AND DEBIAN VERSION
import json as js
import os
import codecs
import subprocess
import pandas as pd

# Get the packages and their versions found installed in containers, from the "dpkg -l" dump files

def parse_packages(dir,file):
    """Extract the installed packages from a saved ``dpkg -l`` listing.

    Reads ``<dir><file>_dpkg`` and keeps only the rows whose status column
    starts with ``ii``.  The second and third whitespace-separated fields of
    such a row are taken as the package name and its version.

    Parameters
    ----------
    dir : str
        Directory prefix (concatenated as-is, so it must end with a separator).
    file : str
        Base name of the dump; also used as the value of the ``name`` index.

    Returns
    -------
    pandas.DataFrame
        Columns ``package`` and ``version``, indexed by ``name``.

    Raises
    ------
    IOError / FileNotFoundError
        If the dump file does not exist.
    """
    packages=[]
    versions=[]
    # Read the file directly instead of spawning ``grep`` through a shell.
    with open(dir+file+"_dpkg", encoding='utf-8') as listing:
        for row in listing:
            if not row.startswith('ii'):
                continue
            # split() collapses the column padding, whatever its width.
            fields=row.split()
            if len(fields) < 3:
                continue  # malformed row without a version column
            packages.append(fields[1])
            versions.append(fields[2])

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    data = pd.DataFrame({'name':[file]*len(packages),
                         'package':packages,
                         'version':versions},
                        columns=['name','package','version'])
    return data.set_index('name')


def parse_release(dir,file):
    """Return the release string stored in ``<dir><file>_debian``.

    The value is the last line of the file with its newline removed.

    Raises
    ------
    ValueError
        If the file is empty (for instance because the ``docker run`` that
        should have produced it failed); previously this surfaced as an
        ``UnboundLocalError``.
    """
    path=dir+file+"_debian"
    with open(path) as handle:
        lines=handle.readlines()
    if not lines:
        raise ValueError("no release information found in "+path)
    return lines[-1].strip('\n')

def parse(dir, file):
    """Build the table of installed packages for one image dump.

    Combines ``parse_packages`` and ``parse_release``: every package row gets
    the raw ``release_number`` and a ``debian`` column derived from it, the
    part of the package name after the first ``:`` is dropped, and duplicated
    rows are removed.
    """
    # Release prefix -> value stored in the 'debian' column.
    prefix_to_codename = (
        ('8', 'jessie'),
        ('9', 'stretch'),
        ('7', 'wheezy'),
        ('6', 'squeeze'),
        ('buster', 'buster'),
    )

    def to_codename(release_number):
        for prefix, codename in prefix_to_codename:
            if release_number.startswith(prefix):
                return codename
        # Unknown releases are kept verbatim.
        return release_number

    def without_suffix(package):
        return package.split(':')[0]

    packages_frame = parse_packages(dir, file)
    packages_frame['release_number'] = parse_release(dir, file)
    packages_frame['debian'] = packages_frame['release_number'].apply(to_codename)
    packages_frame['package'] = packages_frame['package'].apply(without_suffix)

    return packages_frame.drop_duplicates()

In [3]:
###### TRACK THE PACKAGES
def debian_packages():
    """Load ``datasets/debian_packages_18March.csv`` as a DataFrame of strings.

    The file is ``;``-separated and every column is read with ``dtype=object``.
    Malformed rows are skipped with a warning instead of aborting the load.
    """
    path='datasets/debian_packages_18March.csv'
    options=dict(sep=';', dtype=object, index_col=None)
    try:
        # pandas >= 1.3: 'warn' matches the old error_bad_lines=False
        # (with its default warn_bad_lines=True).
        return pd.read_csv(path, on_bad_lines='warn', **options)
    except TypeError:
        # Older pandas does not know ``on_bad_lines``.
        return pd.read_csv(path, error_bad_lines=False, **options)

def track_packages(installled_packages):
    """Match installed packages against the Debian package catalogue.

    Both tables are joined on ``(package, version)``; rows containing any
    missing value after the join are discarded.  ``last_order`` and
    ``version_order`` are converted to integers and their difference is
    stored in a new ``outdate`` column.
    """
    join_keys = ['package', 'version']

    catalogue = debian_packages()
    installed_by_key = installled_packages.set_index(join_keys)
    catalogue_by_key = catalogue.set_index(join_keys)

    joined = installed_by_key.merge(
        catalogue_by_key,
        left_index=True,
        right_index=True,
        how='outer',
    )
    # Dropping incomplete rows after the outer join removes every row that
    # has a missing value in any column.
    tracked_packages = joined.reset_index().dropna()

    for order_column in ('last_order', 'version_order'):
        tracked_packages[order_column] = tracked_packages[order_column].apply(int)

    tracked_packages['outdate'] = (
        tracked_packages['last_order'] - tracked_packages['version_order']
    )
    return tracked_packages
    

In [4]:
###### IDENTIFY VULNERABILITIES
import json as js
import codecs

def parse_json_vuls():
    """Load ``datasets/vuls_15April.json`` (UTF-8) and return the decoded JSON.

    The file handle is closed as soon as the document has been parsed.
    """
    with codecs.open('datasets/vuls_15April.json', 'r', 'utf-8') as handle:
        return js.load(handle)

def dates_release_debian():
    """Build two lookup dictionaries from the Debian package catalogue.

    Returns
    -------
    (dict_date, dict_release)
        ``dict_date`` comes from the first row (by ascending ``date``) of each
        ``(source, source_version, release_snapshot)`` group, without the
        ``package`` and ``version`` columns.  ``dict_release`` comes from the
        first row of each ``(source, source_version)`` group, additionally
        without ``date``.  Both are ``DataFrame.to_dict()`` results, i.e.
        ``{column: {group key: value}}``.
    """
    chronological = debian_packages().sort_values('date', ascending=True)

    # Dates of the source versions, per release snapshot.
    first_per_snapshot = chronological.groupby(
        ['source', 'source_version', 'release_snapshot']
    ).first()
    dict_date = first_per_snapshot.drop(['package', 'version'], axis=1).to_dict()

    # Release information of the source versions.
    first_per_version = chronological.groupby(['source', 'source_version']).first()
    dict_release = first_per_version.drop(['package', 'version', 'date'], axis=1).to_dict()

    return dict_date, dict_release

def unique_installed_packages(tracked_packages):
    """Return one row per distinct ``(source, source_version)`` pair.

    The remaining columns hold the per-group non-null counts produced by
    ``groupby(...).count()``; ``package`` and ``version`` are dropped.
    """
    per_source_counts = tracked_packages.groupby(['source', 'source_version']).count()
    without_binary_columns = per_source_counts.drop(['package', 'version'], axis=1)
    return without_binary_columns.reset_index()

def final_vuls(tracked_packages):
    """Write the vulnerabilities affecting the tracked packages to a CSV file.

    For every distinct ``(source, source_version)`` of ``tracked_packages`` the
    CVE entries of that source are inspected for the release the source
    version belongs to.  A row is written to
    ``./datasets/docker_vulnerabilities.csv`` when

    * the status is ``open`` or ``undetermined`` (``fixed_version`` is then
      written as ``undefined``), or
    * a fixed version exists and the installed source version is dated
      before it; when the dates cannot be looked up or compared,
      ``compare_versions(source_version, fixed)`` decides instead.

    Entries lacking the expected keys are skipped.  Nothing is returned.
    """
    vulnerabilities=parse_json_vuls()
    dict_date, dict_release = dates_release_debian()
    sorted_ip=unique_installed_packages(tracked_packages)

    # ``with`` guarantees the CSV is flushed and closed even if a lookup
    # below raises.
    with open('./datasets/docker_vulnerabilities.csv','w') as fcsv:
        fcsv.write('source;source_version;urgency;status;fixed_version;debianbug;release;cve\n')

        for _, row in sorted_ip.iterrows():  # iterate over the sources found in the image
            source=row['source']
            source_version=row['source_version']
            release=dict_release['release_snapshot'][(source, source_version)]
            date_source=dict_date['date'][(source, source_version,release)]
            try:
                vuls=vulnerabilities[source]  # does the source have any vulnerability?
            except KeyError:
                continue

            for cve in vuls:
                if not cve.startswith('CVE'):
                    continue
                v=vuls[cve]
                try:
                    # Only the release of the installed source is considered.
                    release_info=v['releases'][release]
                    status=release_info['status']
                    urgency=release_info['urgency']

                    try:
                        debianbug=str(v['debianbug'])
                    except KeyError:
                        debianbug="undefined"

                    # Everything of the output row except the fixed version.
                    row_head=source+';'+source_version+';'+urgency+';'+status+';'
                    row_tail=';'+debianbug+';'+release+';'+cve+'\n'

                    if status == "open" or status == "undetermined":
                        fcsv.write(row_head+"undefined"+row_tail)
                        continue

                    # Resolved vulnerability: vulnerable only if older than the fix.
                    try:
                        fixed=release_info["fixed_version"]
                    except KeyError:
                        continue
                    try:
                        date_fixed=dict_date['date'][(source, fixed,release)]
                        vulnerable=date_source<date_fixed
                    except Exception:
                        # Unknown fixed version (or incomparable dates):
                        # fall back to comparing the version strings.
                        vulnerable=compare_versions(source_version,fixed)
                    if vulnerable:
                        fcsv.write(row_head+fixed+row_tail)
                except Exception:
                    # Best effort: an entry without data for this release (or
                    # with malformed data) is skipped.  Unlike a bare
                    # ``except:``, KeyboardInterrupt/SystemExit still propagate.
                    pass
    
def get_vuls(tracked_packages):
    """Regenerate the vulnerabilities CSV and return it as a DataFrame.

    Calls ``final_vuls`` (which rewrites
    ``./datasets/docker_vulnerabilities.csv``), reads the file back with every
    column as ``object`` and drops duplicated rows.
    """
    final_vuls(tracked_packages)

    path='./datasets/docker_vulnerabilities.csv'
    options=dict(sep=';', dtype=object, index_col=None)
    try:
        # pandas >= 1.3: 'warn' matches the old error_bad_lines=False.
        docker_vulnerabilities=pd.read_csv(path, on_bad_lines='warn', **options)
    except TypeError:
        # Older pandas does not know ``on_bad_lines``.
        docker_vulnerabilities=pd.read_csv(path, error_bad_lines=False, **options)

    return docker_vulnerabilities.drop_duplicates()

In [5]:
#### Compare between two version numbers
def calculate_version(source_version,fixed,operator):    
    """Recursively compare two split version numbers, component by component.

    ``compare_versions`` calls this with both versions split on ``'.'``; the
    recursive calls below also pass the result of further ``split`` calls
    (lists) and, in the epoch branches, plain strings (which are then walked
    character by character, since ``x[0]`` of a string is its first character).

    Returns a boolean.  ``final_vuls`` records a vulnerability when the result
    (via ``compare_versions``) is true, so ``True`` presumably means
    "``source_version`` is older than ``fixed``" -- TODO confirm.

    NOTE(review): this is a hand-rolled comparison, not the dpkg version
    ordering algorithm; results may differ from ``dpkg --compare-versions``.
    """
    try:
        # Leading component of each side; fails once either side is exhausted.
        f=fixed[0]
        n=source_version[0]
    except:
        # NOTE(review): both branches on ``operator`` are identical, so the
        # parameter has no effect here.  Result: True if source_version is
        # exhausted, False if only ``fixed`` is.
        if operator!='~':
            if len(source_version)==0:
                return True
            else:
                return False
        else:
            if len(source_version)==0:
                return True
            else:
                return False
    
    if n == f:
        # Identical leading components: compare the remainders.
        return calculate_version(source_version[1:],fixed[1:],operator)
    else:
        # Components differ.  If either one contains a separator, split on the
        # first separator found (in this fixed order) and recurse.
        for op in [':','-','~','+']:
            if op in n or op in f:
                if op==":":
                    # ':' handling: the part before ':' is compared as an integer.
                    if ':' in f and ':' not in n :
                        # NOTE(review): threshold is ``>1``, so a prefix of 1 on
                        # ``fixed`` alone is not decisive -- confirm intended.
                        if int(f.split(':')[0])>1:
                            return True
                        else:
                            # Both arguments are strings from here on.
                            return calculate_version(n,f.split(':')[1],op)
                            
                    elif ':' in n and ':' not in f :
                        if int(n.split(':')[0])>1:
                            return False
                        else:
                            return calculate_version(n.split(':')[1],f,op)
                    else:
                        # Both components contain ':'; compare the integer prefixes.
                        if int(n.split(':')[0])<int(f.split(':')[0]):
                            return True
                        elif int(n.split(':')[0])>int(f.split(':')[0]):
                            return False
                        else:
                            try:
                                # Same prefix: compare what follows numerically.
                                # If those integers are equal nothing is
                                # returned and the ``for`` loop moves on to the
                                # next separator.
                                if int(n.split(':')[1])<int(f.split(':')[1]):
                                    return True
                                elif int(n.split(':')[1])>int(f.split(':')[1]):
                                    return False
                            except:
                                # Non-numeric remainder: recurse on the lists
                                # of parts after the prefix.
                                return calculate_version(n.split(':')[1:],f.split(':')[1:], ':')
                else:
                    return calculate_version(n.split(op),f.split(op),op)
                
        # No separator decided the result: numeric comparison when both
        # components are integers, plain string comparison otherwise.
        try:
            if int(n)<int(f):
                return True
            else:
                return False
        except:
            if n<f:
                return True
            else:
                return False

def compare_versions(source_version,fixed):
    """Compare an installed source version with a fixed version.

    Identical strings yield ``True`` immediately.  Otherwise both versions are
    split on ``'.'`` and the decision is delegated to ``calculate_version``
    with ``'.'`` as operator.
    """
    if fixed == source_version:
        return True
    source_parts = source_version.split('.')
    fixed_parts = fixed.split('.')
    return calculate_version(source_parts, fixed_parts, '.')

In [6]:
#### MERGE FOUND VULNERABILITIES WITH INSTALLED PACKAGES
def merge_vuls(tracked_packages):
    """Attach the detected vulnerabilities to the tracked packages.

    The vulnerabilities returned by ``get_vuls`` are joined with
    ``tracked_packages`` on ``(source, source_version)``.  Rows with any
    missing value after the join are removed, as are duplicated rows.
    """
    join_keys = ['source', 'source_version']

    vuls = get_vuls(tracked_packages)  # vulnerabilities of the tracked sources

    packages_by_source = tracked_packages.set_index(join_keys)
    vuls_by_source = vuls.set_index(join_keys)
    joined = packages_by_source.merge(
        vuls_by_source,
        left_index=True,
        right_index=True,
        how='outer',
    )

    docker_vuls = joined.dropna().reset_index().drop_duplicates()
    return docker_vuls

In [None]:
#### BUGS: NOT YET IMPLEMENTED

In [7]:
# CONFIGURATION
# NOTE: `dir` and `file` shadow Python builtins, but download() reads them as
# globals under exactly these names, so they are kept.
image="debian:stretch-slim"        # Docker image to analyse
#image="debian:stretch-20171210"
# Must be a real boolean: the string "False" is non-empty and therefore truthy,
# which made download() run its clean-up branch.
removed=False
dir="./datasets/"                  # output prefix; must end with a path separator
file=image.replace('/','_')        # base name of the dump files

In [8]:
####### PROCESS #######
# 1. Dump the release files and the package list of the image.
download(image, removed)

# 2. Parse the dumps into the table of installed packages.
installed_packages = parse(dir, file)

# 3. Match the installed packages against the Debian catalogue.
tracked_packages = track_packages(installed_packages)

# 4. Attach the known vulnerabilities to the tracked packages.
docker_vuls = merge_vuls(tracked_packages)

# Summary of the three stages.
print('# installed packages', len(installed_packages))
print('# tracked packages', len(tracked_packages))
print('# vulnerabilities', len(docker_vuls))


# installed packages 76
# tracked packages 76
# vulnerabilities 88
