In [1]:
%matplotlib inline
from imports import *

# Load installed source packages

In [2]:
# Installed source packages; dtype=object keeps every column exactly as read
# (no numeric inference).
sources = pd.read_csv(
    '../../data/for_analysis/sources_ideal.csv',
    dtype=object,
)
sources.shape

(1222801, 5)

# Load vulnerabilities and clean them

In [5]:
# Read the vulnerabilities, discard the 'archive' column and remove the rows
# that become exact duplicates once that column is gone.
vuls = (pd.read_csv('../../data/for_analysis/vulnerabilities.csv')
        .drop('archive', axis=1)
        .drop_duplicates()
       )
vuls.shape

(63601, 7)

In [6]:
# Filter out the vulnerabilities that we don't want.
# Every spelling of an urgency level (plain, '*' and '**') is collapsed onto a
# single label; urgencies mapped to NaN are removed by the dropna() below.
urgency_map = {'low**': 'Low',
               'low*': 'Low',
               'low': 'Low',
               'medium**': 'Medium',
               # This key used to be a second 'medium**', which left 'medium*'
               # unmapped (it would have survived as its own urgency value).
               'medium*': 'Medium',
               'medium': 'Medium',
               'high**': 'High',
               'high*': 'High',
               'high': 'High',
               # float('nan') instead of pd.np.nan: the pandas.np alias was
               # deprecated in pandas 1.0 and removed in 2.0.
               'unimportant': float('nan'),
               'not yet assigned': float('nan'),
               'end-of-life': float('nan'),
              }

# Rows whose status is "undetermined" are excluded; .copy() detaches the
# result from the original frame.
vuls = vuls.query('status != "undetermined"').copy()
vuls = (vuls
        .replace({'urgency': urgency_map})   # normalise the urgency labels
        .dropna(subset=['urgency'])          # drop the urgencies mapped to NaN
        .drop_duplicates()
       )

In [7]:
# Number of non-null 'cve' entries per (source, source_version, urgency).
vuls = (vuls
        .groupby(['source', 'source_version', 'urgency'])
        .count()[['cve']]
        .reset_index()
       )

In [8]:
# Long -> wide: one row per (source, source_version), one column per urgency
# value holding the 'cve' count; combinations that do not occur become 0.
vuls = vuls.pivot_table(
    index=['source', 'source_version'],
    columns='urgency',
    values='cve',
).reset_index().fillna(0)

# Load bugs and clean them

In [9]:
# Bug reports, read with pandas' default dtype inference.
bugs = pd.read_csv(
    '../../data/for_analysis/bugs.csv',
)
bugs.shape

(371639, 10)

In [10]:
# Collapse the listed bug severities into two groups: 'Lower' and 'Higher'.
# Severity values that are not listed here are left unchanged by replace().
severity_map = dict.fromkeys(['minor', 'normal'], 'Lower')
severity_map.update(
    dict.fromkeys(['important', 'grave', 'serious', 'critical'], 'Higher')
)

bugs = bugs.replace({'severity': severity_map}).drop_duplicates()

In [11]:
# Number of non-null 'debianbug' entries per (source, source_version, severity).
bugs = (bugs
        .groupby(['source', 'source_version', 'severity'])
        .count()[['debianbug']]
        .reset_index()
       )

In [12]:
# Long -> wide: one row per (source, source_version), one column per severity
# value holding the 'debianbug' count; combinations that do not occur become 0.
bugs = bugs.pivot_table(
    index=['source', 'source_version'],
    columns='severity',
    values='debianbug',
).reset_index().fillna(0)

# Now we merge vuls and bugs with INSTALLED PACKAGES

In [13]:
# Shape of `sources` before any merge; the cells below display the shape again
# after each merge.
sources.shape

(1222801, 5)

In [14]:
# Starting with vulnerabilities: attach the per-urgency counts to each row of
# `sources`, matching on (source, source_version). The left join keeps every
# row of `sources`; every missing value in the merged frame is then set to 0.
sources = sources.merge(
    vuls,
    how='left',
    on=['source', 'source_version'],
).fillna(0)
sources.shape

(1222801, 8)

In [15]:
# Then bugs: attach the per-severity counts to each row of `sources`, matching
# on (source, source_version). The left join keeps every row of `sources`;
# every missing value in the merged frame is then set to 0.
sources = sources.merge(
    bugs,
    how='left',
    on=['source', 'source_version'],
).fillna(0)
sources.shape

(1222801, 10)

# Now we merge vuls and bugs with IDEAL package versions

In [17]:
# Starting with vulnerabilities: rename the version key to 'ideal_source' so
# the counts are joined on the `ideal_source` column of `sources`, and give the
# count columns an 'l_' prefix so they get names distinct from High/Low/Medium.
vuls = vuls.rename(columns={
    'source_version': 'ideal_source',
    'High': 'l_High',
    'Low': 'l_Low',
    'Medium': 'l_Medium',
})

# Left join keeps every row of `sources`; every missing value becomes 0.
sources = sources.merge(
    vuls,
    how='left',
    on=['source', 'ideal_source'],
).fillna(0)
sources.shape

(1222801, 13)

In [18]:
# Then bugs: rename the version key to 'ideal_source' so the counts are joined
# on the `ideal_source` column of `sources`, and give the count columns an
# 'l_' prefix so they get names distinct from Higher/Lower.
bugs = bugs.rename(columns={
    'source_version': 'ideal_source',
    'Higher': 'l_Higher',
    'Lower': 'l_Lower',
})

# Left join keeps every row of `sources`; every missing value becomes 0.
sources = sources.merge(
    bugs,
    how='left',
    on=['source', 'ideal_source'],
).fillna(0)
sources.shape

(1222801, 15)

In [19]:
# Write the merged table to disk, leaving out the DataFrame index.
sources.to_csv(
    '../../data/for_analysis/vb_lag_ideal.csv',
    index=False,
)