In [2]:
%matplotlib inline
import seaborn as sns; 
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib import gridspec
import scipy
style.use('ggplot')
import datetime
from io import StringIO
import re
import os
import numpy as np
from datetime import datetime, timedelta
import apt_pkg
apt_pkg.init_system()
import json as js
import codecs
import psycopg2

# Load Debian packages

In [3]:
# Build one row per (source, source_version, archive) carrying the earliest
# date recorded for that combination.
debian_packages = pd.read_csv('../../data/prepared_data/debian_packages.csv',
                              usecols=['source', 'source_version', 'archive', 'date'],
                              dtype=object)

# Parse the dates BEFORE ordering them: the column is read as plain strings
# (dtype=object), and sorting strings is only chronological when every value
# happens to share the same zero-padded, year-first layout.
debian_packages['date'] = pd.to_datetime(debian_packages['date'])

debian_packages = (debian_packages
                   .drop_duplicates()
                   .sort_values('date')
                   .groupby(['source', 'source_version', 'archive'])
                   .first()          # earliest date, thanks to the sort above
                   .reset_index()
                  )
debian_packages.shape

(270292, 4)

# EXTRACT BUGS

#### Extraction is already done; see the script *get_bugs.py*

In [4]:
# Raw bug extract; dtype=object keeps every column as untouched strings.
bugs = pd.read_csv(
    '../../data/prepared_data/bugs_extracted_20190830.csv',
    dtype=object,
)

In [5]:
# Both timestamp columns get the same treatment: keep the first
# whitespace-separated token (presumably the calendar date -- TODO confirm
# against the extract), then parse it.
for date_col in ('arrival', 'last_modified'):
    first_token = bugs[date_col].apply(lambda raw: raw.split()[0])
    bugs[date_col] = pd.to_datetime(first_token)

In [6]:
# Keep only what follows the last '/' in each version field. Values are
# stringified first, so a missing value becomes the literal string 'nan'
# (handle_bugs relies on that sentinel).
for version_col in ('fixed_in', 'found_in'):
    bugs[version_col] = bugs[version_col].apply(lambda raw: str(raw).rsplit('/', 1)[-1])

In [7]:
# Discard reports whose status is "fixed"/"pending-fixed" and reports whose
# severity is "fixed"/"wishlist"; work on an independent copy afterwards.
bugs = (bugs
        .query('status != "fixed" and status != "pending-fixed"')
        .query('severity != "fixed" and severity != "wishlist"')
        .copy()
       )

In [8]:
def sub_date(date, genre):
    """Return ``date`` shifted back by 28 days for archived reports.

    Parameters
    ----------
    date : datetime-like
        Value supporting subtraction of a ``timedelta``.
    genre : str
        Report type; only the exact value ``"archive"`` triggers the shift.

    Returns
    -------
    ``date - 28 days`` when ``genre == "archive"``, otherwise ``date`` unchanged.

    NOTE(review): the 28-day offset presumably mirrors the delay between a
    report being closed and being archived -- confirm.
    """
    return date - timedelta(days=28) if genre == "archive" else date
# Row-wise pass: adjust `last_modified` according to each report's `type`.
bugs['last_modified'] = bugs.apply(
    lambda row: sub_date(date=row['last_modified'], genre=row['type']),
    axis=1,
)

# Load installed packages
### Get this dataset from the "Prepare vuls" notebook

In [9]:
# Installed source packages, every column kept as a string.
sources = pd.read_csv(
    '../../data/for_analysis/sources_ideal.csv',
    dtype=object,
)

### Get source_version date

In [10]:
# Attach the date of each installed (source, source_version, archive);
# a left join keeps installed versions that have no known date.
sources = pd.merge(
    sources,
    debian_packages,
    how='left',
    on=['source', 'source_version', 'archive'],
)
sources.shape

(1222801, 6)

# Identify bugs of installed source packages

In [11]:
# Pair every distinct installed (source, source_version, date) with all the
# reports filed against the same source package.
bugs = pd.merge(
    sources[['source', 'source_version', 'date']].drop_duplicates(),
    bugs,
    how='left',
    on='source',
)
# Sources without any matching report come out of the left join with an
# empty `last_modified`; drop those rows.
bugs = bugs[bugs['last_modified'].notna()]
bugs.shape

(2901790, 11)

In [12]:
def handle_bugs(source_version, status, fixed_in, found_in, last_modified, date ):
    """Tell whether a bug report applies to one source package version.

    A report is only ever kept when the version it was found in is not newer
    than ``source_version`` (ordering given by ``apt_pkg.version_compare``).
    On top of that:

    * ``fixed_in`` known (anything but the string ``'nan'``):
      ``source_version`` must be strictly older than ``fixed_in``;
    * ``fixed_in == 'nan'`` and ``status`` is not ``"done"``:
      no further condition;
    * ``fixed_in == 'nan'`` and ``status`` is ``"done"``:
      ``str(last_modified)`` must compare greater than ``str(date)``.

    NOTE(review): the last rule compares the *string* forms of the two dates,
    which is chronological only when both render in the same sortable
    layout -- confirm for the inputs used.

    Returns
    -------
    bool
    """
    found_not_later = apt_pkg.version_compare(str(found_in), str(source_version)) <= 0

    # A fix version is recorded: affected iff found_in <= version < fixed_in.
    if fixed_in != 'nan':
        return (found_not_later
                and apt_pkg.version_compare(str(source_version), str(fixed_in)) < 0)

    # No fix version and the report is not "done".
    if status != "done":
        return found_not_later

    # "done" without a fix version: fall back on the dates.
    return (found_not_later and str(last_modified) > str(date))

In [13]:
# Flag, row by row, whether the report applies to the installed version.
bugs['filtre'] = bugs.apply(
    lambda row: handle_bugs(
        source_version=row['source_version'],
        status=row['status'],
        fixed_in=row['fixed_in'],
        found_in=row['found_in'],
        last_modified=row['last_modified'],
        date=row['date'],
    ),
    axis=1,
)

In [14]:
# Keep the flagged rows only, then collapse to a single row per
# (debianbug, source, source_version).
bugs = (bugs
        .loc[lambda frame: frame['filtre'] == True]
        .drop(columns=['date', 'filtre'])
        .groupby(['debianbug', 'source', 'source_version'])
        .first()
        .reset_index()
       )
bugs['debianbug'] = bugs['debianbug'].map(int)
bugs.shape

(359200, 10)

In [15]:
# Persist the bugs of the installed versions (row index not written).
bugs.to_csv(
    '../../data/for_analysis/bugs.csv',
    index=False,
)

# Let's get the bugs for the latest updates

In [16]:
# Second pass: start again from the raw extract.
# dtype=object matches the first load of this same file. Without it pandas
# infers column types, and version strings that look numeric can be parsed
# as floats (e.g. "1.10" -> 1.1), which would corrupt the version comparisons.
bugs = pd.read_csv('../../data/prepared_data/bugs_extracted_20190830.csv', dtype=object)
# Keep the first whitespace-separated token of each timestamp, then parse it.
bugs['arrival'] = pd.to_datetime(bugs['arrival'].apply(lambda x: x.split()[0]))
bugs['last_modified'] = pd.to_datetime(bugs['last_modified'].apply(lambda x: x.split()[0]))
# Keep what follows the last '/'; missing values become the literal 'nan'.
bugs['fixed_in'] = bugs['fixed_in'].apply(lambda x: str(x).split('/')[-1])
bugs['found_in'] = bugs['found_in'].apply(lambda x: str(x).split('/')[-1])
# Same status / severity exclusions as in the first pass.
bugs = bugs.query('status != "fixed" and status != "pending-fixed"').copy()
bugs = bugs.query('severity != "fixed" and severity != "wishlist"').copy()

In [17]:
# `sub_date` is already defined earlier in this notebook (first pass); it is
# reused here instead of being redefined, so both passes share one definition.
bugs['last_modified'] = bugs.apply(lambda d: sub_date(d['last_modified'], d['type']), axis=1)

### Get date of ideal source version

In [18]:
# Switch from the installed version to the ideal one: `ideal_source` takes the
# place of `source_version`, and its date is looked up in debian_packages.
sources = pd.merge(
    sources[['source', 'ideal_source', 'archive']]
        .drop_duplicates()
        .rename(columns={'ideal_source': 'source_version'}),
    debian_packages,
    how='left',
    on=['source', 'source_version', 'archive'],
)
sources.shape

(15320, 4)

In [19]:
# Pair every distinct ideal (source, source_version, date) with all the
# reports filed against the same source package.
bugs = pd.merge(
    sources[['source', 'source_version', 'date']].drop_duplicates(),
    bugs,
    how='left',
    on='source',
)
# Rows produced by the left join for sources without any report have an
# empty `last_modified`; drop them.
bugs = bugs[bugs['last_modified'].notna()]
bugs.shape

(2892954, 11)

In [20]:
# Flag, row by row, whether the report applies to the ideal version.
bugs['filtre'] = bugs.apply(
    lambda row: handle_bugs(
        source_version=row['source_version'],
        status=row['status'],
        fixed_in=row['fixed_in'],
        found_in=row['found_in'],
        last_modified=row['last_modified'],
        date=row['date'],
    ),
    axis=1,
)

In [21]:
# Keep the flagged rows only, then collapse to a single row per
# (debianbug, source, source_version).
bugs = (bugs
        .loc[lambda frame: frame['filtre'] == True]
        .drop(columns=['date', 'filtre'])
        .groupby(['debianbug', 'source', 'source_version'])
        .first()
        .reset_index()
       )
bugs['debianbug'] = bugs['debianbug'].map(int)
bugs.shape

(364553, 10)

# Merge the two datasets

In [24]:
# Reload the bugs of the installed versions saved earlier in this notebook;
# every column comes back as a string because of dtype=object.
bugs_all = pd.read_csv(
    '../../data/for_analysis/bugs.csv',
    dtype=object,
)
bugs_all.head(n=2)

Unnamed: 0,debianbug,source,source_version,status,severity,arrival,last_modified,found_in,fixed_in,type
0,100028,ppp,2.4.6-3.1,pending,normal,2001-06-08,2008-06-30,2.4.1-1,,normal
1,100028,ppp,2.4.7-1+4,pending,normal,2001-06-08,2008-06-30,2.4.1-1,,normal


In [26]:
# `bugs_all` was reloaded from CSV with dtype=object, so its bug ids and dates
# are strings, whereas the in-memory `bugs` frame holds integer ids and parsed
# dates; a missing `fixed_in` is also NaN on one side and the literal 'nan' on
# the other. A plain row-wise drop_duplicates() therefore never matches a
# reloaded row with its in-memory twin, and the object-typed date columns
# would mix Timestamps with strings in the written file.
# Align the dtypes first, then de-duplicate on the key both frames were built
# with (one row per bug and source version); the in-memory row wins on ties.
bugs_all_typed = bugs_all.assign(
    debianbug=bugs_all['debianbug'].astype(int),
    arrival=pd.to_datetime(bugs_all['arrival']),
    last_modified=pd.to_datetime(bugs_all['last_modified']),
)
bugs = (pd.concat([bugs, bugs_all_typed], ignore_index=True)
        .drop_duplicates(subset=['debianbug', 'source', 'source_version'])
       )

In [28]:
# Overwrite bugs.csv with the merged result (row index not written).
bugs.to_csv(
    '../../data/for_analysis/bugs.csv',
    index=False,
)