# Set up our vulnerability data science lab environment

First we'll import all the libraries we need. A couple of them need to be installed first. The `jq` package provides Python bindings for jq, a tool for querying JSON really fast. When looking at 25 years of vulnerabilities it is enormously useful.

In [1]:
#!pip install requests
#!pip install hurst
#!pip install jq

In [2]:
import requests
import gzip as gz
import shutil
import pandas as pd
import json
import datetime
import tqdm
import os
import jq
import json
#import matplotlib.pyplot as plt
#plt.rcParams['font.size'] = '22'
#plt.rcParams['font.weight'] = 'bold'
# Increase size of plot in jupyter
#plt.rcParams["figure.figsize"] = (20,12)
import itertools
#import seaborn as sns
import numpy as np
import datetime
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
import re
from pandas.plotting import autocorrelation_plot

We need to create the folders where we will store the data, both as gzip archives and as decompressed JSON files.

In [3]:
# Create the folders that hold the NVD feeds: GZIP for the downloaded
# archives and JSON for the decompressed files.
# os.makedirs(..., exist_ok=True) creates missing parents and is a no-op when
# the folder is already there, so re-running the cell is safe and a partially
# created tree (CVE-NVD present but a subfolder missing) is repaired instead
# of being skipped.
for folder in ('CVE-NVD/GZIP', 'CVE-NVD/JSON'):
    os.makedirs(folder, exist_ok=True)

Let's also set up some other folders for MITRE's advance views of CVE data.

In [4]:
# Create the folder that holds MITRE's CSV export of the CVE list.
# exist_ok=True makes the cell re-runnable and also creates CVE-MITRE/CSV when
# only the parent CVE-MITRE folder exists.
os.makedirs('CVE-MITRE/CSV', exist_ok=True)

# Take the timestamp used to build the file name below. This name was
# previously referenced without ever being assigned, which raised NameError.
current_datetime = datetime.datetime.now()

# convert datetime obj to string
str_current_datetime = str(current_datetime)

# build a timestamped file name with a .txt extension (no file is opened here)
# NOTE(review): str(datetime) contains spaces and colons, which are not valid
# in Windows file names -- confirm how file_name is used before relying on it.
file_name = str_current_datetime+".txt"

# Download the CVE data from NVD and MITRE

Now we'll download the NVD data for every year since 1999. Don't worry, it's faster than you think.  
PROTIP: The progress bar comes for free from the tqdm package. Just wrap the iterable of a for loop in tqdm.tqdm() and you get a progress bar.
After this tutorial, if you keep this notebook, you'll always be able to fetch all this CVE data easily. That is handy for many more things than just forecasting.

In [5]:
# Download one gzip-compressed NVD JSON feed per year into CVE-NVD/GZIP.
now = datetime.datetime.now()
#PROTIP wrap an iterative loop in python with tqdm.tqdm() and you get a progress bar
for i in tqdm.tqdm(range(1999,now.year+1)):
    url = 'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-'+str(i)+'.json.gz'
    # timeout: without one, requests waits forever on a stalled connection.
    # with-block: a streamed response keeps its connection open until closed.
    with requests.get(url, stream=True, timeout=60) as req:
        if not req.ok:
            # Keep the best-effort behaviour (the body is still saved), but say
            # so: an error page stored as .json.gz is not a valid archive.
            print(f'WARNING: {url} returned HTTP {req.status_code}; '
                  'the saved file may not be a valid gzip archive')
        with open('CVE-NVD/GZIP/nvdcve-1.1-'+str(i)+'.json.gz', 'wb') as f:
            # 64 KiB chunks; the file is flushed once when the with-block
            # closes it, so no per-chunk flush is needed.
            for chunk in req.iter_content(chunk_size=65536):
                if chunk:
                    f.write(chunk)

100%|██████████| 27/27 [00:01<00:00, 16.24it/s]


Here we need to decompress all those gzip files so we can work with the data as json files.

In [6]:
# Inflate every yearly archive from CVE-NVD/GZIP into a plain .json file of
# the same base name under CVE-NVD/JSON. `now` comes from the download cell.
for i in tqdm.tqdm(range(1999, now.year + 1)):
    gz_path = f'CVE-NVD/GZIP/nvdcve-1.1-{i}.json.gz'
    json_path = f'CVE-NVD/JSON/nvdcve-1.1-{i}.json'
    # Stream the decompressed bytes straight to disk; both handles are closed
    # when the with-block exits.
    with gz.open(gz_path, 'rb') as packed, open(json_path, 'wb') as unpacked:
        shutil.copyfileobj(packed, unpacked)

100%|██████████| 27/27 [00:10<00:00,  2.53it/s]


Now we need to download the MITRE version of CVEs too, which gives us a different kind of information that is useful later. Specifically, it gives us a view of CVEs that didn't make the cut, and some other timestamps we can use to show when CVEs were submitted, as opposed to published. That can help us calculate the rate of publication.

In [7]:
# Download MITRE's full CVE list as CSV into CVE-MITRE/CSV.
url = 'https://cve.mitre.org/data/downloads/allitems.csv'
now = datetime.datetime.now()
# timeout: without one, requests waits forever on a stalled connection.
# with-block: a streamed response keeps its connection open until closed.
with requests.get(url, stream=True, timeout=60) as req:
    if not req.ok:
        # Keep the best-effort behaviour (the body is still saved), but say so:
        # an error page stored as .csv would silently corrupt later analysis.
        print(f'WARNING: {url} returned HTTP {req.status_code}; '
              'the saved file may not be the expected CSV')
    with open('CVE-MITRE/CSV/allitems_current.csv', 'wb') as f:
        # 64 KiB chunks; the file is flushed once when the with-block closes
        # it, so no per-chunk flush is needed.
        for chunk in req.iter_content(chunk_size=65536):
            if chunk:
                f.write(chunk)

# Convert the data to panda dataframes and csv files

Here we start to use JQ to make queries specific to the CVE JSON structure. We pull out the CVE ID, the published date, the assigner, the description, the CVSS v2 and v3 scores and vectors, the CWE, and the CPE 2.3 strings.

In [8]:
# Compiled jq programs, one per field pulled out of a yearly NVD feed document.
# Every program iterates `.CVE_Items[]`; a column stays aligned with the ID
# column only when its program emits exactly one output per CVE item.
cve_id_query = jq.compile(".CVE_Items[].cve.CVE_data_meta.ID")
cve_publication_query = jq.compile(".CVE_Items[].publishedDate")
cve_assigner_query = jq.compile(".CVE_Items[].cve.CVE_data_meta.ASSIGNER")
# Flat variant: one output per description entry, not per CVE. It is no longer
# used to build the table below and is kept only so the name stays defined.
cve_description_query = jq.compile(".CVE_Items[].cve.description.description_data[].value")
cvss_v2_score_query = jq.compile(".CVE_Items[].impact.baseMetricV2.cvssV2.baseScore")
cvss_v2_exploitability_score_query = jq.compile(".CVE_Items[].impact.baseMetricV2.cvssV2.exploitabilityScore")
cvss_v2_vector_query = jq.compile(".CVE_Items[].impact.baseMetricV2.cvssV2.vectorString")
cvss_v3_score_query = jq.compile(".CVE_Items[].impact.baseMetricV3.cvssV3.baseScore")
cvss_v3_exploitability_score_query = jq.compile(".CVE_Items[].impact.baseMetricV3.cvssV3.exploitabilityScore")
cvss_v3_vector_query = jq.compile(".CVE_Items[].impact.baseMetricV3.cvssV3.vectorString")
cpe23_query = jq.compile('.CVE_Items[] | [.cve.CVE_data_meta.ID, .configurations.nodes[].cpe_match[].cpe23Uri]')
#Product, cpe strings?, cwe
# Flat variant: one output per CWE entry, not per CVE. It is no longer used to
# build the table below and is kept only so the name stays defined.
cwe_query = jq.compile(".CVE_Items[].cve.problemtype.problemtype_data[].description[].value")
# Per-CVE variants used for the table: all entries of one CVE are collected and
# joined into a single string, so a CVE with zero or several entries still
# yields exactly one output and the row stays aligned with its ID.
cve_description_per_cve_query = jq.compile('.CVE_Items[] | [.cve.description.description_data[].value] | join(" ")')
cwe_per_cve_query = jq.compile('.CVE_Items[] | [.cve.problemtype.problemtype_data[].description[].value] | join("|")')
#Add CVE state in v5, date-assigned, date_requested, date public, product,cpe_string


def _jq_lines(query, data):
    """Run a compiled jq program on *data* and return its text output split into lines."""
    return query.input(data).text().split('\n')


now = datetime.datetime.now()
collector = []
for i in tqdm.tqdm(range(2002, now.year+1)):
    # JSON is UTF-8; naming the encoding avoids decode failures on platforms
    # whose default encoding differs.
    with open('CVE-NVD/JSON/nvdcve-1.1-'+str(i)+'.json', encoding='utf-8') as json_file:
        try:
            data = json.load(json_file)
        except ValueError as err:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses. Still skip the unreadable year, but report it; a bare
            # except would also have swallowed KeyboardInterrupt.
            print(f'Skipping {json_file.name}: {err}')
            continue
    # The file is closed here by the with-block; everything needed is in `data`.

    # jq text output keeps the JSON quoting, so strip the quotes from the
    # values that are parsed or used as identifiers.
    pubs = [pd.to_datetime(ts.strip('"'), yearfirst=True)
            for ts in _jq_lines(cve_publication_query, data)]
    cves = [cve.strip('"') for cve in _jq_lines(cve_id_query, data)]
    # One line per CVE, holding the ID followed by its CPE 2.3 URIs.
    cpe23_string_list = [s.split(",") for s in _jq_lines(cpe23_query, data)]

    # NOTE(review): 'v3 Expoitability Score' is misspelled, but it is a column
    # name that ends up in the CSV; confirm nothing reads it before renaming.
    columns = [
        pd.Series(pubs, name='Publication'),
        pd.Series(cves, name='ID'),
        pd.Series(_jq_lines(cve_assigner_query, data), name='ASSIGNER'),
        pd.Series(_jq_lines(cve_description_per_cve_query, data), name='DESCRIPTION'),
        pd.Series([1]*len(cves), name='Count'),
        pd.Series(_jq_lines(cvss_v2_score_query, data), name='v2 CVSS'),
        pd.Series(_jq_lines(cvss_v2_vector_query, data), name='v2 Vector'),
        pd.Series(_jq_lines(cvss_v2_exploitability_score_query, data), name='v2 Exploitability Score'),
        pd.Series(_jq_lines(cvss_v3_score_query, data), name='v3 CVSS'),
        pd.Series(_jq_lines(cvss_v3_vector_query, data), name='v3 Vector'),
        pd.Series(_jq_lines(cvss_v3_exploitability_score_query, data), name='v3 Expoitability Score'),
        pd.Series(_jq_lines(cwe_per_cve_query, data), name='CWE'),
        pd.Series(cpe23_string_list, name='v2.3 CPE'),
    ]
    # The Series are combined by position, which is why each one must carry
    # exactly one value per CVE.
    vulns = pd.concat(columns, axis=1).set_index('Publication')
    collector.append(vulns)
all_items = pd.concat(collector)

100%|██████████| 24/24 [11:49<00:00, 29.55s/it]


Save all the data we just filtered to a CSV file, for future use.

In [9]:
# DataFrame.sort_index() returns a new, sorted frame and leaves the original
# untouched, so the result must be assigned back; otherwise the CSV is written
# in the unsorted per-file order.
all_items = all_items.sort_index()
all_items.to_csv('NVD-Vulnerability-Volumes.csv')

If you want to read that file in the future without fetching all the data again, just run the cell below.

In [10]:
# Reload the saved table, using the parsed 'Publication' column as the index,
# and return it ordered by that index.
all_items = pd.read_csv(
    'NVD-Vulnerability-Volumes.csv',
    index_col=['Publication'],
    parse_dates=['Publication'],
    low_memory=False,
).sort_index()