In [1]:
import re
import sys

import nltk
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

# nvdlib has to be installed (using a virtual environment is recommended)
from nvdlib.nvd import NVD

# add the experimental project to the path
sys.path.append("../src")

from toolkit.transformers import Hook, NBClassifier, cross_validate, FeatureExtractor
from toolkit.preprocessing import NVDFeedPreprocessor, LabelPreprocessor
from toolkit import pipelines, utils

### Get the NVD Feed

In [2]:
# NOTE(review): presumably resets the registry of already created Hook instances so
# that re-running the notebook does not clash on hook keys — confirm in toolkit.transformers
Hook.clear_current_instances()

In [3]:
# get the nvd feed (only the feed named 2017 is requested)
feed = NVD.from_feeds(feed_names=[2017])
# NOTE(review): presumably downloads / refreshes the local copy of the feed — confirm against nvdlib
feed.update()

# materialize feed.cves() into a list once; DATA is reused by the cells below
DATA = list(feed.cves())

In [4]:
# first step: presumably projects every CVE onto the listed attributes — TODO confirm
nvd_prep = NVDFeedPreprocessor(['cve_id', 'project', 'description'])

# second step: labels the records using `utils.find_` wrapped in a hook
label_prep = LabelPreprocessor(
    feed_attributes=['project', 'description'],
    output_attributes=['cve_id', 'project', 'description'],
    hook=Hook(key='label_hook', func=utils.find_),
)

pipeline = Pipeline(steps=[('nvd_prep', nvd_prep), ('label_prep', label_prep)])

# run both steps over the whole feed
series = pipeline.fit_transform(DATA)

In [5]:
# tabular view of the pipeline output; used further below to look up the rows behind each prediction
df = pd.DataFrame(series)

# Grid search

When performing grid search, exhaustive iteration over all combinations of feature extractor hooks will be performed in order to find an optimal combination.

The values used for evaluation of those combinations will be:
- precision
- cross validation mean
- cross validation standard deviation

The cross validation is 10-fold and is run on the training part of a 20 % / 80 % train-test split (train : test = 1 : 4).

In [6]:
# grid-search
from itertools import combinations, product, compress

# x-val
from sklearn.model_selection import train_test_split, KFold

# feature hooks
from toolkit.transformers import feature_hooks

In [7]:
# lookup table: CVE id -> CVE object (handed to the vendor/product matching hook below)
cve_dict = {cve.cve_id: cve for cve in DATA}

In [8]:
# NOTE(review): presumably `default_kwargs` is forwarded to the hook function when it
# is invoked, giving it access to the CVE lookup table — confirm in toolkit.transformers
feature_hooks.vendor_product_match_hook.default_kwargs = {'cve_dict': cve_dict}

# Candidate feature hooks. The grid search switches each of them on or off via a
# binary mask, so the ORDER of this list defines the meaning of every mask position.
FEATURE_HOOKS = [
    feature_hooks.is_alnum_hook,
    feature_hooks.has_uppercase_hook,
    feature_hooks.ver_follows_hook,
    feature_hooks.ver_pos_hook,
    feature_hooks.word_len_hook,
    feature_hooks.vendor_product_match_hook
]

In [9]:
# Create prediction filters
from nltk.corpus import stopwords

# built once as a set so that every filter call is a plain membership test
STOPWORDS = set(stopwords.words('english'))
def stopwords_filter(t):
    """Keep a candidate unless its word is an English stop word.

    :param t: candidate whose first item is a ``(word, tag)`` pair
    :returns: True when the lower-cased word is not in STOPWORDS
    """
    token, _tag = t[0]
    lowered = token.lower()
    return lowered not in STOPWORDS

def version_filter(t):
    """Keep a candidate unless its word is the ``<VERSION>`` placeholder.

    :param t: candidate whose first item is a ``(word, tag)`` pair
    :returns: False for the literal word '<VERSION>', True otherwise
    """
    token, _tag = t[0]
    is_placeholder = token == '<VERSION>'
    return not is_placeholder

def num_tag_filter(t):
    """Keep a candidate unless it is tagged as ``NUM``.

    :param t: candidate whose first item is a ``(word, tag)`` pair
    :returns: False when the tag equals 'NUM', True otherwise
    """
    _token, pos_tag = t[0]
    is_number = pos_tag == 'NUM'
    return not is_number

# all three filters are passed together as `filter_hooks` to the classifier calls below
prediction_filters = [stopwords_filter, version_filter, num_tag_filter]

In [10]:
def parallel_evaluation(mask):
    """Evaluate the classifier for the feature hooks selected by `mask`.

    Written as a single-argument function so it can be mapped over all masks
    by a multiprocessing pool.

    :param mask: iterable of truthy / falsy flags, one per entry of
        FEATURE_HOOKS; a truthy flag enables the hook at the same position
    :returns: tuple ``(precision, xval_mean, xval_std)`` where ``precision`` is
        the value returned by ``clf.evaluate`` on the held-out samples and the
        other two are the mean / std of the cross validation score
    """
    selected_hooks = list(compress(FEATURE_HOOKS, mask))

    featureset, labels = pipelines.extract_labeled_features(
        data=DATA,
        nvd_attributes=['cve_id', 'description'],
        nltk_feed_attributes=['description'],
        feature_hooks=selected_hooks,
        labeling_func=utils.find_,
    )

    # train on 20 % and test on the remaining 80 % of the samples to show how
    # well the model generalizes; the fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        featureset, labels, test_size=0.8, random_state=0
    )

    clf = NBClassifier().fit(X_train)

    # keyword arguments shared by the hold-out evaluation and the cross validation
    evaluation_kwargs = dict(sample=True, n=3, filter_hooks=prediction_filters)

    # score on the held-out samples
    precision = clf.evaluate(X_test[:], y_test[:], **evaluation_kwargs)

    # cross validation on the training samples only
    xval_score = cross_validate(
        clf, X_train, y_train, shuffle=True, **evaluation_kwargs
    )

    return precision, xval_score.mean, xval_score.std

### Validation

In [11]:
import os
import sys

from multiprocessing import Pool


# every on/off combination of the feature hooks, one tuple of 0/1 flags per combination
mask_list = list(product((0, 1), repeat=len(FEATURE_HOOKS)))

# one worker process per CPU reported by the OS
n_cores = os.cpu_count()

# schedule one evaluation per mask; `proc` is the async result collected in the next cell
pool = Pool(processes=n_cores)
proc = pool.map_async(parallel_evaluation, mask_list)

# no further tasks are submitted; block until every worker has finished
pool.close()
pool.join()

In [12]:
# collect the per-mask results in the order of mask_list; get() re-raises an exception raised in a worker
eval_results = proc.get()

In [13]:
# one row per mask: the three metrics followed by the 0/1 flag of every hook
eval_rows = [
    tuple(metrics) + tuple(mask)
    for metrics, mask in zip(eval_results, mask_list)
]

metric_columns = ['precision', 'xval_mean', 'xval_std']
hook_columns = [hook.key for hook in FEATURE_HOOKS]

df_eval = pd.DataFrame(data=eval_rows, columns=metric_columns + hook_columns)

# best cross validation mean first
df_eval.sort_values(by='xval_mean', ascending=False).style.set_properties(width='10em')

Unnamed: 0,precision,xval_mean,xval_std,is_alnum,has_uppercase,ver_follows,ver_pos,word_len,vendor_product_match
63,0.946128,0.955977,0.0217545,1,1,1,1,1,1
61,0.947811,0.955977,0.030804,1,1,1,1,0,1
49,0.947811,0.955977,0.030804,1,1,0,0,0,1
53,0.947811,0.955977,0.030804,1,1,0,1,0,1
55,0.946128,0.955977,0.0217545,1,1,0,1,1,1
57,0.947811,0.955977,0.030804,1,1,1,0,0,1
59,0.946128,0.955977,0.0217545,1,1,1,0,1,1
51,0.946128,0.955977,0.0217545,1,1,0,0,1,1
21,0.947811,0.953103,0.04515,0,1,0,1,0,1
29,0.947811,0.953103,0.04515,0,1,1,1,0,1


#### Examination of the best feature hook combination

In [30]:
# feature hooks enabled by the mask at position 55 of the grid
best_hooks = list(compress(FEATURE_HOOKS, mask_list[55]))

featureset, labels = pipelines.extract_labeled_features(
    data=DATA,
    nvd_attributes=['cve_id', 'description'],
    nltk_feed_attributes=['description'],
    feature_hooks=best_hooks,
    labeling_func=utils.find_,
)

# same split as in the grid search: train on 20 %, test on 80 %, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    featureset, labels, test_size=0.8, random_state=0
)

clf = NBClassifier().fit(X_train)

# cross validation on the training samples only
score = cross_validate(
    clf, X_train, y_train,
    shuffle=True, n=3, sample=True, filter_hooks=prediction_filters,
)

In [31]:
# display the cross validation result (per-fold values, mean and std)
score

Score(values=array([1.        , 0.96666667, 0.96666667, 0.9       , 0.96666667,
       1.        , 0.96551724, 0.86206897, 0.96551724, 0.96551724]), mean=0.9558620689655172, std=0.040551098595534955)

In [32]:
# score.std * 200 is 2 * std expressed in percent, i.e. the printed "+/-" spans two standard deviations
print("Cross validation accuracy: {:5.2f} (+/- {:5.3f}) %".format(score.mean * 100, score.std * 200))

Cross validation accuracy: 95.59 (+/- 8.110) %


In [33]:
# NOTE(review): fit_predict presumably re-fits `clf` on the full featureset (which
# also contains the test split) before predicting — confirm, because later cells
# reuse `clf` for evaluation on X_test.
candidate_arr = clf.fit_predict(featureset, n=3, sample=True, filter_hooks=prediction_filters)

# position in featureset -> list of predicted candidate words
incorrect_predictions = dict()
correct_predictions = dict()

for index, (candidates, label) in enumerate(zip(candidate_arr, labels)):
    is_correct = clf._valid_candidates(candidates, label)
    # every candidate is indexed as candidate[0][0] to obtain the bare word
    candidate_words = [c[0][0] for c in candidates]

    target = correct_predictions if is_correct else incorrect_predictions
    target[index] = candidate_words

In [34]:
# featureset.shape[0] is the number of rows that were passed to fit_predict above
print("Correctly predicted:", len(correct_predictions), "out of", featureset.shape[0])

Correctly predicted: 1408 out of 1484


In [35]:
# project the incorrect prediction into separate df
df_incorrect = df.loc[[*incorrect_predictions.keys()]]
df_incorrect['incorrect']  = pd.Series(incorrect_predictions)

# project the correct predictions into separate df
df_correct = df.loc[[*correct_predictions.keys()]]
df_correct['incorrect']  = pd.Series(correct_predictions)

### Misclassified

In [36]:
# rows for which clf._valid_candidates rejected the predicted candidates
df_incorrect

Unnamed: 0,cve_id,project,description,label,incorrect
14,CVE-2017-0909,private_address_check,The private_address_check ruby gem before 0.4....,private_address_check,"[network, private/local, gem]"
21,CVE-2017-1000048,qs,the web framework using ljharb's qs module old...,qs,"[module, using, v6.2.3]"
24,CVE-2017-1000061,xmlsec,xmlsec 1.2.23 and before is vulnerable to XML ...,xmlsec,"[Entity, External, resulting]"
36,CVE-2017-1000097,go,"On Darwin, user's trust preferences for root c...",Go,"[Darwin, Keychain, preferences]"
68,CVE-2017-1000212,alchemist-server,"Elixir's vim plugin, alchemist.vim is vulnerab...",alchemist-server,"[Elixir, plugin, port]"
117,CVE-2017-1000494,miniupnp,Uninitialized stack variable vulnerability in ...,miniupnp,"[miniupnpd, stack, Uninitialized]"
205,CVE-2017-11462,krb5,Double free vulnerability in MIT Kerberos 5 (a...,krb5,"[Kerberos, MIT, Double]"
208,CVE-2017-11468,distribution,Docker Registry before 2.6.2 in Docker Distrib...,Distribution,"[Registry, Docker, Docker]"
209,CVE-2017-11472,acpica,The acpi_ns_terminate() function in drivers/ac...,acpica,"[Linux, kernel, function]"
370,CVE-2017-12856,C.P,Cross-site scripting (XSS) vulnerability in C....,C.P,"[C.P.Sub, Cross-site, keyword]"


## VISUALIZATION

In [37]:
# visualization tools - NOTE: These do not come in the requirements! (Hence the BONUS)
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot

In [38]:
# initialize plotly for inline rendering in the notebook
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it into the notebook — confirm against the plotly.offline documentation
init_notebook_mode(connected=True)

In [39]:
# compute cross validation a few more times to get more data
# NOTE: this might take a while to compute
x_val_runs = [
    cross_validate(clf, X_train, y_train, shuffle=True, n=3,
                   sample=True, filter_hooks=prediction_filters)
    for _ in range(10)
]

# One row per run with the columns (values, mean, std), where `values` is the
# array of per-fold scores. The rows are ragged (array, float, float), so the
# object array is filled element by element: stacking the Score tuples with
# np.vstack raises a ValueError on NumPy >= 1.24, which no longer builds
# ragged arrays implicitly.
x_val_scores = np.empty((len(x_val_runs), 3), dtype=object)
for row, run in enumerate(x_val_runs):
    x_val_scores[row, 0] = run.values
    x_val_scores[row, 1] = run.mean
    x_val_scores[row, 2] = run.std

In [40]:
# number of candidates the classifier may propose per sample: 1, 2, ..., 20
candidate_counts = list(range(1, 21))

# NOTE(review): unlike the other evaluate / cross_validate calls in this notebook,
# no filter_hooks are passed here — confirm that this is intended.
multi_candidate_eval_scores = [
    clf.evaluate(X_test, y_test, n=i, sample=True)
    for i in candidate_counts
]

# x has to contain one value per score; the previous
# np.arange(1, len(scores)) stopped one short (19 x values for 20 y values)
eval_trace = go.Scatter(
    x=candidate_counts,
    y=multi_candidate_eval_scores,
)

In [41]:
# x_val_scores has one row per cross validation run:
# column 0 = per-fold score arrays, column 1 = run mean, column 2 = run std
scores = np.hstack(x_val_scores[:, 0])
mean = x_val_scores[:, 1].mean()
# Average the per-run standard deviations. Taking .std() of that column (as was
# done before) measures how much the std itself fluctuates between runs, not the
# spread of the scores, so the +/- band drawn in the plot was wrong.
std = x_val_scores[:, 2].mean()

# subtract from 1 to get the distance from the middle (i.e. the error)
scores = 1.0 - scores
mean = 1.0 - mean

# show 2x std, matching the "+/-" value printed for the single run above
std *= 2

# one angle per score, evenly spread over the full circle; the extra point at
# 360 degrees coincides with 0 degrees and is therefore dropped again
rad = np.linspace(0, 360, num=(len(scores) + 1)) * (np.pi / 180.)
rad = rad[:-1]

# polar -> cartesian: the radius of each point is its error
x = np.cos(rad) * (scores)
y = np.sin(rad) * (scores)

# hover labels show the accuracy (not the error) in percent
data_labels = ["{:5.3f} %".format((1 - err) * 100) for err in scores]
score_trace = go.Scatter(
    x=x,
    y=y,
    mode='markers',
    hovertext=data_labels,
    hoverinfo='text',
)

In [42]:
def _centered_circle(radius, **style):
    """Return a plotly circle shape centred on the origin with the given radius.

    Extra keyword arguments are copied into the shape dict after the geometry keys.
    """
    shape = {
        'type': 'circle',
        'xref': 'x',
        'yref': 'y',
        'x0': -radius,
        'y0': -radius,
        'x1': radius,
        'y1': radius,
    }
    shape.update(style)
    return shape


cross_validation_layout = {
    'title': 'Cross validation bias-variance tradeoff',
    # both axes are drawn without zero line, ticks and tick labels
    'xaxis': dict(zeroline=False, ticks='', showticklabels=False),
    # the y axis is additionally locked to the x axis at ratio 1
    'yaxis': dict(
        zeroline=False,
        scaleanchor='x',
        scaleratio=1,
        ticks='',
        showticklabels=False,
    ),
    'shapes': [
        # filled disc with radius `mean`
        _centered_circle(
            mean,
            opacity=0.1,
            fillcolor='red',
            line={'color': 'red'},
        ),
        # dashed ring at mean + std
        _centered_circle(
            mean + std,
            opacity=0.8,
            line={'color': 'orange', 'dash': 'dashdot'},
        ),
        # dashed ring at mean - std
        _centered_circle(
            mean - std,
            opacity=0.8,
            line={'color': 'orange', 'dash': 'dashdot'},
        ),
        # small dot marking the centre
        _centered_circle(
            0.001,
            opacity=0.8,
            fillcolor='red',
            line={'color': 'red'},
        ),
    ]
}

In [43]:
# visualize cross validation accuracy
# the figure is passed to plotly as a plain dict: the scatter trace of the
# per-fold scores plus the layout holding the circle shapes
fig = {
    'data': [score_trace],
    'layout': cross_validation_layout
}

iplot(fig, show_link=False)

In [44]:
# both axis titles are rendered in grey
multi_candidate_eval_layout = go.Layout(
    title="Accuracy increase per number of candidates.",
    yaxis={
        'title': 'Accuracy',
        'titlefont': {'color': 'grey'},
    },
    xaxis={
        'title': 'Candidates',
        'titlefont': {'color': 'grey'},
    },
)

In [45]:
# combine the accuracy-per-candidate-count trace with its layout and render it inline
fig = go.Figure(data=[eval_trace], layout=multi_candidate_eval_layout)

iplot(fig, show_link=False)