In [None]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json

from nvdlib.nvd import NVD

In [None]:
import nltk

In [None]:
import ast
# Load the pre-built 2017 NVD frame; the text of every `version_range` cell is
# turned back into a Python object with ast.literal_eval.
df = pd.read_csv('dataframe-nvd-2017.csv', converters={'version_range': ast.literal_eval})

In [None]:
import plotly

from plotly import graph_objs as go
from plotly.offline import iplot, init_notebook_mode

init_notebook_mode(connected=True)

In [None]:
%matplotlib inline

## Data preparation


The data consists of CPEs with a direct reference to GitHub. This way it is possible to label the data with the project name inferred from the GitHub repository.

In [None]:
nvd = NVD.from_feeds(['2017'])

In [None]:
nvd.update()

In [None]:
# Regular expression for the GitHub base URL. `https?` makes the "s" optional
# (the former `http[s]` required it) and the dot is escaped so that it only
# matches a literal ".".
GH_BASE_URL = r"https?://github\.com"

In [None]:
# Ecosystem columns of interest.
ecos = ['Java', 'JavaScript', 'Python']

# Keep only the rows where at least one ecosystem column is truthy.
# `axis` is passed by keyword: recent pandas releases reject a positional axis.
in_any_eco = df[ecos].any(axis=1)
df_ecos = df.loc[in_any_eco, ['username', 'project', 'version_range', 'url', *ecos]]

In [None]:
import re
def get_reference(cve, url=None, pattern=None) -> str:
    """Return the first reference of ``cve`` that equals ``url`` or matches ``pattern``.

    :param cve: object exposing an iterable ``references`` attribute of strings
    :param url: optional exact reference to look for
    :param pattern: optional regular expression, applied with ``re.search``
    :returns: the first matching reference, or None when nothing matches
        (also when neither ``url`` nor ``pattern`` is given)
    """
    for ref in cve.references:
        if url and url == ref:
            return ref

        # `pattern` is optional: re.search(None, ...) would raise TypeError.
        if pattern is not None and re.search(pattern, ref):
            return ref

    return None

In [None]:
def strip_src_url(url: str):
    """Reduce a GitHub URL to its ``<scheme>://github.com/<user>/<project>`` part.

    ATM assume that the only reference to source is github.

    :param url: reference URL to search in
    :returns: the matched base URL of the repository, or None (after printing
        the offending URL) when ``url`` does not contain such a base
    """
    # Raw string: \w must reach the regex engine untouched. `https?` accepts
    # both schemes and the dot of the host name is matched literally.
    url_base_pattern = r"https?://github\.com/([\w-]+)/([\w-]+[.]*[\w-]+)"
    strip_url = re.search(url_base_pattern, url)

    if not strip_url:
        # Surface the URLs that could not be handled.
        print(url)
        return None

    # Group 0 is the whole match, i.e. the URL up to and including the project.
    return strip_url[0]

In [None]:
# Get descriptions and append them to the current DataFrame to avoid recreating a new one from scratch
cves = dict()      # cve_id -> CVE object, looked up later by the feature extractors
cve_list = list()  # (cve_id, stripped GitHub url, description) rows

for cve in nvd.cves():
    ref = get_reference(cve, pattern="http[s]://github.com")
    if ref is None:
        continue

    ref = strip_src_url(ref)
    if ref is None:
        # The reference mentions GitHub but has no <user>/<project> part;
        # a row with a missing url is useless as a join key, so skip it.
        continue

    cve_list.append((cve.cve_id, ref, cve.description))
    cves[cve.cve_id] = cve

In [None]:
# One row per collected CVE: its id, the stripped GitHub url and the description.
df_desc = pd.DataFrame(cve_list, columns=['cve_id', 'url', 'description'])

In [None]:
# Inner join on the repository url: keeps only the rows present in both frames,
# then indexes the result by (username, project).
# NOTE: this rebinds `df`, so the cell is not idempotent.
df = pd.merge(df_ecos, df_desc, how='inner', on='url').set_index(['username', 'project'])
del df_desc

In [None]:
# Reorder the columns: CVE id, url and text first, then the version range and
# the ecosystem columns.
df = df[[
    'cve_id',
    'url',
    'description',
    'version_range',
    'Java',
    'JavaScript',
    'Python',
]]

Sample of the initial unlabeled data for the 3 ecosystems

In [None]:
df.sample(frac=0.1).head()

### Create toy data set

For the purpose of quicker evaluation of feature extractors and classification accuracy, a toy data set will be created.

A small portion (about $10\%$) of random samples will be taken from the dataset filtered on the Java ecosystem (when Java works... the rest of it will, too).

In [None]:
# Toy subset: rows whose `Java` value exceeds 100, reduced to id and description.
toy_df = df.loc[df.Java > 1E2, ['cve_id', 'description']]

In [None]:
# Each entry of label_tuple is (cve_id, label, pos): `label` is the project name
# as it appears among the description tokens and `pos` is its position in the
# token list. Both are None when the project name is not one of the tokens.
label_tuple = [None] * len(toy_df)
for i, (index, row) in enumerate(toy_df.iterrows()):
    proj = index[1].lower()  # second index level is the project name
    tokens = nltk.word_tokenize(row.description.lower())
    try:
        # position of the first token equal to the project name
        pos = tokens.index(proj)
        label = proj
    except ValueError:
        # project name is not mentioned verbatim; do not record an unrelated
        # token as the label
        pos, label = None, None
    label_tuple[i] = (row.cve_id, label, pos)

# turn the collected tuples into a frame that can be merged on cve_id
label_series = pd.DataFrame(label_tuple, columns=['cve_id', 'label', 'pos'])

del label_tuple

In [None]:
# Attach label/pos to the toy rows by cve_id (outer join), then restore the
# (username, project) index.
toy_df = toy_df.reset_index().merge(label_series, how='outer', on='cve_id').set_index(['username', 'project'])

In [None]:
# remove the projects where no project name was found
toy_df = toy_df[toy_df.pos.notnull()]

Sample of the labeled toy dataset

In [None]:
toy_df.sample(frac=0.5).head()

## Classifier training and evaluation

Model chosen for this approach will make use of Naive Bayes classification.

Since it is unknown whether the current data evince a latent pattern, multiple feature extractors will be suggested and evaluated.

In [None]:
def similar(word, cpes) -> bool:
    """Tell whether ``word`` occurs inside the product name of any of ``cpes``.

    The check is a case-insensitive substring test against the product only.

    :param word: token to look for
    :param cpes: iterable of objects whose ``vendor`` and ``product`` attributes
        each hold exactly one value (they are unpacked with a one-element
        pattern, which raises ValueError otherwise)
    :returns: True as soon as one product contains ``word``, False otherwise
    """
    for cpe in cpes:
        # The vendor is unpacked for its single-value check only; it does not
        # take part in the comparison.
        _vendor, = cpe.vendor
        product, = cpe.product
        if word.lower() in product.lower():
            return True

    return False

In [None]:
def create_feature_list_long(feature_extractor, sents, labels, cve_ids) -> list:
    """Build one ``(features, is_label)`` pair for every token of every sentence.

    Each sentence is tokenized and POS-tagged (universal tagset); a token is
    marked True when it equals the sentence's label, ignoring case.
    """
    featuresets = []
    for sent_idx, sentence in enumerate(sents):
        target = labels[sent_idx]
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')
        for tok_idx, (token, _tag) in enumerate(tagged_sent):
            # decide the class before the extractor runs on the tagged sentence
            matches_label = token.lower() == target.lower()
            token_features = feature_extractor(tagged_sent, tok_idx, cve_ids[sent_idx])
            featuresets.append((token_features, matches_label))

    return featuresets

In [None]:
def eval_accuracy(classifier, extractor, sentences: list, labels: list,
                  cve_ids: list, n=1, verbose=False) -> float:
    """Evaluate accuracy using the raw classifier output.

    For every sentence the tokens are ranked by the probability the classifier
    assigns to the True class; the sentence counts as a hit when its label
    (ignoring case) is among the ``n`` best ranked tokens.

    :param classifier: object providing ``prob_classify(features)``
    :param extractor: feature extractor called as ``extractor(tagged, pos, cve_id)``
    :param sentences: descriptions to evaluate
    :param labels: expected project name for every sentence
    :param cve_ids: CVE id for every sentence, forwarded to the extractor
    :param n: number of top ranked tokens taken into account
    :param verbose: print every sentence with the expected and proposed tokens
    :returns: fraction of hits; 0.0 when there is nothing to evaluate
    """
    assert len(sentences) == len(labels)

    if len(labels) == 0:
        # Nothing to evaluate (e.g. an empty test split): avoid dividing by zero.
        return 0.0

    hits = 0
    for i, sent in enumerate(sentences):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
        # Run the extractor over the whole sentence first, then pair the
        # distributions with the tokens.
        prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_ids[i])) for j in range(len(tagged))]
        probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist)]
        probs = sorted(probs, key=lambda x: x[2], reverse=True)

        most_prob = set(prob[0].lower() for prob in probs[:n])
        if labels[i].lower() in most_prob:
            hits += 1

        if verbose:
            print('Sentence: ', sent)
            print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob, '\n')

    return hits / len(labels)

In [None]:
# Tokens that predict() never proposes as a project name: NLTK stop words plus a
# few domain words and comparison/punctuation tokens.
# NOTE(review): stopwords.words() is called without a language argument, while
# guess_package_name() asks for 'english' only; confirm this is intended.
blacklist = set(nltk.corpus.stopwords.words())
blacklist.update(set(['kernel', 'function', 'version', 'functions', 'versions', '<', '=', '.', '>']))

def predict(sent, classifier, extractor, cve_id, n=1, verbose=False) -> list:
    """Propose the ``n`` most probable project-name tokens of ``sent``.

    Blacklisted words and tokens tagged ``NUM`` are never proposed. Duplicate
    ``(word, probability)`` pairs are collapsed.

    :param sent: description to analyse
    :param classifier: object providing ``prob_classify(features)``
    :param extractor: feature extractor called as ``extractor(tagged, pos, cve_id)``
    :param cve_id: CVE id forwarded to the extractor
    :param n: maximal number of candidates to return
    :param verbose: print the whole ranking
    :returns: list of ``(lower-cased word, probability)`` ordered by decreasing
        probability; ties are ordered alphabetically
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')

    # Run the extractor over the whole sentence before filtering on the tags.
    prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_id)) for j in range(len(tagged))]
    candidates = {
        (word.lower(), prob.prob(True))
        for (word, tag), prob in zip(tagged, prob_dist)
        if word.lower() not in blacklist and tag != 'NUM'
    }

    # A set has no stable order, so break probability ties by the word itself;
    # otherwise the result could differ from one kernel run to another.
    probs = sorted(candidates, key=lambda x: (-x[1], x[0]))
    if verbose:
        print(probs)
    return probs[:n]

In [None]:
def eval_accuracy_spec(classifier, extractor, sentences: list, labels: list, cve_ids: list,
                       n=1, verbose=False) -> float:
    """Evaluate accuracy using the predict function.

    This also filters out blacklisted words and stopwords. A sentence counts as
    a hit when at least one of the proposed tokens is contained in its label
    (ignoring the case of the label).

    :param classifier: object providing ``prob_classify(features)``
    :param extractor: feature extractor called as ``extractor(tagged, pos, cve_id)``
    :param sentences: descriptions to evaluate
    :param labels: expected project name for every sentence
    :param cve_ids: CVE id for every sentence, forwarded to the extractor
    :param n: number of candidates requested from ``predict``
    :param verbose: print the sentences that were not predicted correctly
    :returns: fraction of hits; 0.0 when there is nothing to evaluate
    """
    assert len(sentences) == len(labels)

    if len(labels) == 0:
        # Nothing to evaluate (e.g. an empty test split): avoid dividing by zero.
        return 0.0

    hits = 0
    for i, sent in enumerate(sentences):
        probs = predict(sent, classifier, extractor, cve_ids[i], n)
        most_prob = set(p[0].lower() for p in probs)

        # substring test: the proposed token may be just a part of the label
        correct = any(labels[i].lower().find(prob) != -1 for prob in most_prob)
        if correct:
            hits += 1
        elif verbose:
            print('Sentence: ', sent)
            print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob, '\n')

    return hits / len(labels)

In [None]:
def extract_features_vanilla(tagged: list, pos: int, cve_id=None):
    """Describe the token at ``pos`` by its own tag and shape and by its predecessor.

    ``tagged`` is a list of ``(word, tag)`` pairs; ``cve_id`` is accepted for
    signature compatibility with the other extractors and is not used.
    """
    token, token_tag = tagged[pos]
    features = dict()
    features['tag'] = token_tag
    features['has-uppercase'] = token[0].isupper()  # first character only
    features['word-len'] = len(token) > 3

    if pos == 0:
        # sentence start: there is no previous token
        features['prev-tag'] = '<start>'
        return features

    before = tagged[pos - 1]
    features['prev-word'] = before[0].lower()
    features['prev-tag'] = before[1]
    return features

# # Lets not split the dataset here for now
# feature_list = create_feature_list_long(extract_features_vanilla, descriptions, labels, cve_ids)

# classifier = nltk.NaiveBayesClassifier.train(feature_list)
# classifier.show_most_informative_features()

# eval_accuracy(classifier, extract_features_vanilla, descriptions, labels, cve_ids, n=1)

In [None]:
def extract_features_v0(tagged: list, pos: int, cve_id=None):
    """Describe the token at ``pos`` by its tag, length and up to two predecessors.

    ``tagged`` is a list of ``(word, tag)`` pairs; ``cve_id`` is accepted for
    signature compatibility with the other extractors and is not used.
    """
    token, token_tag = tagged[pos]
    features = {'tag': token_tag, 'word-len': len(token) > 3}

    if pos == 0:
        # sentence start: there is no previous token
        features['prev-tag'] = '<start>'
        return features

    before = tagged[pos - 1]
    if pos > 1:
        # two predecessors are available, so a bigram can be formed
        features['prev-tag'] = before[1]
        features['prev-bigram'] = " ".join(pair[0].lower() for pair in tagged[pos - 2: pos])

    features['prev-word'] = before[0].lower()
    features['prev-tag'] = before[1]
    return features

# # Lets not split the dataset here for now
# feature_list = create_feature_list_long(extract_features_v0, descriptions, labels, cve_ids)

# classifier = nltk.NaiveBayesClassifier.train(feature_list)
# classifier.show_most_informative_features()

# eval_accuracy(classifier, extract_features_v0, descriptions, labels, cve_ids, n=1)

In [None]:
def extract_features_v1(tagged: list, pos: int, cve_id=None):
    """Describe the token at ``pos``, adding a mixed-case flag and a CPE product match.

    ``tagged`` is a list of ``(word, tag)`` pairs. ``cve_id`` must be a key of
    the module-level ``cves`` mapping: the token is compared (via ``similar``)
    with the CPEs returned by ``get_cpe(cpe_type='a')`` of that CVE.
    """
    token, token_tag = tagged[pos]
    upper_flags = [char.isupper() for char in token]
    # True only for mixed case: some, but not all, characters are upper-case
    mixed_case = any(upper_flags) and not all(upper_flags)
    cpe_match = similar(token, cves[cve_id].get_cpe(cpe_type='a'))

    features = dict()
    features['tag'] = token_tag
    features['has-uppercase'] = mixed_case
    features['vend_prod_match'] = cpe_match
    features['word-len-threshold'] = len(token) > 3

    if pos == 0:
        # sentence start: there is no previous token
        features['prev-tag'] = '<start>'
        return features

    before = tagged[pos - 1]
    if pos > 1:
        features['prev-tag'] = before[1]
        features['prev-bigram'] = " ".join(pair[0].lower() for pair in tagged[pos - 2: pos])

    features['prev-word'] = before[0].lower()
    features['prev-tag'] = before[1]
    return features

# # Lets not split the dataset here for now
# feature_list = create_feature_list_long(extract_features_v1, descriptions, labels, cve_ids)

# classifier = nltk.NaiveBayesClassifier.train(feature_list)
# classifier.show_most_informative_features()

# eval_accuracy(classifier, extract_features_v1, descriptions, labels, cve_ids, n=2)
# eval_accuracy_spec(classifier, extract_features_v1, descriptions, labels, cve_ids, n=2)

In [None]:
# Tokenizer splitting on hyphens and underscores (the pattern describes the gaps).
regex_tokenizer = nltk.RegexpTokenizer(pattern=r"[-_]", gaps=True)

# Version-like token: digits optionally followed by dots, an optional "-" or "_"
# and trailing word characters. Raw string, so that \d and \w reach the regex
# engine without invalid-escape warnings.
_version_pattern = r"(\d[.]?)+[-_]?(\w)*"

In [None]:
def extract_features_v2(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word.

    On top of the token's own tag/shape and the CPE product match, the previous
    and the following tokens are described.

    :param tagged: list of ``(word, tag)`` pairs; the entry at ``pos`` is
        re-tagged in place as ``NUM`` when the word looks like a version
    :param pos: index of the token to describe
    :param cve_id: key into the module-level ``cves`` mapping
    :returns: dict of features
    """
    word, tag = tagged[pos]
    # retag if necessary (the change is written back so that later positions
    # see the corrected tag of their predecessor)
    if re.match(_version_pattern, word) and tag != 'NUM':
        tag = 'NUM'
        tagged[pos] = word, tag

    features = {
        'tag': tag,
        'word-len': len(word) > 3,
        'has-uppercase': any(w.isupper() for w in word),
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])

        # Only when a following token exists. The former `pos < len(tagged)`
        # was always true and produced empty features for the last token.
        if pos < len(tagged) - 1:
            features['next-bigram'] = " ".join(w.lower() for w, t in tagged[pos + 1: pos + 3])
            features['next-bigram-tags'] = " ".join(t for w, t in tagged[pos + 1: pos + 3])

        features['prev-tag'] = tagged[pos - 1][1]
        features['prev-word'] = tagged[pos - 1][0].lower()

    return features

# # Lets not split the dataset here for now
# feature_list = create_feature_list_long(extract_features_v2, descriptions, labels, cve_ids)

# classifier = nltk.NaiveBayesClassifier.train(feature_list)
# classifier.show_most_informative_features()

# eval_accuracy(classifier, extract_features_v2, descriptions, labels, cve_ids, n=1)
# eval_accuracy_spec(classifier, extract_features_v2, descriptions, labels, cve_ids, n=1)

In [None]:
def extract_features_v3(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word.

    Extends the previous extractors with information about version-like
    (``NUM``) tokens at or after the described token.

    :param tagged: list of ``(word, tag)`` pairs; the entry at ``pos`` is
        re-tagged in place as ``NUM`` when the word looks like a version
    :param pos: index of the token to describe
    :param cve_id: key into the module-level ``cves`` mapping
    :returns: dict of features
    """
    word, tag = tagged[pos]
    # retag if necessary
    if re.match(_version_pattern, word) and tag != 'NUM':
        tag = 'NUM'
        tagged[pos] = (word, tag)

    # Offsets, relative to `pos`, of the NUM tokens from `pos` onwards. Each
    # token's own tag `t` is inspected; tokens after `pos` still carry the tag
    # they had on entry, since only the entry at `pos` is re-tagged here.
    num_offsets = [offset for offset, (w, t) in enumerate(tagged[pos:]) if t == 'NUM']
    ver_pos = num_offsets[0] if num_offsets else None
    # offset 0 is the token itself, so only a positive offset "follows"
    ver_follows = any(offset > 0 for offset in num_offsets)

    features = {
        'tag': tag,
        'word-len': len(word) > 3,
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'has-uppercase': any(w.isupper() for w in word),
        'alnum': word.isalnum(),
        'version_pos': ver_pos,
        'ver_follows': ver_follows,
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])

        if pos < len(tagged) - 1:
            features['next-bigram'] = " ".join(w.lower() for w, t in tagged[pos + 1: pos + 3])
            features['next-bigram-tags'] = " ".join(t for w, t in tagged[pos + 1: pos + 3])

        # As in the other extractors: the previous tag is recorded for every
        # non-initial token and the previous word is lower-cased.
        features['prev-tag'] = tagged[pos - 1][1]
        features['prev-word'] = tagged[pos - 1][0].lower()

    return features

# # Lets not split the dataset here for now
# feature_list = create_feature_list_long(extract_features_v3, descriptions, labels, cve_ids)

# classifier = nltk.NaiveBayesClassifier.train(feature_list)
# classifier.show_most_informative_features()

# eval_accuracy(classifier, extract_features_v3, descriptions, labels, cve_ids, n=2)
# # eval_accuracy_spec(classifier, extract_features_v3, descriptions, labels, cve_ids, n=2)

#### Current approach utilities

In [None]:
from collections import OrderedDict


def get_first_sentence(description):
    """Return the first sentence of ``description``, or '' when it has none."""
    sentences = nltk.sent_tokenize(description)
    if not sentences:
        return ''
    return sentences[0]


def guess_package_name(description):
    """Guess package name from given description.

    Very naive approach. Words starting with uppercase letter
    are considered to be possible package names (minus stop words).

    Returns a list of possible package names (lower-cased), without duplicates
    and in the order of their first occurrence.
    """
    from nltk.corpus import stopwords

    try:
        # Fails when no downloaded stopwords are available.
        stop_words = set(stopwords.words('english'))
    except LookupError:
        # Download stopwords since they are not available.
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    regexp = re.compile('[A-Z][A-Za-z0-9-:]*')  # ? TODO: tweak
    suspects = (suspect.lower() for suspect in regexp.findall(description))

    # dict keys get rid of duplicates, but keep order
    return list(OrderedDict.fromkeys(name for name in suspects if name not in stop_words))


def get_package_name_candidates(description):
    """Collect the possible package names found in the first sentence of a CVE description.

    Returns a set, so the order of the guesses is not kept.
    """
    return set(guess_package_name(get_first_sentence(description)))

In [None]:
def eval_old_accuracy(sentences: list, labels: list) -> tuple:
    """Evaluate the currently used, capital-letter based guessing approach.

    A sentence counts as a hit when at least one of the candidates returned by
    ``get_package_name_candidates`` is contained in its label (ignoring the
    case of the label).

    :param sentences: descriptions to evaluate
    :param labels: expected project name for every sentence
    :returns: ``(accuracy, mean number of candidates per sentence)``;
        ``(0.0, 0.0)`` when there is nothing to evaluate
    """
    assert len(sentences) == len(labels)

    if len(labels) == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0, 0.0

    guesses = [None] * len(labels)
    hits = 0
    for i, sent in enumerate(sentences):
        names = get_package_name_candidates(sent)
        guesses[i] = len(names)
        if any(labels[i].lower().find(name) != -1 for name in names):
            hits += 1

    # TODO: come up with more sophisticated way of measuring accuracy
    return hits / len(labels), sum(guesses) / len(guesses)

In [None]:
def get_nof_guesses(sentences: list, labels: list) -> list:
    """Return, for every sentence, the number of package-name candidates guessed.

    :param sentences: descriptions to analyse
    :param labels: only used to check that both sequences have the same length
    :returns: list with one candidate count per sentence
    """
    assert len(sentences) == len(labels)

    return [len(get_package_name_candidates(sent)) for sent in sentences]

#### Train and test set split

Toy data set will be split by $0.2$ ratio in order to be able to test model's ability to generalize.

In [None]:
# Shuffle the toy set once, with a fixed seed so that the train/test split and
# the reported accuracies can be reproduced after a kernel restart.
descriptions, labels, cve_ids = list(
    zip(*toy_df.sample(frac=1, random_state=42)[['description', 'label', 'cve_id']].values)
)

In [None]:
# Candidate counts 1..14; each value is passed as `n` to the evaluation functions.
x = np.arange(1, 15)
# Feature extractors under comparison, in the order they are defined above.
feature_extractors = [extract_features_vanilla, extract_features_v0, extract_features_v1, extract_features_v2,
                      extract_features_v3
                     ]

# The first 20 % of the shuffled samples form the test set, the rest the train set.
split = int(len(descriptions) * 0.2)
test_set, test_labels = descriptions[:split], labels[:split]
train_set, train_labels = descriptions[split:], labels[split:]

### Evaluate the classifiers' accuracy on the test set using raw output evaluation

#### Evaluation of the currently used approach on the test set

In [None]:
old_accuracy, mean_guess = eval_old_accuracy(test_set, test_labels)
print('accuracy:', old_accuracy)
print('mean guess length:', mean_guess)

#### Evaluation of the new approach

In [None]:
# One accuracy curve (accuracy per candidate count in `x`) for every extractor.
accuracy_list = list()
for extractor in feature_extractors:
    # Train on the training part of the split only.
    feature_list = create_feature_list_long(extractor, train_set, train_labels, cve_ids[split:])

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()

    # Evaluate on the held-out part, once for every candidate count.
    accuracy_list.append([
        eval_accuracy(classifier, extractor, test_set, test_labels, cve_ids[:split], n=n)
        for n in x
    ])

In [None]:
# Legend labels: the vanilla extractor first, then the numbered variants.
trace_names = ['vanilla_extractor']
trace_names += ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [
    go.Scatter(x=x, y=scores, name=trace_names[idx])
    for idx, scores in enumerate(accuracy_list)
]

layout = go.Layout(
    yaxis=dict(title='Accuracy', titlefont=dict(color='grey')),
    xaxis=dict(title='Candidates', titlefont=dict(color='grey')),
    shapes=[
        # vertical guide: mean number of guesses of the currently used approach
        dict(type='line', x0=mean_guess, x1=mean_guess, y0=-0.05, y1=1.1,
             opacity=0.2, line=dict(dash='dash')),
        # horizontal guide: accuracy of the currently used approach
        dict(type='line', x0=-0.5, x1=20, y0=old_accuracy, y1=old_accuracy,
             opacity=0.2, line=dict(dash='dash')),
    ],
)

fig = go.Figure(data=data, layout=layout)

In [None]:
iplot(fig, show_link=False)

In [None]:
# Number of candidates guessed by the currently used approach, per test sentence.
guess_trace = go.Scatter(y=get_nof_guesses(test_set, test_labels))
layout = go.Layout(
    xaxis=dict(ticks='', showticklabels=False, showgrid=False),
    shapes=[
        # horizontal guide at the mean number of guesses
        dict(type='line', y0=mean_guess, y1=mean_guess, x0=-0.1, x1=23,
             opacity=0.2, line=dict(dash='dash')),
    ],
)

fig = go.Figure(data=[guess_trace], layout=layout)

In [None]:
iplot(fig, show_link=False)

### Evaluate accuracy on the test set using predictive evaluation

In [None]:
# One accuracy curve (accuracy per candidate count in `x`) for every extractor,
# this time measured through predict(), i.e. with the blacklist applied.
accuracy_list = list()
for extractor in feature_extractors:
    # Train on the training part of the split only.
    feature_list = create_feature_list_long(extractor, train_set, train_labels, cve_ids[split:])

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()

    # Evaluate on the held-out part, once for every candidate count.
    accuracy_list.append([
        eval_accuracy_spec(classifier, extractor, test_set, test_labels, cve_ids[:split], n=n)
        for n in x
    ])

In [None]:
# Legend labels: the vanilla extractor first, then the numbered variants.
trace_names = ['vanilla_extractor']
trace_names += ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [
    go.Scatter(x=x, y=scores, name=trace_names[idx])
    for idx, scores in enumerate(accuracy_list)
]

layout = go.Layout(
    yaxis=dict(title='Accuracy', titlefont=dict(color='grey')),
    xaxis=dict(title='Candidates', titlefont=dict(color='grey')),
    shapes=[
        # vertical guide: mean number of guesses of the currently used approach
        dict(type='line', x0=mean_guess, x1=mean_guess, y0=-0.05, y1=1.1,
             opacity=0.2, line=dict(dash='dash')),
        # horizontal guide: accuracy of the currently used approach
        dict(type='line', x0=-0.5, x1=20, y0=old_accuracy, y1=old_accuracy,
             opacity=0.2, line=dict(dash='dash')),
    ],
)

fig = go.Figure(data=data, layout=layout)

In [None]:
iplot(fig, show_link=False)

### Evaluate accuracy on the toy set using predictive evaluation

Let's evaluate the accuracy of the chosen feature extractors on the whole toy set. The classifiers are fitted again on the same training split as above, so no additional training data is used.

#### Evaluation of the currently used approach on the toy set

In [None]:
old_accuracy, mean_guess = eval_old_accuracy(descriptions, labels)
print('old accuracy:', old_accuracy)
print('mean guess length:', mean_guess)

#### Evaluation of the new approach on the toy set

In [None]:
# One accuracy curve (accuracy per candidate count in `x`) for every extractor,
# evaluated on the whole toy set.
accuracy_list = list()
for extractor in feature_extractors:
    # Train on the training part of the split only.
    feature_list = create_feature_list_long(extractor, train_set, train_labels, cve_ids[split:])

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()

    # Evaluate on all toy samples (train and test part together).
    accuracy_list.append([
        eval_accuracy_spec(classifier, extractor, descriptions, labels, cve_ids, n=n)
        for n in x
    ])

In [None]:
# Legend labels: the vanilla extractor first, then the numbered variants.
trace_names = ['vanilla_extractor']
trace_names += ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [
    go.Scatter(x=x, y=scores, name=trace_names[idx])
    for idx, scores in enumerate(accuracy_list)
]

layout = go.Layout(
    yaxis=dict(title='Accuracy', titlefont=dict(color='grey')),
    xaxis=dict(title='Candidates', titlefont=dict(color='grey')),
    shapes=[
        # vertical guide: mean number of guesses of the currently used approach
        dict(type='line', x0=mean_guess, x1=mean_guess, y0=-0.05, y1=1.1,
             opacity=0.2, line=dict(dash='dash')),
        # horizontal guide: accuracy of the currently used approach
        dict(type='line', x0=-0.5, x1=20, y0=old_accuracy, y1=old_accuracy,
             opacity=0.2, line=dict(dash='dash')),
    ],
)

fig = go.Figure(data=data, layout=layout)

In [None]:
iplot(fig, show_link=False)

In [None]:
# Create trendline
from collections import namedtuple

# Five-field container; a later cell fills it by unpacking the result of
# np.polyfit(..., full=True).
# NOTE(review): the typename 'Trendline' differs from the variable name `Poly`.
Poly = namedtuple('Trendline', 'coefs residuals rank singular_val rcond')

In [None]:
# get number of guesses for the old project predictions
guesses = np.array(get_nof_guesses(descriptions, labels))

# fit guesses with a degree-20 polynomial over the sample index; the values
# returned by np.polyfit are unpacked into the fields of Poly
poly = Poly(*np.polyfit(np.arange(len(guesses)), guesses, 20, full=True))
# NOTE(review): `pts` is not read by any visible cell; confirm before removing
pts = np.linspace(start=0, stop=len(guesses), num=len(guesses) * 10)  # create evaluation points

# create polynomial function
f = np.poly1d(poly.coefs)

# evaluate the trendline on 200 evenly spaced points for plotting
x_plot = np.linspace(start=0, stop=len(guesses), num=200)
y_plot = f(x_plot)

In [None]:
# Traces: the raw number of guesses per sample and the fitted trendline.
guess_trace = go.Scatter(y=guesses, name='Project guesses', hoverinfo='name + y')
trendline_trace = go.Scatter(
    x=x_plot,
    y=y_plot,
    name='Polynomial trendline',
    hoverinfo='skip',
)

layout = go.Layout(
    title="Number of projects predicted by current approach",
    shapes=[
        # horizontal guide at the mean number of guesses
        dict(type='line', y0=mean_guess, y1=mean_guess, x0=-0.1, x1=120,
             opacity=0.2, line=dict(dash='dash')),
    ],
)

fig = go.Figure(data=[guess_trace, trendline_trace], layout=layout)

In [None]:
iplot(fig, show_link=False)

### Evaluate accuracy on the whole dataset for 3  main ecosystems

This evaluation will take a while, the whole dataset presented at the beginning of this notebook will be labeled and split into train and test data.

The classifier will be retrained on the train data.

**NOTE:** *It would be possible to use the same classifiers as previously, but they were trained on such a small portion of data (even smaller with respect to the real-world data) that the results would not correspond with reality.*

In [None]:
df.info()

In [None]:
# Choose an extractor, train a classifier with it on the whole (shuffled) toy
# set and predict the project names for every row of df.
extractor = extract_features_v3

feature_list = create_feature_list_long(extractor, descriptions, labels, cve_ids)
classifier = nltk.NaiveBayesClassifier.train(feature_list)

# Predict with the same extractor the classifier was trained with.
predictions = [
    predict(desc, classifier, extractor, cve_id=cve_id, n=3, verbose=False)
    for desc, cve_id in zip(df.description.values, df.cve_id.values)
]

# get just the names; predict() may return no candidate at all (everything
# filtered out), which yields an empty tuple here
pred_proj_names = [tuple(name for name, _prob in p) for p in predictions]

pred_df = pd.Series(pred_proj_names, name='prediction')
df['prediction'] = pred_df.values

In [None]:
# Each entry of pred_tuple is (cve_id, label, pos): the project name as found
# among the description tokens and its position, or (cve_id, None, None) when
# the project name is not one of the tokens.
pred_tuple = [None] * len(df)
for i, (index, row) in enumerate(df.iterrows()):
    proj = index[1].lower()  # second index level is the project name
    tokens = nltk.word_tokenize(row.description.lower())
    try:
        # first token equal to the project name
        pos = tokens.index(proj)
        label = tokens[pos]
    except ValueError:
        pos, label = None, None

    pred_tuple[i] = (row.cve_id, label, pos)

# turn the collected tuples into a frame that can be merged on cve_id
label_series = pd.DataFrame(pred_tuple, columns=['cve_id', 'label', 'pos'])

del pred_tuple

In [None]:
# Label the dataset: reset_index() turns the index levels back into columns,
# then label/pos are attached with a left join on cve_id.
# NOTE: this rebinds `df`, so the cell is not idempotent.
df = df.reset_index().merge(label_series, how='left', on='cve_id')

In [None]:
# Shuffle the whole dataset once, with a fixed seed so that the split and the
# reported accuracies can be reproduced after a kernel restart. The project
# name column serves as the label here.
descriptions, labels, cve_ids = list(
    zip(*df.sample(frac=1, random_state=42)[['description', 'project', 'cve_id']].values)
)

# the first quarter of the shuffled samples is held out for testing
split = int(len(descriptions) * 0.25)

# prepare train data
train_descriptions, train_labels, train_cve_ids = descriptions[split:], labels[split:], cve_ids[split:]

# prepare test data
test_descriptions, test_labels, test_cve_ids = descriptions[:split], labels[:split], cve_ids[:split]

#### Evaluation of the currently used approach on the whole data set

In [None]:
old_accuracy, mean_guess = eval_old_accuracy(test_descriptions, test_labels)

In [None]:
print('accuracy:', old_accuracy)
print('mean guess length:', mean_guess)

#### Evaluation of the new approach on the whole data set

In [None]:
# Candidate counts 1..14; each value is passed as `n` to the evaluation.
x = np.arange(1, 15)
feature_extractors = [extract_features_vanilla, extract_features_v0, extract_features_v1, extract_features_v2,
                      extract_features_v3
                     ]

# One accuracy curve (accuracy per candidate count in `x`) for every extractor.
accuracy_list = list()
for extractor in feature_extractors:
    # Train on the training part of the split only.
    feature_list = create_feature_list_long(extractor, train_descriptions, train_labels,
                                            train_cve_ids)

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()

    # Evaluate on the held-out part, once for every candidate count.
    accuracy_list.append(
        [
            eval_accuracy_spec(classifier, extractor, test_descriptions, test_labels, test_cve_ids, n=n)
            for n in x
        ]
    )

In [None]:
# Legend labels: the vanilla extractor first, then the numbered variants.
trace_names = ['vanilla_extractor']
trace_names += ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [
    go.Scatter(x=x, y=scores, name=trace_names[idx])
    for idx, scores in enumerate(accuracy_list)
]

layout = go.Layout(
    yaxis=dict(title='Accuracy', titlefont=dict(color='grey')),
    xaxis=dict(title='Candidates', titlefont=dict(color='grey')),
    shapes=[
        # vertical guide: mean number of guesses of the currently used approach
        dict(type='line', x0=mean_guess, x1=mean_guess, y0=-0.05, y1=1.1,
             opacity=0.2, line=dict(dash='dash')),
        # horizontal guide: accuracy of the currently used approach
        dict(type='line', x0=-0.5, x1=20, y0=old_accuracy, y1=old_accuracy,
             opacity=0.2, line=dict(dash='dash')),
    ],
)

fig = go.Figure(data=data, layout=layout)

In [None]:
iplot(fig, show_link=False)

In [None]:
# get number of guesses for the old project predictions
guesses = np.array(get_nof_guesses(test_descriptions, test_labels))

# fit guesses with a degree-20 polynomial over the sample index; the values
# returned by np.polyfit are unpacked into the fields of Poly
poly = Poly(*np.polyfit(np.arange(len(guesses)), guesses, 20, full=True))
# NOTE(review): `pts` is not read by any visible cell; confirm before removing
pts = np.linspace(start=0, stop=len(guesses), num=len(guesses) * 10)  # create evaluation points

# create polynomial function
f = np.poly1d(poly.coefs)

# evaluate the trendline on 1000 evenly spaced points for plotting
x_plot = np.linspace(start=0, stop=len(guesses), num=1000)
y_plot = f(x_plot)

In [None]:
# Traces: the raw number of guesses per sample and the fitted trendline.
guess_trace = go.Scatter(y=guesses, name='Project guesses', hoverinfo='name + y')
trendline_trace = go.Scatter(
    x=x_plot,
    y=y_plot,
    name='Polynomial trendline',
    hoverinfo='skip',
)

layout = go.Layout(
    title="Number of projects predicted by current approach",
    shapes=[
        # horizontal guide at the mean number of guesses, spanning all samples
        dict(type='line', y0=mean_guess, y1=mean_guess, x0=-0.1, x1=len(guesses) + 10,
             opacity=0.2, line=dict(dash='dash')),
    ],
)

fig = go.Figure(data=[guess_trace, trendline_trace], layout=layout)

In [None]:
iplot(fig, show_link=False)

#### Incorrect predictions for the whole dataset

Discarding project descriptions where the label was not present at all, get the number of incorrect predictions w.r.t the project name.

In [None]:
# Predict the three most probable project names for every row of df.
# `classifier` is the one left over from the training loop above, i.e. the one
# trained with the last extractor of `feature_extractors`.
predictions = [
    predict(desc, classifier, extract_features_v3, cve_id=cve_id, n=3, verbose=False)
    for desc, cve_id in zip(df.description.values, df.cve_id.values)
]

# get just the names; predict() may return no candidate at all (everything
# filtered out), which yields an empty tuple here
pred_proj_names = [tuple(name for name, _prob in p) for p in predictions]

pred_df = pd.Series(pred_proj_names, name='prediction')
df['prediction'] = pred_df.values

In [None]:
# Each entry of pred_tuple is (cve_id, correct): `correct` tells whether the
# label is among the predicted names, or is None when the row has no label.
pred_tuple = [None] * len(df)
# enumerate() gives the list position independently of the frame's index labels
for i, (_, row) in enumerate(df.iterrows()):
    # pd.isnull covers both None and NaN, whichever the merge left behind
    correct = None if pd.isnull(row.label) else row.label in row.prediction
    pred_tuple[i] = (row.cve_id, correct)

# turn the collected tuples into a frame that can be merged on cve_id
label_series = pd.DataFrame(pred_tuple, columns=['cve_id', 'correct'])

del pred_tuple

In [None]:
df_pred = df.merge(label_series, how='inner', on='cve_id').set_index(['username', 'project'])

In [None]:
# Rows that do have a label but whose label is not among the predictions.
# The merge above adds a plain `correct` column; a `correct_y` column would only
# exist if `df` already carried a `correct` column from an earlier, repeated run.
df_incorrect = df_pred[df_pred['label'].notnull() & df_pred['correct'].eq(False)]

In [None]:
print('Number of incorrect predictions: %d out of %d' % (len(df_incorrect), len(df_pred)))

In [None]:
df_incorrect[['description', 'prediction']].reset_index().style