In [1]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json

from nvdlib.nvd import NVD

In [2]:
import nltk

In [3]:
import ast
df = pd.read_csv('dataframe-nvd-2017.csv', converters={'version_range': ast.literal_eval})

In [4]:
import plotly

from plotly import graph_objs as go
from plotly.offline import iplot, init_notebook_mode

init_notebook_mode(connected=True)

In [5]:
%matplotlib inline

## Prepare the DataFrame

In [6]:
nvd = NVD.from_feeds(['2017'])

In [7]:
nvd.update()

In [8]:
GH_BASE_URL = u"http[s]://github.com"

In [9]:
ecos = ['Java', 'JavaScript', 'Python']
df_ecos = df[ecos]
df_ecos = df[['username', 'project', 'version_range', 'url', *ecos]][df_ecos.any(1)]

In [10]:
import re
def get_reference(cve, url=None, pattern=None) -> str:
    for ref in cve.references:
        if url and url == ref:
            return ref

        if re.search(pattern, ref):
            return ref

    return None

In [11]:
def strip_src_url(url: str):
    """ATM assume that the only reference to source is github."""
    url_base_pattern = u"http[s]://github.com/([\w-]+)/([\w-]+[.]*[\w-]+)"
    strip_url = re.search(url_base_pattern, url)
    
    if not strip_url:
        print(url)
        return None
    
    return strip_url[0]

In [12]:
# Get descriptions and append them to the current DataFrame to avoid recreating a new one from scratch
cves = dict()
cve_list = list()

for cve in nvd.cves():
    ref = get_reference(cve, pattern="http[s]://github.com")
    if ref is None:
        continue
        
    ref = strip_src_url(ref)
    cve_list.append((cve.cve_id, ref, cve.description))
    cves[cve.cve_id] = cve

In [13]:
df_desc = pd.DataFrame(cve_list, columns=['cve_id', 'url', 'description'])

In [14]:
df = pd.merge(df_ecos, df_desc, how='inner', on='url').set_index(['username', 'project'])

In [15]:
df.columns.tolist()
df = df[[
    'cve_id',
    'url',
    'description',
    'version_range',
    'Java',
    'JavaScript',
    'Python',
]]

In [16]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cve_id,url,description,version_range,Java,JavaScript,Python
username,project,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Microsoft,ChakraCore,CVE-2017-0028,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists w...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0152,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists i...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0196,https://github.com/Microsoft/ChakraCore,An information disclosure vulnerability in Mic...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0223,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists i...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0252,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists i...,"(None, None)",0,49258214,84984


## Create toy data set

In [17]:
toy_df = df[['cve_id', 'description']][df.Java > 1E2]

In [18]:
# labels are a position of the project token in the token list
labels = [None] * len(toy_df)
for i, (index, row) in enumerate(toy_df.iterrows()):
    proj = index[1]
    desc = row.description.lower()
    # find the position of proj in the description, if applicable
    tokens = nltk.word_tokenize(desc)
    found = False
    for pos, token in enumerate(tokens):
        if token == proj:
            found = True
            break
    if not found:
        pos = None
    labels[i] = (row.cve_id, token, pos)

# turn index into series
labels = pd.DataFrame(labels, columns=['cve_id', 'label', 'pos'])

In [19]:
toy_df = toy_df.reset_index().merge(labels, how='outer', on='cve_id').set_index(['username', 'project'])

In [20]:
toy_df = toy_df[toy_df.pos.notnull()]

In [21]:
toy_df[['description', 'pos']].reset_index().head()

Unnamed: 0,username,project,description,pos
0,modxcms,revolution,MODX Revolution version 2.x - 2.5.6 is vulnera...,1.0
1,modxcms,revolution,"In MODX Revolution 2.5.7, the ""key"" and ""name""...",2.0
2,modxcms,revolution,Directory traversal in setup/processors/url_se...,16.0
3,modxcms,revolution,"In MODX Revolution before 2.5.7, when PHP 5.3....",2.0
4,modxcms,revolution,"In MODX Revolution before 2.5.7, an attacker i...",2.0


## Decription processing

In [22]:
def similar(word, cpes) -> bool:
    for cpe in cpes:
        vendor, = cpe.vendor
        product, = cpe.product
        if word.lower() in {vendor.lower(), product.lower()}:
            return True
    
    return False

In [23]:
df.loc['FasterXML']

Unnamed: 0_level_0,cve_id,url,description,version_range,Java,JavaScript,Python
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
jackson-databind,CVE-2017-15095,https://github.com/FasterXML/jackson-databind,A deserialization flaw was discovered in the j...,"(None, None)",6201368,0,0
jackson-databind,CVE-2017-17485,https://github.com/FasterXML/jackson-databind,FasterXML jackson-databind through 2.8.10 and ...,"(None, None)",6201368,0,0
jackson-databind,CVE-2017-7525,https://github.com/FasterXML/jackson-databind,A deserialization flaw was discovered in the j...,"(None, None)",6201368,0,0


In [24]:
descriptions, labels, cve_ids = list(zip(*toy_df[['description', 'label', 'cve_id']].values))

In [25]:
def create_feature_list_long(feature_extractor, sents, labels, cve_ids) -> list:
    """Uses all sentences to create feature list given feature extractor."""
    feature_list = list()
    for i, desc in enumerate(sents):
        label = labels[i]
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(desc), tagset='universal')
        for j, (word, tag) in enumerate(tagged_sent):
            is_label = word == label
            features = feature_extractor(tagged_sent, j, cve_ids[i])
            feature_list.append((features, is_label))
    
    return feature_list

In [26]:
def create_feature_list_short(feature_extractor, sents, labels, cve_ids) -> list:
    """Uses only first sentence to create feature list given feature extractor."""
    feature_list = list()
    for i, desc in enumerate(sents):
        label = labels[i]
        first_sent = nltk.sent_tokenize(desc)[0]
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(first_sent), tagset='universal')
        for j, (word, tag) in enumerate(tagged_sent):
            is_label = word == label
            features = feature_extractor(tagged_sent, j, cve_ids[i])
            feature_list.append((features, is_label))
    
    return feature_list

In [27]:
def create_feature_list_v2(feature_extractor, sents, labels, cve_ids) -> list:
    """Uses only first sentence to create feature list given feature extractor."""
    feature_list = list()
    for i, desc in enumerate(sents):
        label = labels[i]
        for j, sent in enumerate(nltk.sent_tokenize(desc)):
            tagged_sent = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
            for k, (word, tag) in enumerate(tagged_sent):
                is_label = word == label
                features = feature_extractor(tagged_sent, sent_pos=j, word_pos=k, cve_id=cve_ids[i])
                feature_list.append((features, is_label))
    
    return feature_list

In [28]:
def eval_accuracy(classifier, extractor, sentences: list, labels: list, cve_ids: list, n=1) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
        prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_ids[i])) for j in range(len(tagged))]
        probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist)]
        probs  = sorted(probs, key=lambda x: x[2], reverse=True)
        
        most_prob = set([prob[0].lower() for prob in probs[:n]])
        accurates[i] = labels[i] in most_prob
    
#         if not accurates[i]:
#             print('Sentence: ', sent)
#             print('Expected: `%s`' % labels[i], 'got: `%s`' % probs, '\n')
    
    bag = Counter(accurates)
    return bag[True] / len(labels)

In [29]:
def eval_accuracy_v2(classifier, extractor, sentences: list, labels: list, cve_ids: list, n=1) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        for s in nltk.sent_tokenize(sent):
            tagged = nltk.pos_tag(nltk.word_tokenize(s), tagset='universal')
            prob_dist = [classifier.prob_classify(extractor(tagged, s, j, cve_ids[i])) for j in range(len(tagged))]
            probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist)]
            probs  = sorted(probs, key=lambda x: x[2], reverse=True)

            most_prob = set([prob[0].lower() for prob in probs[:n]])
            accurates[i] = labels[i] in most_prob
    
#         print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob)
    
    # TODO: come up with more sophisticated way of measuring accuracy
    bag = Counter(accurates)
    return bag[True] / len(labels)

In [30]:
def extract_features_vanilla(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    features = {
        'tag': tag,
        'has-uppercase': word[0].isupper(),
        'word-len': len(word) > 3,
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        features['prev-word'] = tagged[pos - 1][0].lower()
        features['prev-tag'] = tagged[pos - 1][1]
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_vanilla, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_vanilla, descriptions, labels, cve_ids, n=1)

Most Informative Features
               prev-word = 'the'           False : True   =      8.2 : 1.0
               prev-word = None            False : True   =      8.0 : 1.0
               prev-word = 'a'             False : True   =      7.7 : 1.0
                prev-tag = 'ADP'            True : False  =      5.0 : 1.0
                     tag = 'ADV'            True : False  =      5.0 : 1.0
                     tag = 'ADJ'            True : False  =      3.6 : 1.0
                prev-tag = 'NOUN'          False : True   =      2.5 : 1.0
               prev-word = 'aka'            True : False  =      2.3 : 1.0
                prev-tag = '<start>'        True : False  =      1.8 : 1.0
                word-len = True             True : False  =      1.7 : 1.0


0.15702479338842976

In [31]:
def extract_features_v0(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    features = {
        'tag': tag,
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'word-len': len(word) > 3,
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-tag'] = tagged[pos - 1][1]
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])
            
        features['prev-word'] = tagged[pos - 1][0].lower()
        features['prev-tag'] = tagged[pos - 1][1]
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v0, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v0, descriptions, labels, cve_ids, n=1)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             prev-bigram = None            False : True   =     19.0 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
               prev-word = 'the'           False : True   =      8.2 : 1.0
               prev-word = None            False : True   =      8.0 : 1.0
               prev-word = 'a'             False : True   =      7.7 : 1.0
             prev-bigram = 'is for'         True : False  =      5.1 : 1.0
                prev-tag = 'ADP'            True : False  =      5.0 : 1.0
                     tag = 'ADV'            True : False  =      5.0 : 1.0
             prev-bigram = 'in the'        False : True   =      4.5 : 1.0


0.8760330578512396

In [32]:
def extract_features_v1(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    cases = [w.isupper() for w in word]
    features = {
        'tag': tag,
        'has-uppercase': any(cases) and not all(cases),
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'word-len-threshold': len(word) > 3
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-tag'] = tagged[pos - 1][1]
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])
            
        features['prev-word'] = tagged[pos - 1][0].lower()
        features['prev-tag'] = tagged[pos - 1][1]
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v1, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v1, descriptions, labels, cve_ids, n=1)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             prev-bigram = None            False : True   =     19.0 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
               prev-word = 'the'           False : True   =      8.2 : 1.0
               prev-word = None            False : True   =      8.0 : 1.0
               prev-word = 'a'             False : True   =      7.7 : 1.0
             prev-bigram = 'is for'         True : False  =      5.1 : 1.0
                prev-tag = 'ADP'            True : False  =      5.0 : 1.0
                     tag = 'ADV'            True : False  =      5.0 : 1.0
             prev-bigram = 'in the'        False : True   =      4.5 : 1.0


0.8842975206611571

In [33]:
regex_tokenizer = nltk.RegexpTokenizer(pattern=u"[-_]", gaps=True)
# nltk.pos_tag(regex_tokenizer.tokenize(v), tagset='universal')

In [34]:
def extract_features_v2(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    features = {
        'tag': tag,
        'word-len': len(word) > 3,
        'has-uppercase': any(w.isupper() for w in word),
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-tag'] = tagged[pos - 1][1]
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])
            
        
        if pos < len(tagged):    
            features['next-bigram'] = " ".join(w.lower() for w, t in tagged[pos + 1: pos + 3])
            features['next-bigram-tags'] = " ".join(t for w, t in tagged[pos + 1: pos + 3])
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v2, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v2, descriptions, labels, cve_ids, n=1)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             next-bigram = '3.0 .'          True : False  =     32.5 : 1.0
        next-bigram-tags = 'NUM ADJ'        True : False  =     27.3 : 1.0
             prev-bigram = None            False : True   =     19.0 : 1.0
             next-bigram = None            False : True   =     15.6 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
        next-bigram-tags = 'NUM .'          True : False  =      9.4 : 1.0
        next-bigram-tags = 'NUM NOUN'       True : False  =      5.8 : 1.0
                prev-tag = 'ADP'            True : False  =      5.2 : 1.0
             prev-bigram = 'is for'         True : False  =      5.1 : 1.0


0.8512396694214877

In [35]:
def extract_features_v3(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    try:
        ver_pos = [pos for pos, (w, t) in enumerate(tagged[pos:]) if t == 'NUM'][0]
    except:
        ver_pos = None
        
    ver_follows = any([pos for pos, (w, t) in enumerate(tagged[pos:]) if t == 'NUM' or re.search(u'version', w)])
    features = {
        'tag': tag,
        'word-len': len(word) > 3,
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'version_pos': ver_pos,
        'ver_follows': ver_follows
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos < len(tagged) - 1:    
            features['next-bigram'] = " ".join(w.lower() for w, t in tagged[pos + 1: pos + 3])
            features['next-bigram-tags'] = " ".join(t for w, t in tagged[pos + 1: pos + 3])
            
        features['prev-word'] = tagged[pos - 1][0]
            
            
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v3, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v3, descriptions, labels, cve_ids, n=2)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             next-bigram = '3.0 .'          True : False  =     32.6 : 1.0
             next-bigram = None            False : True   =     31.3 : 1.0
        next-bigram-tags = 'NUM ADJ'        True : False  =     27.5 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
        next-bigram-tags = 'NUM .'          True : False  =      9.5 : 1.0
               prev-word = None            False : True   =      8.4 : 1.0
             version_pos = None            False : True   =      8.0 : 1.0
               prev-word = 'the'           False : True   =      7.1 : 1.0
               prev-word = 'a'             False : True   =      7.0 : 1.0


0.9752066115702479

In [36]:
def extract_features_v4(tagged: list, sent_pos:int, word_pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[word_pos]
    try:
        ver_pos = [pos for pos, (w, t) in enumerate(tagged[word_pos:]) if t == 'NUM'][0]
    except:
        ver_pos = None
    features = {
        'tag': tag,
        'word_len': len(word) > 3,
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'version_pos': ver_pos,
        'sent_pos': sent_pos
    }
    if word_pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if word_pos > 1:
            features['prev_tag'] = tagged[word_pos - 1][1]
            features['prev_bigram'] = " ".join(w.lower() for w, t in tagged[word_pos - 2: word_pos])
        
        if word_pos < len(tagged) - 1:    
            features['next_bigram'] = " ".join(w.lower() for w, t in tagged[word_pos + 1: word_pos + 3])
            features['next_bigram-tags'] = " ".join(t for w, t in tagged[word_pos + 1: word_pos + 3])
            
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_v2(extract_features_v4, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy_v2(classifier, extract_features_v4, descriptions, labels, cve_ids, n=1)

Most Informative Features
             next_bigram = None            False : True   =     49.2 : 1.0
         vend_prod_match = True             True : False  =     37.6 : 1.0
             next_bigram = '3.0 .'          True : False  =     33.3 : 1.0
             prev_bigram = None            False : True   =     29.9 : 1.0
        next_bigram-tags = 'NUM ADJ'        True : False  =     27.7 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
             version_pos = 1                True : False  =      9.9 : 1.0
        next_bigram-tags = 'NUM .'          True : False  =      9.5 : 1.0
        next_bigram-tags = 'NUM NOUN'       True : False  =      5.9 : 1.0
             version_pos = None            False : True   =      5.9 : 1.0


0.6115702479338843

## Test accuracy on the toy dataset using multiple feature extractors

In [37]:
x = np.arange(1, 15)
feature_extractors = [extract_features_vanilla, extract_features_v0, extract_features_v1, extract_features_v2,
                      extract_features_v3
                     ]

split = int(len(descriptions) * 0.2)
test_set, test_labels = descriptions[:split], labels[:split]
train_set, train_labels = descriptions[split:], labels[split:]

accuracy_list = list()
for i, extractor in enumerate(feature_extractors):
    # Lets not split the dataset here for now
    feature_list = create_feature_list_long(extractor, train_set, train_labels, cve_ids[split:])

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()
    
    accuracy_list.append([eval_accuracy(classifier, extractor, test_set, test_labels, cve_ids[:split], i) for i in x])
    
feature_list = create_feature_list_v2(extract_features_v4, train_set, train_labels, cve_ids[split:])

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

accuracy_list.append([eval_accuracy_v2(classifier, extract_features_v4, test_set, test_labels, cve_ids[:split], i) for i in x])

Most Informative Features
         vend_prod_match = True             True : False  =     45.1 : 1.0
             next_bigram = None            False : True   =     40.0 : 1.0
             next_bigram = '3.0 .'          True : False  =     34.2 : 1.0
        next_bigram-tags = 'NUM ADJ'        True : False  =     24.4 : 1.0
             prev_bigram = None            False : True   =     24.3 : 1.0
             version_pos = 1                True : False  =     13.9 : 1.0
        next_bigram-tags = 'NUM .'          True : False  =     12.7 : 1.0
         vend_prod_match = False           False : True   =     10.6 : 1.0
        next_bigram-tags = 'NUM CONJ'       True : False  =      8.1 : 1.0
        next_bigram-tags = 'NUM NOUN'       True : False  =      6.6 : 1.0


### Current approach

In [38]:
from collections import OrderedDict


def get_first_sentence(description):
    """Get only the first sentence from the description."""
    sentences = nltk.sent_tokenize(description)
    return sentences[0] if sentences else ''


def guess_package_name(description):
    from nltk.corpus import stopwords
    """Guess package name from given description.

    Very naive approach. Words starting with uppercase letter
    are considered to be possible package names (minus stop words).

    Returns a list of possible package names, without duplicates.
    """

    stop_words = set()

    try:
        # Fails when no downloaded stopwords are available.
        stop_words.update(stopwords.words('english'))
    except LookupError:
        # Download stopwords since they are not available.
        nltk.download('stopwords')
        stop_words.update(stopwords.words('english'))

    regexp = re.compile('[A-Z][A-Za-z0-9-:]*')  # ? TODO: tweak
    suspects = regexp.findall(description)

    results = []

    if not suspects:
        return results

    results = [x.lower() for x in suspects if x.lower() not in stop_words]
    # get rid of duplicates, but keep order
    results = list(OrderedDict.fromkeys(results))

    return results


def get_package_name_candidates(description):
    """Try to identify possible package names in the CVE's description."""
    pkg_name_candidates = set()
    first_sentence = get_first_sentence(description)
    names = guess_package_name(first_sentence)
    pkg_name_candidates.update(set(names))
    return pkg_name_candidates

In [39]:
def eval_old_accuracy(sentences: list, labels: list) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    guesses = [None] * len(labels)
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        
        names = get_package_name_candidates(sent)
        guesses[i] = len(names)
        accurates[i] = labels[i] in names # only taking the first word .. needs better implementation
    
#         print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob)
    
    # TODO: come up with more sophisticated way of measuring accuracy
    bag = Counter(accurates)
    return bag[True] / len(labels), sum(guesses) / len(guesses)

In [40]:
def get_nof_guesses(sentences: list, labels: list) -> float:
    assert len(sentences) == len(labels)
    
    guesses = [None] * len(labels)
    for i, sent in enumerate(sentences):
        
        names = get_package_name_candidates(sent)
        guesses[i] = len(names)
        
    # TODO: come up with more sophisticated way of measuring accuracy
    return guesses

### Evaluation of the current approach on the test sets

In [41]:
old_accuracy, mean_guess = eval_old_accuracy(test_set, test_labels)

### Evaluation of the current approach on the whole set

In [42]:
eval_old_accuracy(descriptions, labels)

(0.7355371900826446, 3.347107438016529)

## Draw extractor accuracy

In [43]:
trace_names = ['vanilla_extractor'] + ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [go.Scatter(x=x, y=ac, name=trace_names[i]) for i, ac in enumerate(accuracy_list)]

layout = go.Layout(
    yaxis=dict(
        title='Accuracy',
        titlefont=dict(
            color='grey'
        )
    ),
    xaxis=dict(
        title='Candidates',
        titlefont=dict(
            color='grey'
        )
    ),
    shapes=[
        {
            'type': 'line',
            'x0': mean_guess,
            'x1': mean_guess,
            'y0': -0.05,
            'y1': 1.1,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        },
        {
            'type': 'line',
            'x0': -0.5,
            'x1': 20,
            'y0': old_accuracy,
            'y1': old_accuracy,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        }
    ]
)

fig = go.Figure(data=data, layout=layout)

In [44]:
iplot(fig, show_link=False)

In [45]:
guess_trace = go.Scatter(y=get_nof_guesses(test_set, test_labels))
layout = go.Layout(
    xaxis=dict(
        ticks='',
        showticklabels=False,
        showgrid=False
    ),
    shapes=[
        {
            'type': 'line',
            'y0': mean_guess,
            'y1': mean_guess,
            'x0': -0.1,
            'x1': 23,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        },
    ]
)

fig = go.Figure(data=[guess_trace], layout=layout)

In [46]:
iplot(fig, show_link=False)

## Remove words/stopwords when evaluating accuracy

In [47]:
stopwords = set(nltk.corpus.stopwords.words())

def eval_accuracy_spec(classifier, extractor, sentences: list, labels: list, cve_ids: list, n=1) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
        prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_ids[i])) for j in range(len(tagged))]
        # get probs and remove stopwords
        probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist) if word not in stopwords and tag == 'NOUN']
        probs = sorted(probs, key=lambda x: x[2], reverse=True)
        
        most_prob = set([prob[0].lower() for prob in probs[:n]])
        accurates[i] = labels[i] in most_prob
    
#         if not accurates[i]:
#             print('Sentence: ', sent)
#             print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob)
    
    bag = Counter(accurates)
    return bag[True] / len(labels)

In [48]:
accuracy_list = list()
for i, extractor in enumerate(feature_extractors):
    # Lets not split the dataset here for now
    feature_list = create_feature_list_long(extractor, train_set, train_labels, cve_ids[split:])

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()
    
    accuracy_list.append([eval_accuracy_spec(classifier, extractor, test_set, test_labels, cve_ids[:split], i) for i in x])
    
feature_list = create_feature_list_v2(extract_features_v4, train_set, train_labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

accuracy_list.append([eval_accuracy_v2(classifier, extract_features_v4, test_set, test_labels, cve_ids, i) for i in x])

Most Informative Features
             next_bigram = None            False : True   =     40.0 : 1.0
             next_bigram = '3.0 .'          True : False  =     34.2 : 1.0
        next_bigram-tags = 'NUM ADJ'        True : False  =     24.4 : 1.0
             prev_bigram = None            False : True   =     24.3 : 1.0
             version_pos = 1                True : False  =     13.9 : 1.0
        next_bigram-tags = 'NUM .'          True : False  =     12.7 : 1.0
        next_bigram-tags = 'NUM CONJ'       True : False  =      8.1 : 1.0
        next_bigram-tags = 'NUM NOUN'       True : False  =      6.6 : 1.0
             version_pos = None            False : True   =      6.0 : 1.0
                     tag = 'ADV'            True : False  =      5.5 : 1.0


In [49]:
trace_names = ['vanilla_extractor'] + ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [go.Scatter(x=x, y=ac, name=trace_names[i]) for i, ac in enumerate(accuracy_list)]

layout = go.Layout(
    yaxis=dict(
        title='Accuracy',
        titlefont=dict(
            color='grey'
        )
    ),
    xaxis=dict(
        title='Candidates',
        titlefont=dict(
            color='grey'
        )
    ),
    shapes=[
        {
            'type': 'line',
            'x0': mean_guess,
            'x1': mean_guess,
            'y0': -0.05,
            'y1': 1.1,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        },
        {
            'type': 'line',
            'x0': -0.5,
            'x1': 20,
            'y0': old_accuracy,
            'y1': old_accuracy,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        }
    ]
)

fig = go.Figure(data=data, layout=layout)

In [50]:
iplot(fig, show_link=False)

## Try predictions on df

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1125 entries, (Microsoft, ChakraCore) to (memcached, memcached)
Data columns (total 7 columns):
cve_id           1125 non-null object
url              1125 non-null object
description      1125 non-null object
version_range    1125 non-null object
Java             1125 non-null int64
JavaScript       1125 non-null int64
Python           1125 non-null int64
dtypes: int64(3), object(4)
memory usage: 81.9+ KB


In [52]:
def predict(sent:str, classifier=None, extractor=None, cve_id: list = None, n=1) -> float:
    tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
    
    prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_id)) for j in range(len(tagged))]
    
    probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist) if word not in stopwords and tag == 'NOUN' and not re.match(u"version[s]?", word)]
    probs = sorted(probs, key=lambda x: x[2], reverse=True)

    return probs[:n]

In [53]:
# Choose a classifier and train it on the whole df
feature_list = create_feature_list_long(extract_features_v3, descriptions, labels, cve_ids)
classifier = nltk.NaiveBayesClassifier.train(feature_list)

In [54]:
predictions = [None] * len(df)
for i, desc in enumerate(df.description.values):
    probs = predict(desc, classifier, extract_features_v3, cve_id=df.cve_id.values[i], n=3)
    predictions[i] = probs

In [55]:
# get just the names
pred_proj_names = [tuple(zip(*p))[0] for p in predictions]

In [56]:
pred_df = pd.Series(pred_proj_names, name='prediction')

In [58]:
df['prediction'] = pred_proj_names
df[(df.Java == 0) | (df.JavaScript == 0)][['description', 'prediction']].tail(100).reset_index().style

Unnamed: 0,username,project,description,prediction
0,paintballrefjosh,MaNGOSWebV4,paintballrefjosh/MaNGOSWebV4 4.0.8 is vulnerable to a reflected XSS in inc/admin/template_files/admin.faq.php (id parameter).,"('paintballrefjosh/MaNGOSWebV4', 'parameter', 'inc/admin/template_files/admin.faq.php')"
1,paintballrefjosh,MaNGOSWebV4,paintballrefjosh/MaNGOSWebV4 4.0.8 is vulnerable to a reflected XSS in inc/admin/template_files/admin.donate.php (id parameter).,"('paintballrefjosh/MaNGOSWebV4', 'parameter', 'inc/admin/template_files/admin.donate.php')"
2,paintballrefjosh,MaNGOSWebV4,paintballrefjosh/MaNGOSWebV4 4.0.8 is vulnerable to a reflected XSS in inc/admin/template_files/admin.fplinks.php (linkid parameter).,"('paintballrefjosh/MaNGOSWebV4', 'parameter', 'inc/admin/template_files/admin.fplinks.php')"
3,paintballrefjosh,MaNGOSWebV4,paintballrefjosh/MaNGOSWebV4 4.0.8 is vulnerable to a reflected XSS in inc/admin/template_files/admin.shop.php (id parameter).,"('paintballrefjosh/MaNGOSWebV4', 'parameter', 'inc/admin/template_files/admin.shop.php')"
4,paintballrefjosh,MaNGOSWebV4,paintballrefjosh/MaNGOSWebV4 4.0.8 is vulnerable to a reflected XSS in inc/admin/template_files/admin.vote.php (id parameter).,"('paintballrefjosh/MaNGOSWebV4', 'parameter', 'inc/admin/template_files/admin.vote.php')"
5,phpipam,phpipam,Multiple Cross-Site Scripting (XSS) issues were discovered in phpipam 1.2. The vulnerabilities exist due to insufficient filtration of user-supplied data passed to several pages (instructions in app/admin/instructions/preview.php; subnetId in app/admin/powerDNS/refresh-ptr-records.php). An attacker could execute arbitrary HTML and script code in a browser in the context of the vulnerable website.,"('pages', 'app/admin/instructions/preview.php', 'Cross-Site')"
6,reasoncms,reasoncms,"A Cross-Site Scripting (XSS) issue was discovered in reasoncms before 4.7.1. The vulnerability exists due to insufficient filtration of user-supplied data (nyroModalSel) passed to the ""reasoncms-master/www/nyroModal/demoSent.php"" URL. An attacker could execute arbitrary HTML and script code in a browser in the context of the vulnerable website.","('reasoncms', 'Scripting', 'data')"
7,Telaxus,EPESI,"Multiple Cross-Site Scripting (XSS) issues were discovered in EPESI 1.8.1.1. The vulnerabilities exist due to insufficient filtration of user-supplied data (state, element, id, tab, cid) passed to the ""EPESI-master/modules/Utils/RecordBrowser/favorites.php"" URL. An attacker could execute arbitrary HTML and script code in a browser in the context of the vulnerable website.","('EPESI', 'Cross-Site', 'Scripting')"
8,Telaxus,EPESI,"Multiple Cross-Site Scripting (XSS) issues were discovered in EPESI 1.8.1.1. The vulnerabilities exist due to insufficient filtration of user-supplied data (visible, tab, cid) passed to the EPESI-master/modules/Utils/RecordBrowser/Filters/save_filters.php URL. An attacker could execute arbitrary HTML and script code in a browser in the context of the vulnerable website.","('EPESI', 'Cross-Site', 'Scripting')"
9,Telaxus,EPESI,"Multiple Cross-Site Scripting (XSS) issues were discovered in EPESI 1.8.1.1. The vulnerabilities exist due to insufficient filtration of user-supplied data (element, state, cat, id, cid) passed to the EPESI-master/modules/Utils/Watchdog/subscribe.php URL. An attacker could execute arbitrary HTML and script code in a browser in the context of the vulnerable website.","('EPESI', 'Cross-Site', 'Scripting')"
