In [1]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json

from nvdlib.nvd import NVD

In [2]:
import nltk

In [3]:
import ast
df = pd.read_csv('dataframe-nvd-2017.csv', converters={'version_range': ast.literal_eval})

In [4]:
import plotly

from plotly import graph_objs as go
from plotly.offline import iplot, init_notebook_mode

init_notebook_mode(connected=True)

In [5]:
%matplotlib inline

## Prepare the DataFrame

In [6]:
nvd = NVD.from_feeds(['2017'])

In [7]:
nvd.update()

In [8]:
GH_BASE_URL = u"http[s]://github.com"

In [9]:
ecos = ['Java', 'JavaScript', 'Python']
df_ecos = df[ecos]
df_ecos = df[['username', 'project', 'version_range', 'url', *ecos]][df_ecos.any(1)]

df_ecos.set_index(['username', 'project'])

Unnamed: 0_level_0,Unnamed: 1_level_0,version_range,url,Java,JavaScript,Python
username,project,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Microsoft,ChakraCore,"(None, None)",https://github.com/Microsoft/ChakraCore,0,49258214,84984
torproject,tor,"(0.3.0.7, 0.2.4.28)",https://github.com/torproject/tor,0,0,245996
lota,phamm,"(0.6.6, 0.6.6)",https://github.com/lota/phamm,0,1170,0
JiounDai,CVE-2017-0478,"(None, None)",https://github.com/JiounDai/CVE-2017-0478,6631,0,0
alephsecurity,PoCs,"(None, None)",https://github.com/alephsecurity/PoCs,0,0,1950
zulip,zulip,"(1.4.2, 1.4.2)",https://github.com/zulip/zulip,0,2403124,4774989
recurly,recurly-client-python,"(2.4.0, 2.6.1)",https://github.com/recurly/recurly-client-python,0,0,156458
fedora-infra,fedmsg,"(0.18.1, 0.18.1)",https://github.com/fedora-infra/fedmsg,0,0,385668
twisted,txaws,"(-, -)",https://github.com/twisted/txaws,0,0,783299
akeneo,pim-community-dev,"(1.4.0, 1.6.5)",https://github.com/akeneo/pim-community-dev,0,2167590,0


In [10]:
import re
def get_reference(cve, url=None, pattern=None) -> str:
    for ref in cve.references:
        if url and url == ref:
            return ref

        if re.search(pattern, ref):
            return ref

    return None

In [11]:
def strip_src_url(url: str):
    """ATM assume that the only reference to source is github."""
    url_base_pattern = u"http[s]://github.com/([\w-]+)/([\w-]+[.]*[\w-]+)"
    strip_url = re.search(url_base_pattern, url)
    
    if not strip_url:
        print(url)
        return None
    
    return strip_url[0]

In [12]:
# Get descriptions and append them to the current DataFrame to avoid recreating a new one from scratch
cves = dict()
cve_list = list()

for cve in nvd.cves():
    ref = get_reference(cve, pattern="http[s]://github.com")
    if ref is None:
        continue
        
    ref = strip_src_url(ref)
    cve_list.append((cve.cve_id, ref, cve.description))
    cves[cve.cve_id] = cve

In [13]:
df_desc = pd.DataFrame(cve_list, columns=['cve_id', 'url', 'description'])

In [14]:
df = pd.merge(df_ecos, df_desc, how='inner', on='url').set_index(['username', 'project'])

In [15]:
df.columns.tolist()
df = df[[
    'cve_id',
    'url',
    'description',
    'version_range',
    'Java',
    'JavaScript',
    'Python',
]]

In [16]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,cve_id,url,description,version_range,Java,JavaScript,Python
username,project,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Microsoft,ChakraCore,CVE-2017-0028,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists w...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0152,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists i...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0196,https://github.com/Microsoft/ChakraCore,An information disclosure vulnerability in Mic...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0223,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists i...,"(None, None)",0,49258214,84984
Microsoft,ChakraCore,CVE-2017-0252,https://github.com/Microsoft/ChakraCore,A remote code execution vulnerability exists i...,"(None, None)",0,49258214,84984
torproject,tor,CVE-2017-0375,https://github.com/torproject/tor,The hidden-service feature in Tor before 0.3.0...,"(0.3.0.7, 0.2.4.28)",0,0,245996
torproject,tor,CVE-2017-0376,https://github.com/torproject/tor,The hidden-service feature in Tor before 0.3.0...,"(0.3.0.7, 0.2.4.28)",0,0,245996
torproject,tor,CVE-2017-0377,https://github.com/torproject/tor,Tor 0.3.x before 0.3.0.9 has a guard-selection...,"(0.3.0.7, 0.2.4.28)",0,0,245996
torproject,tor,CVE-2017-0380,https://github.com/torproject/tor,The rend_service_intro_established function in...,"(0.3.0.7, 0.2.4.28)",0,0,245996
lota,phamm,CVE-2017-0378,https://github.com/lota/phamm,XSS exists in the login_form function in views...,"(0.6.6, 0.6.6)",0,1170,0


## Create toy data set

In [17]:
toy_df = df[['cve_id', 'description']][df.Java > 1E2]

In [18]:
# labels are a position of the project token in the token list
labels = [None] * len(toy_df)
for i, (index, row) in enumerate(toy_df.iterrows()):
    proj = index[1]
    desc = row.description.lower()
    # find the position of proj in the description, if applicable
    tokens = nltk.word_tokenize(desc)
    found = False
    for pos, token in enumerate(tokens):
        if token == proj:
            found = True
            break
    if not found:
        pos = None
    labels[i] = (row.cve_id, token, pos)

# turn index into series
labels = pd.DataFrame(labels, columns=['cve_id', 'label', 'pos'])

In [19]:
toy_df = toy_df.reset_index().merge(labels, how='outer', on='cve_id').set_index(['username', 'project'])

In [20]:
toy_df = toy_df[toy_df.pos.notnull()]

In [21]:
toy_df[['description', 'pos']].reset_index().style

Unnamed: 0,username,project,description,pos
0,modxcms,revolution,MODX Revolution version 2.x - 2.5.6 is vulnerable to blind SQL injection caused by improper sanitization by the escape method resulting in authenticated user accessing database and possibly escalating privileges.,1
1,modxcms,revolution,"In MODX Revolution 2.5.7, the ""key"" and ""name"" parameters in the System Settings module are vulnerable to XSS. A malicious payload sent to connectors/index.php will be triggered by every user, when they visit this module.",2
2,modxcms,revolution,Directory traversal in setup/processors/url_search.php (aka the search page of an unused processor) in MODX Revolution 2.5.7 might allow remote attackers to obtain system directory information.,16
3,modxcms,revolution,"In MODX Revolution before 2.5.7, when PHP 5.3.3 is used, an attacker is able to include and execute arbitrary files on the web server due to insufficient validation of the action parameter to setup/index.php, aka directory traversal.",2
4,modxcms,revolution,"In MODX Revolution before 2.5.7, an attacker is able to trigger Reflected XSS by injecting payloads into several fields on the setup page, as demonstrated by the database_type parameter.",2
5,modxcms,revolution,"In MODX Revolution before 2.5.7, a user with file upload permissions is able to execute arbitrary code by uploading a file with the name .htaccess.",2
6,modxcms,revolution,"In MODX Revolution before 2.5.7, a user with resource edit permissions can inject an XSS payload into the title of any post via the pagetitle parameter to connectors/index.php.",2
7,modxcms,revolution,"In MODX Revolution before 2.5.7, an attacker might be able to trigger XSS by injecting a payload into the HTTP Host header of a request. This is exploitable only in conjunction with other issues such as Cache Poisoning.",2
8,ngallagher,simplexml,"SimpleXML (latest version 2.7.1) is vulnerable to an XXE vulnerability resulting SSRF, information disclosure, DoS and so on.",0
9,swagger-api,swagger-parser,"A vulnerability in Swagger-Parser's version <= 1.0.30 and Swagger codegen version <= 2.2.2 yaml parsing functionality results in arbitrary code being executed when a maliciously crafted yaml Open-API specification is parsed. This in particular, affects the 'generate' and 'validate' command in swagger-codegen (<= 2.2.2) and can lead to arbitrary code being executed when these commands are used on a well-crafted yaml specification.",3


## Decription processing

In [22]:
def similar(word, cpes) -> bool:
    for cpe in cpes:
        vendor, = cpe.vendor
        product, = cpe.product
        if word.lower() in {vendor.lower(), product.lower()}:
            return True
    
    return False

In [23]:
df.loc['FasterXML']

Unnamed: 0_level_0,cve_id,url,description,version_range,Java,JavaScript,Python
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
jackson-databind,CVE-2017-15095,https://github.com/FasterXML/jackson-databind,A deserialization flaw was discovered in the j...,"(None, None)",6201368,0,0
jackson-databind,CVE-2017-17485,https://github.com/FasterXML/jackson-databind,FasterXML jackson-databind through 2.8.10 and ...,"(None, None)",6201368,0,0
jackson-databind,CVE-2017-7525,https://github.com/FasterXML/jackson-databind,A deserialization flaw was discovered in the j...,"(None, None)",6201368,0,0


In [24]:
descriptions, labels, cve_ids = list(zip(*toy_df[['description', 'label', 'cve_id']].values))

In [25]:
def create_feature_list_long(feature_extractor, sents, labels, cve_ids) -> list:
    """Uses all sentences to create feature list given feature extractor."""
    feature_list = list()
    for i, desc in enumerate(sents):
        label = labels[i]
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(desc), tagset='universal')
        for j, (word, tag) in enumerate(tagged_sent):
            is_label = word == label
            features = feature_extractor(tagged_sent, j, cve_ids[i])
            feature_list.append((features, is_label))
    
    return feature_list

In [26]:
def create_feature_list_short(feature_extractor, sents, labels, cve_ids) -> list:
    """Uses only first sentence to create feature list given feature extractor."""
    feature_list = list()
    for i, desc in enumerate(sents):
        label = labels[i]
        first_sent = nltk.sent_tokenize(desc)[0]
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(first_sent), tagset='universal')
        for j, (word, tag) in enumerate(tagged_sent):
            is_label = word == label
            features = feature_extractor(tagged_sent, j, cve_ids[i])
            feature_list.append((features, is_label))
    
    return feature_list

In [27]:
def create_feature_list_v2(feature_extractor, sents, labels, cve_ids) -> list:
    """Uses only first sentence to create feature list given feature extractor."""
    feature_list = list()
    for i, desc in enumerate(sents):
        label = labels[i]
        for j, sent in enumerate(nltk.sent_tokenize(desc)):
            tagged_sent = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
            for k, (word, tag) in enumerate(tagged_sent):
                is_label = word == label
                features = feature_extractor(tagged_sent, sent_pos=j, word_pos=k, cve_id=cve_ids[i])
                feature_list.append((features, is_label))
    
    return feature_list

In [28]:
def eval_accuracy(classifier, extractor, sentences: list, labels: list, cve_ids: list, n=1) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
        prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_ids[i])) for j in range(len(tagged))]
        probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist)]
        probs  = sorted(probs, key=lambda x: x[2], reverse=True)
        
        most_prob = set([prob[0].lower() for prob in probs[:n]])
        accurates[i] = labels[i] in most_prob
    
#         if not accurates[i]:
#             print('Sentence: ', sent)
#             print('Expected: `%s`' % labels[i], 'got: `%s`' % probs, '\n')
    
    bag = Counter(accurates)
    return bag[True] / len(labels)

In [29]:
def eval_accuracy_v2(classifier, extractor, sentences: list, labels: list, cve_ids: list, n=1) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        for s in nltk.sent_tokenize(sent):
            tagged = nltk.pos_tag(nltk.word_tokenize(s), tagset='universal')
            prob_dist = [classifier.prob_classify(extractor(tagged, s, j, cve_ids[i])) for j in range(len(tagged))]
            probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist)]
            probs  = sorted(probs, key=lambda x: x[2], reverse=True)

            most_prob = set([prob[0].lower() for prob in probs[:n]])
            accurates[i] = labels[i] in most_prob
    
#         print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob)
    
    # TODO: come up with more sophisticated way of measuring accuracy
    bag = Counter(accurates)
    return bag[True] / len(labels)

In [30]:
def extract_features_vanilla(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    features = {
        'tag': tag,
        'has-uppercase': word[0].isupper(),
        'word-len': len(word) > 3,
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        features['prev-word'] = tagged[pos - 1][0].lower()
        features['prev-tag'] = tagged[pos - 1][1]
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_vanilla, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_vanilla, descriptions, labels, cve_ids, n=1)

Most Informative Features
               prev-word = 'the'           False : True   =      8.2 : 1.0
               prev-word = None            False : True   =      8.0 : 1.0
               prev-word = 'a'             False : True   =      7.7 : 1.0
                prev-tag = 'ADP'            True : False  =      5.0 : 1.0
                     tag = 'ADV'            True : False  =      5.0 : 1.0
                     tag = 'ADJ'            True : False  =      3.6 : 1.0
                prev-tag = 'NOUN'          False : True   =      2.5 : 1.0
               prev-word = 'aka'            True : False  =      2.3 : 1.0
                prev-tag = '<start>'        True : False  =      1.8 : 1.0
                word-len = True             True : False  =      1.7 : 1.0


0.15702479338842976

In [31]:
def extract_features_v0(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    features = {
        'tag': tag,
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'word-len': len(word) > 3,
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-tag'] = tagged[pos - 1][1]
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])
            
        features['prev-word'] = tagged[pos - 1][0].lower()
        features['prev-tag'] = tagged[pos - 1][1]
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v0, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v0, descriptions, labels, cve_ids, n=1)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             prev-bigram = None            False : True   =     19.0 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
               prev-word = 'the'           False : True   =      8.2 : 1.0
               prev-word = None            False : True   =      8.0 : 1.0
               prev-word = 'a'             False : True   =      7.7 : 1.0
             prev-bigram = 'is for'         True : False  =      5.1 : 1.0
                prev-tag = 'ADP'            True : False  =      5.0 : 1.0
                     tag = 'ADV'            True : False  =      5.0 : 1.0
             prev-bigram = 'in the'        False : True   =      4.5 : 1.0


0.8760330578512396

In [32]:
def extract_features_v1(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    cases = [w.isupper() for w in word]
    features = {
        'tag': tag,
        'has-uppercase': any(cases) and not all(cases),
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'word-len-threshold': len(word) > 3
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-tag'] = tagged[pos - 1][1]
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])
            
        features['prev-word'] = tagged[pos - 1][0].lower()
        features['prev-tag'] = tagged[pos - 1][1]
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v1, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v1, descriptions, labels, cve_ids, n=1)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             prev-bigram = None            False : True   =     19.0 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
               prev-word = 'the'           False : True   =      8.2 : 1.0
               prev-word = None            False : True   =      8.0 : 1.0
               prev-word = 'a'             False : True   =      7.7 : 1.0
             prev-bigram = 'is for'         True : False  =      5.1 : 1.0
                prev-tag = 'ADP'            True : False  =      5.0 : 1.0
                     tag = 'ADV'            True : False  =      5.0 : 1.0
             prev-bigram = 'in the'        False : True   =      4.5 : 1.0


0.8842975206611571

In [33]:
regex_tokenizer = nltk.RegexpTokenizer(pattern=u"[-_]", gaps=True)
# nltk.pos_tag(regex_tokenizer.tokenize(v), tagset='universal')

In [34]:
def extract_features_v2(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    features = {
        'tag': tag,
        'word-len': len(word) > 3,
        'has-uppercase': any(w.isupper() for w in word),
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos > 1:
            features['prev-tag'] = tagged[pos - 1][1]
            features['prev-bigram'] = " ".join(w.lower() for w, t in tagged[pos - 2: pos])
            
        
        if pos < len(tagged):    
            features['next-bigram'] = " ".join(w.lower() for w, t in tagged[pos + 1: pos + 3])
            features['next-bigram-tags'] = " ".join(t for w, t in tagged[pos + 1: pos + 3])
        
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v2, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v2, descriptions, labels, cve_ids, n=1)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             next-bigram = '3.0 .'          True : False  =     32.5 : 1.0
        next-bigram-tags = 'NUM ADJ'        True : False  =     27.3 : 1.0
             prev-bigram = None            False : True   =     19.0 : 1.0
             next-bigram = None            False : True   =     15.6 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
        next-bigram-tags = 'NUM .'          True : False  =      9.4 : 1.0
        next-bigram-tags = 'NUM NOUN'       True : False  =      5.8 : 1.0
                prev-tag = 'ADP'            True : False  =      5.2 : 1.0
             prev-bigram = 'is for'         True : False  =      5.1 : 1.0


0.8512396694214877

In [35]:
def extract_features_v3(tagged: list, pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[pos]
    try:
        ver_pos = [pos for pos, (w, t) in enumerate(tagged[pos:]) if t == 'NUM'][0]
    except:
        ver_pos = None
        
    ver_follows = any([pos for pos, (w, t) in enumerate(tagged[pos:]) if t == 'NUM' or re.search(u'version', w)])
    features = {
        'tag': tag,
        'word-len': len(word) > 3,
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'version_pos': ver_pos,
        'ver_follows': ver_follows
    }
    if pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if pos < len(tagged) - 1:    
            features['next-bigram'] = " ".join(w.lower() for w, t in tagged[pos + 1: pos + 3])
            features['next-bigram-tags'] = " ".join(t for w, t in tagged[pos + 1: pos + 3])
            
        features['prev-word'] = tagged[pos - 1][0]
            
            
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_long(extract_features_v3, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy(classifier, extract_features_v3, descriptions, labels, cve_ids, n=2)

Most Informative Features
         vend_prod_match = True             True : False  =     37.6 : 1.0
             next-bigram = '3.0 .'          True : False  =     32.6 : 1.0
             next-bigram = None            False : True   =     31.3 : 1.0
        next-bigram-tags = 'NUM ADJ'        True : False  =     27.5 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
        next-bigram-tags = 'NUM .'          True : False  =      9.5 : 1.0
               prev-word = None            False : True   =      8.4 : 1.0
             version_pos = None            False : True   =      8.0 : 1.0
               prev-word = 'the'           False : True   =      7.1 : 1.0
               prev-word = 'a'             False : True   =      7.0 : 1.0


0.9752066115702479

In [36]:
def extract_features_v4(tagged: list, sent_pos:int, word_pos: int, cve_id=None):
    """Extract contextual features from the sentence w.r.t given position of a word."""
    word, tag = tagged[word_pos]
    try:
        ver_pos = [pos for pos, (w, t) in enumerate(tagged[word_pos:]) if t == 'NUM'][0]
    except:
        ver_pos = None
    features = {
        'tag': tag,
        'word_len': len(word) > 3,
        'vend_prod_match': similar(word, cves[cve_id].get_cpe(cpe_type='a')),
        'version_pos': ver_pos,
        'sent_pos': sent_pos
    }
    if word_pos == 0:
        features['prev-tag'] = '<start>'
    else:
        if word_pos > 1:
            features['prev_tag'] = tagged[word_pos - 1][1]
            features['prev_bigram'] = " ".join(w.lower() for w, t in tagged[word_pos - 2: word_pos])
        
        if word_pos < len(tagged) - 1:    
            features['next_bigram'] = " ".join(w.lower() for w, t in tagged[word_pos + 1: word_pos + 3])
            features['next_bigram-tags'] = " ".join(t for w, t in tagged[word_pos + 1: word_pos + 3])
            
    return features

# Lets not split the dataset here for now
feature_list = create_feature_list_v2(extract_features_v4, descriptions, labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

eval_accuracy_v2(classifier, extract_features_v4, descriptions, labels, cve_ids, n=1)

Most Informative Features
             next_bigram = None            False : True   =     49.2 : 1.0
         vend_prod_match = True             True : False  =     37.6 : 1.0
             next_bigram = '3.0 .'          True : False  =     33.3 : 1.0
             prev_bigram = None            False : True   =     29.9 : 1.0
        next_bigram-tags = 'NUM ADJ'        True : False  =     27.7 : 1.0
         vend_prod_match = False           False : True   =     10.9 : 1.0
             version_pos = 1                True : False  =      9.9 : 1.0
        next_bigram-tags = 'NUM .'          True : False  =      9.5 : 1.0
        next_bigram-tags = 'NUM NOUN'       True : False  =      5.9 : 1.0
             version_pos = None            False : True   =      5.9 : 1.0


0.6115702479338843

## Test accuracy on the toy dataset using multiple feature extractors

In [37]:
x = np.arange(1, 15)
feature_extractors = [extract_features_vanilla, extract_features_v0, extract_features_v1, extract_features_v2,
                      extract_features_v3
                     ]

split = int(len(descriptions) * 0.2)
test_set, test_labels = descriptions[:split], labels[:split]
train_set, train_labels = descriptions[split:], labels[split:]

accuracy_list = list()
for i, extractor in enumerate(feature_extractors):
    # Lets not split the dataset here for now
    feature_list = create_feature_list_long(extractor, train_set, train_labels, cve_ids[split:])

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()
    
    accuracy_list.append([eval_accuracy(classifier, extractor, test_set, test_labels, cve_ids[:split], i) for i in x])
    
feature_list = create_feature_list_v2(extract_features_v4, train_set, train_labels, cve_ids[split:])

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

accuracy_list.append([eval_accuracy_v2(classifier, extract_features_v4, test_set, test_labels, cve_ids[:split], i) for i in x])

Most Informative Features
         vend_prod_match = True             True : False  =     45.1 : 1.0
             next_bigram = None            False : True   =     40.0 : 1.0
             next_bigram = '3.0 .'          True : False  =     34.2 : 1.0
        next_bigram-tags = 'NUM ADJ'        True : False  =     24.4 : 1.0
             prev_bigram = None            False : True   =     24.3 : 1.0
             version_pos = 1                True : False  =     13.9 : 1.0
        next_bigram-tags = 'NUM .'          True : False  =     12.7 : 1.0
         vend_prod_match = False           False : True   =     10.6 : 1.0
        next_bigram-tags = 'NUM CONJ'       True : False  =      8.1 : 1.0
        next_bigram-tags = 'NUM NOUN'       True : False  =      6.6 : 1.0


### Current approach

In [38]:
from collections import OrderedDict


def get_first_sentence(description):
    """Get only the first sentence from the description."""
    sentences = nltk.sent_tokenize(description)
    return sentences[0] if sentences else ''


def guess_package_name(description):
    from nltk.corpus import stopwords
    """Guess package name from given description.

    Very naive approach. Words starting with uppercase letter
    are considered to be possible package names (minus stop words).

    Returns a list of possible package names, without duplicates.
    """

    stop_words = set()

    try:
        # Fails when no downloaded stopwords are available.
        stop_words.update(stopwords.words('english'))
    except LookupError:
        # Download stopwords since they are not available.
        nltk.download('stopwords')
        stop_words.update(stopwords.words('english'))

    regexp = re.compile('[A-Z][A-Za-z0-9-:]*')  # ? TODO: tweak
    suspects = regexp.findall(description)

    results = []

    if not suspects:
        return results

    results = [x.lower() for x in suspects if x.lower() not in stop_words]
    # get rid of duplicates, but keep order
    results = list(OrderedDict.fromkeys(results))

    return results


def get_package_name_candidates(description):
    """Try to identify possible package names in the CVE's description."""
    pkg_name_candidates = set()
    first_sentence = get_first_sentence(description)
    names = guess_package_name(first_sentence)
    pkg_name_candidates.update(set(names))
    return pkg_name_candidates

In [39]:
def eval_old_accuracy(sentences: list, labels: list) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    guesses = [None] * len(labels)
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        
        names = get_package_name_candidates(sent)
        guesses[i] = len(names)
        accurates[i] = labels[i] in names # only taking the first word .. needs better implementation
    
#         print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob)
    
    # TODO: come up with more sophisticated way of measuring accuracy
    bag = Counter(accurates)
    return bag[True] / len(labels), sum(guesses) / len(guesses)

In [40]:
def get_nof_guesses(sentences: list, labels: list) -> float:
    assert len(sentences) == len(labels)
    
    guesses = [None] * len(labels)
    for i, sent in enumerate(sentences):
        
        names = get_package_name_candidates(sent)
        guesses[i] = len(names)
        
    # TODO: come up with more sophisticated way of measuring accuracy
    return guesses

### Evaluation of the current approach on the test sets

In [41]:
old_accuracy, mean_guess = eval_old_accuracy(test_set, test_labels)

### Evaluation of the current approach on the whole set

In [42]:
eval_old_accuracy(descriptions, labels)

(0.7355371900826446, 3.347107438016529)

## Draw extractor accuracy

In [43]:
trace_names = ['vanilla_extractor'] + ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [go.Scatter(x=x, y=ac, name=trace_names[i]) for i, ac in enumerate(accuracy_list)]

layout = go.Layout(
    yaxis=dict(
        title='Accuracy',
        titlefont=dict(
            color='grey'
        )
    ),
    xaxis=dict(
        title='Candidates',
        titlefont=dict(
            color='grey'
        )
    ),
    shapes=[
        {
            'type': 'line',
            'x0': mean_guess,
            'x1': mean_guess,
            'y0': -0.05,
            'y1': 1.1,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        },
        {
            'type': 'line',
            'x0': -0.5,
            'x1': 20,
            'y0': old_accuracy,
            'y1': old_accuracy,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        }
    ]
)

fig = go.Figure(data=data, layout=layout)

In [44]:
iplot(fig, show_link=False)

In [45]:
guess_trace = go.Scatter(y=get_nof_guesses(test_set, test_labels))
layout = go.Layout(
    xaxis=dict(
        ticks='',
        showticklabels=False,
        showgrid=False
    ),
    shapes=[
        {
            'type': 'line',
            'y0': mean_guess,
            'y1': mean_guess,
            'x0': -0.1,
            'x1': 23,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        },
    ]
)

fig = go.Figure(data=[guess_trace], layout=layout)

In [46]:
iplot(fig, show_link=False)

## Remove words/stopwords when evaluating accuracy

In [47]:
stopwords = set(nltk.corpus.stopwords.words())

def eval_accuracy_spec(classifier, extractor, sentences: list, labels: list, cve_ids: list, n=1) -> float:
    from collections import Counter
    
    assert len(sentences) == len(labels)
    
    accurates = [None] * len(labels)
    for i, sent in enumerate(sentences):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
        prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_ids[i])) for j in range(len(tagged))]
        # get probs and remove stopwords
        probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist) if word not in stopwords and tag == 'NOUN']
        print(probs)
        probs = sorted(probs, key=lambda x: x[2], reverse=True)
        
        most_prob = set([prob[0].lower() for prob in probs[:n]])
        accurates[i] = labels[i] in most_prob
    
        if not accurates[i]:
            print('Sentence: ', sent)
            print('Expected: `%s`' % labels[i], 'got: `%s`' % most_prob)
    
    bag = Counter(accurates)
    return bag[True] / len(labels)

In [48]:
accuracy_list = list()
for i, extractor in enumerate(feature_extractors):
    # Lets not split the dataset here for now
    feature_list = create_feature_list_long(extractor, train_set, train_labels, cve_ids[split:])

    classifier = nltk.NaiveBayesClassifier.train(feature_list)
#     classifier.show_most_informative_features()
    
    accuracy_list.append([eval_accuracy_spec(classifier, extractor, test_set, test_labels, cve_ids[:split], i) for i in x])
    
feature_list = create_feature_list_v2(extract_features_v4, train_set, train_labels, cve_ids)

classifier = nltk.NaiveBayesClassifier.train(feature_list)
classifier.show_most_informative_features()

accuracy_list.append([eval_accuracy_v2(classifier, extract_features_v4, test_set, test_labels, cve_ids, i) for i in x])

[('MODX', 'NOUN', 0.0030510463144199), ('Revolution', 'NOUN', 0.005122980583748046), ('version', 'NOUN', 0.036648633813404635), ('SQL', 'NOUN', 4.9624460059082774e-05), ('injection', 'NOUN', 0.036648633813404635), ('sanitization', 'NOUN', 0.019112587043665932), ('escape', 'NOUN', 0.0018203790518677395), ('method', 'NOUN', 0.036648633813404635), ('user', 'NOUN', 0.001352519117281349), ('database', 'NOUN', 0.014679171686772737), ('privileges', 'NOUN', 0.014679171686772737)]
Sentence:  MODX Revolution version 2.x - 2.5.6 is vulnerable to blind SQL injection caused by improper sanitization by the escape method resulting in authenticated user accessing database and possibly escalating privileges.
Expected: `revolution` got: `{'version'}`
[('MODX', 'NOUN', 0.007989493693372844), ('Revolution', 'NOUN', 0.005122980583748046), ('name', 'NOUN', 0.0004506386520118372), ('parameters', 'NOUN', 0.0004749859279574337), ('System', 'NOUN', 0.00024678921490626734), ('Settings', 'NOUN', 0.005122980583748

[('MODX', 'NOUN', 0.007989493693372844), ('Revolution', 'NOUN', 0.005122980583748046), ('user', 'NOUN', 0.0019043770103975462), ('file', 'NOUN', 0.02778263840425215), ('permissions', 'NOUN', 0.019112587043665932), ('code', 'NOUN', 0.0004750180486773723), ('file', 'NOUN', 0.0019043770103975462), ('name', 'NOUN', 0.0018203790518677395), ('.htaccess', 'NOUN', 0.005405316957769816)]
Sentence:  In MODX Revolution before 2.5.7, a user with file upload permissions is able to execute arbitrary code by uploading a file with the name .htaccess.
Expected: `revolution` got: `{'permissions', 'file'}`
[('MODX', 'NOUN', 0.007989493693372844), ('Revolution', 'NOUN', 0.005122980583748046), ('user', 'NOUN', 0.0019043770103975462), ('resource', 'NOUN', 0.02778263840425215), ('edit', 'NOUN', 0.036648633813404635), ('permissions', 'NOUN', 0.036648633813404635), ('payload', 'NOUN', 0.0004750180486773723), ('title', 'NOUN', 0.0018203790518677395), ('post', 'NOUN', 0.013867726364218418), ('parameter', 'NOUN',

Expected: `revolution` got: `{'conjunction', 'header', 'modx'}`
[('SimpleXML', 'NOUN', 0.0030510463144199), ('version', 'NOUN', 0.019112587043665932), ('XXE', 'NOUN', 3.077401469962619e-06), ('vulnerability', 'NOUN', 0.012522157924691966), ('SSRF', 'NOUN', 0.0020124691908496566), ('information', 'NOUN', 8.412130067504175e-05), ('disclosure', 'NOUN', 0.012522157924691966), ('DoS', 'NOUN', 2.8024369610121127e-07)]
Sentence:  SimpleXML (latest version 2.7.1) is vulnerable to an XXE vulnerability resulting SSRF, information disclosure, DoS and so on.
Expected: `simplexml` got: `{'disclosure', 'version', 'vulnerability'}`
[('vulnerability', 'NOUN', 0.0019043770103975462), ('Swagger-Parser', 'NOUN', 0.007989493693372844), ('version', 'NOUN', 0.008125093331013947), ('<', 'NOUN', 0.0009353623407946211), ('Swagger', 'NOUN', 0.0001090177181564065), ('version', 'NOUN', 0.014679171686772737), ('<', 'NOUN', 0.0009353623407946211), ('yaml', 'NOUN', 0.05145119843070482), ('functionality', 'NOUN', 0.0

Expected: `nv-websocket-client` got: `{'servers', 'certificate', 'client', 'name'}`
[('ELabftw', 'NOUN', 0.0030510463144199), ('version', 'NOUN', 0.036648633813404635), ('scripting', 'NOUN', 0.0007787928378122423), ('experiment', 'NOUN', 0.0018203790518677395), ('infos', 'NOUN', 0.036648633813404635), ('component', 'NOUN', 0.036648633813404635), ('execution', 'NOUN', 0.0004750180486773723), ('JavaScript', 'NOUN', 0.0005376493676017715), ('denial', 'NOUN', 0.0008048491969533984), ('service', 'NOUN', 0.00395849315498599)]
Sentence:  ELabftw version 1.7.8 is vulnerable to stored cross-site scripting in the experiment infos component resulting in arbitrary execution of JavaScript and denial of service.
Expected: `elabftw` got: `{'version', 'infos', 'service', 'component'}`
[('Primetek', 'NOUN', 0.0030510463144199), ('Primefaces', 'NOUN', 0.005122980583748046), ('encryption', 'NOUN', 0.006453085790469036), ('flaw', 'NOUN', 0.012522157924691966), ('code', 'NOUN', 0.00023470388573745442), ('e

[('Odoo', 'NOUN', 0.007989493693372844), ('Odoo', 'NOUN', 1.1387222515489144e-05), ('Community', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.0017135123883797992), ('Odoo', 'NOUN', 0.0001090177181564065), ('Enterprise', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.005122980583748046), ('attackers', 'NOUN', 0.00023470388573745442), ('authentication', 'NOUN', 0.002970720510479935), ('circumstances', 'NOUN', 0.0038818714941932647), ('parameters', 'NOUN', 0.051732965788054946), ('characters', 'NOUN', 0.05145119843070482), ('database', 'NOUN', 0.0018203790518677395), ('layer', 'NOUN', 0.036648633813404635), ('Psycopg', 'NOUN', 0.0073302996519779085)]
[('Odoo', 'NOUN', 0.007989493693372844), ('Odoo', 'NOUN', 1.1387222515489144e-05), ('Community', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.0017135123883797992), ('Odoo', 'NOUN', 0.0001090177181564065), ('Enterprise', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.005122980583748046), ('access', 'NOUN', 0.0017682

Expected: `odoo` got: `{'code', 'unpickle', 'module', 'handling', 'data', 'users'}`
[('Odoo', 'NOUN', 0.007989493693372844), ('Odoo', 'NOUN', 1.1387222515489144e-05), ('Community', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.0017135123883797992), ('Odoo', 'NOUN', 0.0001090177181564065), ('Enterprise', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.005122980583748046), ('attackers', 'NOUN', 0.00023470388573745442), ('authentication', 'NOUN', 0.002970720510479935), ('circumstances', 'NOUN', 0.0038818714941932647), ('parameters', 'NOUN', 0.051732965788054946), ('characters', 'NOUN', 0.05145119843070482), ('database', 'NOUN', 0.0018203790518677395), ('layer', 'NOUN', 0.036648633813404635), ('Psycopg', 'NOUN', 0.0073302996519779085)]
[('Odoo', 'NOUN', 0.007989493693372844), ('Odoo', 'NOUN', 1.1387222515489144e-05), ('Community', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.0017135123883797992), ('Odoo', 'NOUN', 0.0001090177181564065), ('Enterprise', 'NOUN', 0.00512298

[('Odoo', 'NOUN', 0.007989493693372844), ('Odoo', 'NOUN', 1.1387222515489144e-05), ('Community', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.0017135123883797992), ('Odoo', 'NOUN', 0.0001090177181564065), ('Enterprise', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.005122980583748046), ('insecure', 'NOUN', 8.412130067504175e-05), ('handling', 'NOUN', 0.036648633813404635), ('anonymization', 'NOUN', 0.00395849315498599), ('data', 'NOUN', 0.036648633813404635), ('Database', 'NOUN', 0.00024678921490626734), ('Anonymization', 'NOUN', 0.005122980583748046), ('module', 'NOUN', 0.036648633813404635), ('users', 'NOUN', 0.019112587043665932), ('Python', 'NOUN', 6.432336566729362e-05), ('code', 'NOUN', 0.036648633813404635), ('unpickle', 'NOUN', 0.051732965788054946)]
[('Odoo', 'NOUN', 0.007989493693372844), ('Odoo', 'NOUN', 1.1387222515489144e-05), ('Community', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.0017135123883797992), ('Odoo', 'NOUN', 0.0001090177181564065), ('E

[('MODX', 'NOUN', 0.007989493693372844), ('Revolution', 'NOUN', 0.005122980583748046), ('PHP', 'NOUN', 1.5071653090008816e-05), ('attacker', 'NOUN', 0.0009229772670731871), ('files', 'NOUN', 0.0004750180486773723), ('web', 'NOUN', 4.487938119902145e-05), ('server', 'NOUN', 0.0075511172679902415), ('validation', 'NOUN', 0.000721146081189337), ('action', 'NOUN', 0.0018203790518677395), ('parameter', 'NOUN', 0.036648633813404635), ('directory', 'NOUN', 0.004941414476575678), ('traversal', 'NOUN', 0.012522157924691966)]
[('MODX', 'NOUN', 0.007989493693372844), ('Revolution', 'NOUN', 0.005122980583748046), ('attacker', 'NOUN', 0.0009229772670731871), ('XSS', 'NOUN', 4.9624460059082774e-05), ('payloads', 'NOUN', 0.014679171686772737), ('fields', 'NOUN', 0.019112587043665932), ('setup', 'NOUN', 0.0018203790518677395), ('page', 'NOUN', 0.012522157924691966), ('database_type', 'NOUN', 0.0018203790518677395), ('parameter', 'NOUN', 0.036648633813404635)]
[('MODX', 'NOUN', 0.007989493693372844), (

[('vulnerability', 'NOUN', 0.0019043770103975462), ('Swagger-Parser', 'NOUN', 0.007989493693372844), ('version', 'NOUN', 0.008125093331013947), ('<', 'NOUN', 0.0009353623407946211), ('Swagger', 'NOUN', 0.0001090177181564065), ('version', 'NOUN', 0.014679171686772737), ('<', 'NOUN', 0.0009353623407946211), ('yaml', 'NOUN', 0.05145119843070482), ('functionality', 'NOUN', 0.014679171686772737), ('results', 'NOUN', 0.012522157924691966), ('code', 'NOUN', 0.0004750180486773723), ('specification', 'NOUN', 0.019112587043665932), ("'generate", 'NOUN', 0.0018203790518677395), ("'validate", 'NOUN', 0.0008048491969533984), ('command', 'NOUN', 0.005185835901809435), ('swagger-codegen', 'NOUN', 0.05615927106978823), ('=', 'NOUN', 0.00015981696101958245), ('code', 'NOUN', 0.0004750180486773723), ('commands', 'NOUN', 0.04048043587696315), ('yaml', 'NOUN', 0.019112587043665932), ('specification', 'NOUN', 0.036648633813404635)]
Sentence:  A vulnerability in Swagger-Parser's version <= 1.0.30 and Swagge

[('Plexus-utils', 'NOUN', 0.0030510463144199), ('injection', 'NOUN', 0.0021237458089810395), ('contents', 'NOUN', 0.0018203790518677395), ('strings', 'NOUN', 0.014679171686772737)]
[('AndroidSVG', 'NOUN', 0.0030510463144199), ('version', 'NOUN', 0.036648633813404635), ('attacks', 'NOUN', 0.004941414476575678), ('SVG', 'NOUN', 6.0749674005071955e-06), ('component', 'NOUN', 0.014679171686772737), ('denial', 'NOUN', 0.05615927106978823), ('service', 'NOUN', 0.00395849315498599), ('code', 'NOUN', 0.00017946007661894897), ('execution', 'NOUN', 0.000603490379112261)]
[('Awstats', 'NOUN', 0.0030510463144199), ('version', 'NOUN', 0.036648633813404635), ('path', 'NOUN', 0.0019043770103975462), ('traversal', 'NOUN', 0.0075511172679902415), ('flaw', 'NOUN', 0.036648633813404635), ('handling', 'NOUN', 0.0018203790518677395), ('config', 'NOUN', 0.0004506386520118372), ('migrate', 'NOUN', 0.0004506386520118372), ('parameters', 'NOUN', 0.0004749859279574337), ('code', 'NOUN', 0.00023470388573745442),

[('Odoo', 'NOUN', 0.007989493693372844), ('Odoo', 'NOUN', 1.1387222515489144e-05), ('Community', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.0017135123883797992), ('Odoo', 'NOUN', 0.0001090177181564065), ('Enterprise', 'NOUN', 0.005122980583748046), ('Edition', 'NOUN', 0.005122980583748046), ('access', 'NOUN', 0.001768231067784493), ('control', 'NOUN', 0.004209191414650334), ('OAuth', 'NOUN', 0.015986040275787265), ('tokens', 'NOUN', 0.036648633813404635), ('OAuth', 'NOUN', 0.00024678921490626734), ('module', 'NOUN', 0.036648633813404635), ('users', 'NOUN', 0.001352519117281349), ('OAuth', 'NOUN', 0.0020124691908496566), ('sessions', 'NOUN', 0.036648633813404635), ('users', 'NOUN', 0.001768231067784493)]
[('Directory', 'NOUN', 0.0030510463144199), ('traversal', 'NOUN', 0.012522157924691966), ('vulnerability', 'NOUN', 0.036648633813404635), ('tools.file_open', 'NOUN', 0.05615927106978823), ('Odoo', 'NOUN', 0.007989493693372844), ('allows', 'NOUN', 0.05145119843070482), ('users'

[('Heimdal', 'NOUN', 0.0030510463144199), ('allows', 'NOUN', 0.017759566807471334), ('attackers', 'NOUN', 0.00017946007661894897), ('services', 'NOUN', 0.002970720510479935), ('Orpheus', 'NOUN', 0.0038531305804421637), ('Lyre', 'NOUN', 0.0007051000078030074), ('attacks', 'NOUN', 0.036648633813404635), ('names', 'NOUN', 0.019112587043665932), ('way', 'NOUN', 4.6954109440902444e-05), ('Kerberos', 'NOUN', 0.00024678921490626734), ('protocol', 'NOUN', 0.05145119843070482), ('specification', 'NOUN', 0.036648633813404635), ('_krb5_extract_ticket', 'NOUN', 0.05615927106978823), ('KDC-REP', 'NOUN', 0.00024678921490626734), ('service', 'NOUN', 0.036648633813404635), ('name', 'NOUN', 0.0006669733124948976), ('version', 'NOUN', 0.019112587043665932), ("'enc_part", 'NOUN', 0.05615927106978823), ('version', 'NOUN', 0.006453085790469036), ("'ticket", 'NOUN', 0.05615927106978823), ('Use', 'NOUN', 4.471062802431542e-07), ('version', 'NOUN', 0.006453085790469036), ('opportunity', 'NOUN', 0.000922977267

Expected: `heimdal` got: `{"'ticket", 'protocol', 'version', 'code', 'server', "'enc_part", '_krb5_extract_ticket', 'specification', 'service', 'allows', 'attacks', 'names'}`
[('MODX', 'NOUN', 0.477077699287432), ('Revolution', 'NOUN', 0.6055342355151037), ('version', 'NOUN', 0.01640051832586987), ('SQL', 'NOUN', 0.00016066953436644666), ('injection', 'NOUN', 0.01640051832586987), ('sanitization', 'NOUN', 0.008467853278504385), ('escape', 'NOUN', 6.14821839936484e-05), ('method', 'NOUN', 0.01640051832586987), ('user', 'NOUN', 0.0005932526533755057), ('database', 'NOUN', 0.006487293148395452), ('privileges', 'NOUN', 0.006487293148395452)]
[('MODX', 'NOUN', 0.7059625718398442), ('Revolution', 'NOUN', 0.01640051832586987), ('name', 'NOUN', 0.00019756253716495746), ('parameters', 'NOUN', 0.0002082393694280276), ('System', 'NOUN', 4.036794287376816e-05), ('Settings', 'NOUN', 0.01640051832586987), ('module', 'NOUN', 0.01640051832586987), ('XSS', 'NOUN', 5.401858523246492e-07), ('A', 'NOUN', 

[('MODX', 'NOUN', 0.7059625718398442), ('Revolution', 'NOUN', 0.01640051832586987), ('user', 'NOUN', 9.29103567762982e-05), ('file', 'NOUN', 0.012370025279008545), ('permissions', 'NOUN', 0.008467853278504385), ('code', 'NOUN', 5.951302690927025e-06), ('file', 'NOUN', 0.0008355721445381926), ('name', 'NOUN', 0.0007986791374652502), ('.htaccess', 'NOUN', 0.0007933691249199272)]
[('MODX', 'NOUN', 0.7059625718398442), ('Revolution', 'NOUN', 0.01640051832586987), ('user', 'NOUN', 9.29103567762982e-05), ('resource', 'NOUN', 0.012370025279008545), ('edit', 'NOUN', 0.01640051832586987), ('permissions', 'NOUN', 0.01640051832586987), ('payload', 'NOUN', 4.165763133344855e-05), ('title', 'NOUN', 0.000266368207641968), ('post', 'NOUN', 0.006125868764863884), ('parameter', 'NOUN', 0.008467853278504385)]
[('MODX', 'NOUN', 0.7059625718398442), ('Revolution', 'NOUN', 0.01640051832586987), ('attacker', 'NOUN', 3.680862316294119e-05), ('XSS', 'NOUN', 5.35622486698513e-05), ('payload', 'NOUN', 0.0008355

[('ELabftw', 'NOUN', 0.477077699287432), ('version', 'NOUN', 0.0032042154907906332), ('scripting', 'NOUN', 0.0003414901521040833), ('experiment', 'NOUN', 4.036794287376816e-05), ('infos', 'NOUN', 0.01640051832586987), ('component', 'NOUN', 0.01640051832586987), ('execution', 'NOUN', 0.00020825345528458633), ('JavaScript', 'NOUN', 0.0017388528867597305), ('denial', 'NOUN', 0.0003529206816476542), ('service', 'NOUN', 3.055839532947374e-05)]
[('Primetek', 'NOUN', 0.477077699287432), ('Primefaces', 'NOUN', 0.6055342355151037), ('encryption', 'NOUN', 0.002838642565337057), ('flaw', 'NOUN', 0.005527272868605732), ('code', 'NOUN', 0.00010288303069911603), ('execution', 'NOUN', 1.0586542047391059e-05)]
[('Plexus-utils', 'NOUN', 0.477077699287432), ('injection', 'NOUN', 0.0009319382411133231), ('contents', 'NOUN', 0.0007986791374652502), ('strings', 'NOUN', 0.006487293148395452)]
[('AndroidSVG', 'NOUN', 0.477077699287432), ('version', 'NOUN', 0.0032042154907906332), ('attacks', 'NOUN', 0.002171

[('vulnerability', 'NOUN', 0.0001611956129289766), ('Swagger-Parser', 'NOUN', 0.8058733255199692), ('version', 'NOUN', 5.38884347438187e-05), ('<', 'NOUN', 0.0004101798561982246), ('=', 'NOUN', 0.00013676401729604005), ('yaml', 'NOUN', 5.38884347438187e-05), ('functionality', 'NOUN', 0.006487293148395452), ('results', 'NOUN', 0.005527272868605732), ('code', 'NOUN', 0.00020825345528458633), ('specification', 'NOUN', 0.008467853278504385), ("'generate", 'NOUN', 0.0007986791374652502), ("'validate", 'NOUN', 0.0003529206816476542), ('command', 'NOUN', 0.0022795645161635433), ('swagger-codegen', 'NOUN', 0.8058733255199692), ('=', 'NOUN', 7.00532142934312e-05), ('code', 'NOUN', 6.942745743346319e-05), ('commands', 'NOUN', 0.018155173869535544), ('yaml', 'NOUN', 0.008467853278504385), ('specification', 'NOUN', 0.01640051832586987)]
[('Java', 'NOUN', 0.00015407375343029318), ('WebSocket', 'NOUN', 0.005527272868605732), ('client', 'NOUN', 0.01640051832586987), ('nv-websocket-client', 'NOUN', 0.

[('vulnerability', 'NOUN', 0.0001611956129289766), ('Swagger-Parser', 'NOUN', 0.8058733255199692), ('version', 'NOUN', 0.003577512713378401), ('<', 'NOUN', 0.0004101798561982246), ('Swagger', 'NOUN', 0.1442714202146935), ('version', 'NOUN', 0.006487293148395452), ('<', 'NOUN', 0.0004101798561982246), ('yaml', 'NOUN', 0.023221909886098614), ('functionality', 'NOUN', 0.006487293148395452), ('results', 'NOUN', 0.005527272868605732), ('code', 'NOUN', 0.00020825345528458633), ('specification', 'NOUN', 0.008467853278504385), ("'generate", 'NOUN', 0.0007986791374652502), ("'validate", 'NOUN', 0.0003529206816476542), ('command', 'NOUN', 0.0022795645161635433), ('swagger-codegen', 'NOUN', 0.8058733255199692), ('=', 'NOUN', 7.00532142934312e-05), ('code', 'NOUN', 6.942745743346319e-05), ('commands', 'NOUN', 0.018155173869535544), ('yaml', 'NOUN', 0.008467853278504385), ('specification', 'NOUN', 0.01640051832586987)]
[('vulnerability', 'NOUN', 0.0001611956129289766), ('Swagger-Parser', 'NOUN', 0.

[('MODX', 'NOUN', 0.7059625718398442), ('Revolution', 'NOUN', 0.01640051832586987), ('name', 'NOUN', 0.00019756253716495746), ('parameters', 'NOUN', 0.0002082393694280276), ('System', 'NOUN', 4.036794287376816e-05), ('Settings', 'NOUN', 0.01640051832586987), ('module', 'NOUN', 0.01640051832586987), ('XSS', 'NOUN', 5.401858523246492e-07), ('A', 'NOUN', 1.4477591139077116e-06), ('payload', 'NOUN', 0.0003414901521040833), ('user', 'NOUN', 0.052557153103252006), ('module', 'NOUN', 0.0020503296217463177)]
[('Directory', 'NOUN', 0.0019068177225202729), ('traversal', 'NOUN', 0.0010703582692949871), ('setup/processors/url_search.php', 'NOUN', 0.025416058964788525), ('search', 'NOUN', 0.0007986791374652502), ('page', 'NOUN', 0.01640051832586987), ('processor', 'NOUN', 0.008467853278504385), ('MODX', 'NOUN', 0.8058733255199692), ('Revolution', 'NOUN', 0.01640051832586987), ('attackers', 'NOUN', 2.057829986610861e-05), ('system', 'NOUN', 0.006487293148395452), ('information', 'NOUN', 0.0028386425

[('vulnerability', 'NOUN', 0.0001611956129289766), ('Swagger-Parser', 'NOUN', 0.8058733255199692), ('version', 'NOUN', 0.003577512713378401), ('<', 'NOUN', 0.0004101798561982246), ('Swagger', 'NOUN', 0.1442714202146935), ('version', 'NOUN', 0.006487293148395452), ('<', 'NOUN', 0.0004101798561982246), ('yaml', 'NOUN', 0.023221909886098614), ('functionality', 'NOUN', 0.006487293148395452), ('results', 'NOUN', 0.005527272868605732), ('code', 'NOUN', 0.00020825345528458633), ('specification', 'NOUN', 0.008467853278504385), ("'generate", 'NOUN', 0.0007986791374652502), ("'validate", 'NOUN', 0.0003529206816476542), ('command', 'NOUN', 0.0022795645161635433), ('swagger-codegen', 'NOUN', 0.8058733255199692), ('=', 'NOUN', 7.00532142934312e-05), ('code', 'NOUN', 6.942745743346319e-05), ('commands', 'NOUN', 0.018155173869535544), ('yaml', 'NOUN', 0.008467853278504385), ('specification', 'NOUN', 0.01640051832586987)]
[('vulnerability', 'NOUN', 0.0001611956129289766), ('Swagger-Parser', 'NOUN', 0.

[('MODX', 'NOUN', 0.7059625718398442), ('Revolution', 'NOUN', 0.01640051832586987), ('attacker', 'NOUN', 3.680862316294119e-05), ('XSS', 'NOUN', 5.35622486698513e-05), ('payload', 'NOUN', 0.0008355721445381926), ('HTTP', 'NOUN', 0.000266368207641968), ('Host', 'NOUN', 0.0023763367503435006), ('header', 'NOUN', 0.005527272868605732), ('request', 'NOUN', 0.00035827341239266515), ('conjunction', 'NOUN', 0.025416058964788525), ('issues', 'NOUN', 0.000775776810965966), ('Cache', 'NOUN', 0.002820240375052719), ('Poisoning', 'NOUN', 0.003323712146752152)]
[('SimpleXML', 'NOUN', 0.477077699287432), ('version', 'NOUN', 0.008467853278504385), ('XXE', 'NOUN', 9.964766756279263e-06), ('vulnerability', 'NOUN', 0.005527272868605732), ('SSRF', 'NOUN', 0.006487293148395452), ('information', 'NOUN', 3.687166175823559e-05), ('disclosure', 'NOUN', 0.005527272868605732), ('DoS', 'NOUN', 9.074475904096685e-07)]
[('vulnerability', 'NOUN', 0.0001611956129289766), ('Swagger-Parser', 'NOUN', 0.8058733255199692

[('Heimdal', 'NOUN', 0.477077699287432), ('allows', 'NOUN', 0.007862355908973289), ('attackers', 'NOUN', 4.627656062412226e-06), ('services', 'NOUN', 0.0002611179164674232), ('Orpheus', 'NOUN', 0.012370025279008545), ('Lyre', 'NOUN', 0.0022795645161635433), ('attacks', 'NOUN', 0.01640051832586987), ('names', 'NOUN', 0.008467853278504385), ('way', 'NOUN', 6.236572855039711e-07), ('Kerberos', 'NOUN', 0.0007986791374652502), ('protocol', 'NOUN', 0.023221909886098614), ('specification', 'NOUN', 0.01640051832586987), ('_krb5_extract_ticket', 'NOUN', 0.025416058964788525), ('KDC-REP', 'NOUN', 0.0007986791374652502), ('service', 'NOUN', 0.01640051832586987), ('name', 'NOUN', 0.00029244042278248696), ('version', 'NOUN', 0.008467853278504385), ("'enc_part", 'NOUN', 0.025416058964788525), ('version', 'NOUN', 0.002838642565337057), ("'ticket", 'NOUN', 0.025416058964788525), ('Use', 'NOUN', 1.4477591139077116e-06), ('version', 'NOUN', 0.002838642565337057), ('opportunity', 'NOUN', 0.00040474587340

[('Odoo', 'NOUN', 0.7059625718398442), ('Odoo', 'NOUN', 0.017303814748255222), ('Community', 'NOUN', 0.01640051832586987), ('Edition', 'NOUN', 0.005527272868605732), ('Odoo', 'NOUN', 0.008795367536842182), ('Enterprise', 'NOUN', 0.01640051832586987), ('Edition', 'NOUN', 0.01640051832586987), ('attackers', 'NOUN', 3.429669593171885e-05), ('authentication', 'NOUN', 0.0013042273538202028), ('circumstances', 'NOUN', 0.0017051216315391784), ('parameters', 'NOUN', 0.0233528884234383), ('characters', 'NOUN', 0.023221909886098614), ('database', 'NOUN', 0.0007986791374652502), ('layer', 'NOUN', 0.01640051832586987), ('Psycopg', 'NOUN', 0.0233528884234383)]
[('Odoo', 'NOUN', 0.7059625718398442), ('Odoo', 'NOUN', 0.017303814748255222), ('Community', 'NOUN', 0.01640051832586987), ('Edition', 'NOUN', 0.005527272868605732), ('Odoo', 'NOUN', 0.008795367536842182), ('Enterprise', 'NOUN', 0.01640051832586987), ('Edition', 'NOUN', 0.01640051832586987), ('access', 'NOUN', 0.000775776810965966), ('control

[('Primetek', 'NOUN', 0.477077699287432), ('Primefaces', 'NOUN', 0.6055342355151037), ('encryption', 'NOUN', 0.002838642565337057), ('flaw', 'NOUN', 0.005527272868605732), ('code', 'NOUN', 0.00010288303069911603), ('execution', 'NOUN', 1.0586542047391059e-05)]
[('Plexus-utils', 'NOUN', 0.477077699287432), ('injection', 'NOUN', 0.0009319382411133231), ('contents', 'NOUN', 0.0007986791374652502), ('strings', 'NOUN', 0.006487293148395452)]
[('AndroidSVG', 'NOUN', 0.477077699287432), ('version', 'NOUN', 0.0032042154907906332), ('attacks', 'NOUN', 0.002171823888285149), ('SVG', 'NOUN', 9.934978699580556e-07), ('component', 'NOUN', 0.006487293148395452), ('denial', 'NOUN', 0.025416058964788525), ('service', 'NOUN', 3.055839532947374e-05), ('code', 'NOUN', 6.0515416187073026e-06), ('execution', 'NOUN', 1.0586542047391059e-05)]
[('Awstats', 'NOUN', 0.477077699287432), ('version', 'NOUN', 0.0032042154907906332), ('path', 'NOUN', 4.401232121007091e-05), ('traversal', 'NOUN', 0.003323712146752152

[('Directory', 'NOUN', 0.0019068177225202729), ('traversal', 'NOUN', 0.0010703582692949871), ('vulnerability', 'NOUN', 0.01640051832586987), ('WebCalendar', 'NOUN', 0.8058733255199692), ('allows', 'NOUN', 5.971790339552361e-05), ('attackers', 'NOUN', 0.0005932526533755057), ('files', 'NOUN', 0.00020825345528458633), ('vectors', 'NOUN', 9.815324070388623e-05)]
[('Heimdal', 'NOUN', 0.477077699287432), ('allows', 'NOUN', 0.007862355908973289), ('attackers', 'NOUN', 4.627656062412226e-06), ('services', 'NOUN', 0.0002611179164674232), ('Orpheus', 'NOUN', 0.012370025279008545), ('Lyre', 'NOUN', 0.0022795645161635433), ('attacks', 'NOUN', 0.01640051832586987), ('names', 'NOUN', 0.008467853278504385), ('way', 'NOUN', 6.236572855039711e-07), ('Kerberos', 'NOUN', 0.0007986791374652502), ('protocol', 'NOUN', 0.023221909886098614), ('specification', 'NOUN', 0.01640051832586987), ('_krb5_extract_ticket', 'NOUN', 0.025416058964788525), ('KDC-REP', 'NOUN', 0.0007986791374652502), ('service', 'NOUN', 

[('Directory', 'NOUN', 0.0019068177225202729), ('traversal', 'NOUN', 0.0010703582692949871), ('vulnerability', 'NOUN', 0.01640051832586987), ('tools.file_open', 'NOUN', 0.008618044133501626), ('Odoo', 'NOUN', 0.9256717054072853), ('allows', 'NOUN', 0.023221909886098614), ('users', 'NOUN', 0.00011870686920720563), ('files', 'NOUN', 0.0012185376667949816), ('Odoo', 'NOUN', 0.028524733700936358), ('service', 'NOUN', 0.01640051832586987)]
[('vulnerability', 'NOUN', 1.0447335525722652e-05), ('WebCalendar', 'NOUN', 0.8058733255199692), ('allows', 'NOUN', 5.971790339552361e-05), ('attacker', 'NOUN', 8.097539424714391e-05), ('script', 'NOUN', 0.0003414901521040833), ('HTML', 'NOUN', 0.00014811862551760295), ('vectors', 'NOUN', 9.815324070388623e-05)]
[('Directory', 'NOUN', 0.0019068177225202729), ('traversal', 'NOUN', 0.0010703582692949871), ('vulnerability', 'NOUN', 0.01640051832586987), ('WebCalendar', 'NOUN', 0.8058733255199692), ('allows', 'NOUN', 5.971790339552361e-05), ('attackers', 'NOU

[('Odoo', 'NOUN', 0.7059625718398442), ('Odoo', 'NOUN', 0.017303814748255222), ('Community', 'NOUN', 0.01640051832586987), ('Edition', 'NOUN', 0.005527272868605732), ('Odoo', 'NOUN', 0.008795367536842182), ('Enterprise', 'NOUN', 0.01640051832586987), ('Edition', 'NOUN', 0.01640051832586987), ('insecure', 'NOUN', 3.687166175823559e-05), ('handling', 'NOUN', 0.01640051832586987), ('anonymization', 'NOUN', 0.0017388528867597305), ('data', 'NOUN', 0.01640051832586987), ('Database', 'NOUN', 4.036794287376816e-05), ('Anonymization', 'NOUN', 0.01640051832586987), ('module', 'NOUN', 0.01640051832586987), ('users', 'NOUN', 0.008467853278504385), ('Python', 'NOUN', 5.951302690927025e-06), ('code', 'NOUN', 0.01640051832586987), ('unpickle', 'NOUN', 0.007907403278805484)]
[('Odoo', 'NOUN', 0.7059625718398442), ('Odoo', 'NOUN', 0.017303814748255222), ('Community', 'NOUN', 0.01640051832586987), ('Edition', 'NOUN', 0.005527272868605732), ('Odoo', 'NOUN', 0.008795367536842182), ('Enterprise', 'NOUN', 

[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('PHP', 'NOUN', 0.024926886724335393), ('attacker', 'NOUN', 4.0374914321279336e-05), ('files', 'NOUN', 6.527928349359658e-06), ('web', 'NOUN', 7.192365902955619e-06), ('server', 'NOUN', 0.0036445772245455436), ('validation', 'NOUN', 1.2849818875373207e-05), ('action', 'NOUN', 2.5049785252051253e-05), ('parameter', 'NOUN', 0.01796104446833434), ('directory', 'NOUN', 0.0007951804566438524), ('traversal', 'NOUN', 0.006059572321393881)]
Sentence:  In MODX Revolution before 2.5.7, when PHP 5.3.3 is used, an attacker is able to include and execute arbitrary files on the web server due to insufficient validation of the action parameter to setup/index.php, aka directory traversal.
Expected: `revolution` got: `{'php', 'modx'}`
[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('attacker', 'NOUN', 4.0374914321279336e-05), ('XSS', 'NOUN', 0.00017623427004004812), ('payloads', 

[('Directory', 'NOUN', 0.0003361392097251651), ('traversal', 'NOUN', 0.0011739449198078355), ('vulnerability', 'NOUN', 0.01796104446833434), ('tools.file_open', 'NOUN', 0.009445170742379522), ('Odoo', 'NOUN', 0.6867115226114922), ('allows', 'NOUN', 0.025414727639005184), ('users', 'NOUN', 0.00013020703385221752), ('files', 'NOUN', 0.001336445570549549), ('Odoo', 'NOUN', 0.005141363583130368), ('service', 'NOUN', 0.01796104446833434)]
[('vulnerability', 'NOUN', 1.1459579790514208e-05), ('WebCalendar', 'NOUN', 0.42218200657197497), ('allows', 'NOUN', 6.550366935607659e-05), ('attacker', 'NOUN', 8.882050816232635e-05), ('script', 'NOUN', 0.0003745651816579684), ('HTML', 'NOUN', 0.00016246769926721755), ('vectors', 'NOUN', 0.00010766241138012774)]
[('Directory', 'NOUN', 0.0003361392097251651), ('traversal', 'NOUN', 0.0011739449198078355), ('vulnerability', 'NOUN', 0.01796104446833434), ('WebCalendar', 'NOUN', 0.42218200657197497), ('allows', 'NOUN', 6.550366935607659e-05), ('attackers', 'N

[('Directory', 'NOUN', 0.0003361392097251651), ('traversal', 'NOUN', 0.0011739449198078355), ('vulnerability', 'NOUN', 0.01796104446833434), ('tools.file_open', 'NOUN', 0.009445170742379522), ('Odoo', 'NOUN', 0.6867115226114922), ('allows', 'NOUN', 0.025414727639005184), ('users', 'NOUN', 0.00013020703385221752), ('files', 'NOUN', 0.001336445570549549), ('Odoo', 'NOUN', 0.005141363583130368), ('service', 'NOUN', 0.01796104446833434)]
[('vulnerability', 'NOUN', 1.1459579790514208e-05), ('WebCalendar', 'NOUN', 0.42218200657197497), ('allows', 'NOUN', 6.550366935607659e-05), ('attacker', 'NOUN', 8.882050816232635e-05), ('script', 'NOUN', 0.0003745651816579684), ('HTML', 'NOUN', 0.00016246769926721755), ('vectors', 'NOUN', 0.00010766241138012774)]
[('Directory', 'NOUN', 0.0003361392097251651), ('traversal', 'NOUN', 0.0011739449198078355), ('vulnerability', 'NOUN', 0.01796104446833434), ('WebCalendar', 'NOUN', 0.42218200657197497), ('allows', 'NOUN', 6.550366935607659e-05), ('attackers', 'N

[('Plexus-utils', 'NOUN', 0.1383584472827663), ('injection', 'NOUN', 0.0010221426477257005), ('contents', 'NOUN', 0.0008759964040804655), ('strings', 'NOUN', 0.007111385432072678)]
[('AndroidSVG', 'NOUN', 0.1383584472827663), ('version', 'NOUN', 0.003513585248318367), ('attacks', 'NOUN', 0.0023817535222247217), ('SVG', 'NOUN', 1.0897590599170692e-06), ('component', 'NOUN', 0.007111385432072678), ('denial', 'NOUN', 0.027810168644691644), ('service', 'NOUN', 3.351913857074105e-05), ('code', 'NOUN', 6.637879428757772e-06), ('execution', 'NOUN', 1.1612273901101247e-05)]
[('Awstats', 'NOUN', 0.1383584472827663), ('version', 'NOUN', 0.003513585248318367), ('path', 'NOUN', 4.827652612209872e-05), ('traversal', 'NOUN', 0.0036445772245455436), ('flaw', 'NOUN', 0.01796104446833434), ('handling', 'NOUN', 4.427907193267174e-05), ('config', 'NOUN', 1.4449621094946244e-05), ('migrate', 'NOUN', 0.0002167004790668032), ('parameters', 'NOUN', 0.00022841134297055227), ('code', 'NOUN', 2.2572112976414933

[('SimpleXML', 'NOUN', 0.1383584472827663), ('version', 'NOUN', 0.0092807000906801), ('XXE', 'NOUN', 1.0930255355112198e-05), ('vulnerability', 'NOUN', 0.006059572321393881), ('SSRF', 'NOUN', 0.007111385432072678), ('information', 'NOUN', 4.044406031405286e-05), ('disclosure', 'NOUN', 0.006059572321393881), ('DoS', 'NOUN', 1.5971619986685366e-07)]
[('vulnerability', 'NOUN', 0.00017681130304014192), ('Swagger-Parser', 'NOUN', 0.42218200657197497), ('version', 'NOUN', 0.003922782798267004), ('<', 'NOUN', 0.0004499048330045003), ('Swagger', 'NOUN', 0.028818525195740345), ('version', 'NOUN', 0.007111385432072678), ('<', 'NOUN', 0.0004499048330045003), ('yaml', 'NOUN', 0.025414727639005184), ('functionality', 'NOUN', 0.007111385432072678), ('results', 'NOUN', 0.006059572321393881), ('code', 'NOUN', 0.00022842679300050383), ('specification', 'NOUN', 0.0092807000906801), ("'generate", 'NOUN', 0.0008759964040804655), ("'validate", 'NOUN', 0.0003871023864606765), ('command', 'NOUN', 0.002499882

[('AndroidSVG', 'NOUN', 0.1383584472827663), ('version', 'NOUN', 0.003513585248318367), ('attacks', 'NOUN', 0.0023817535222247217), ('SVG', 'NOUN', 1.0897590599170692e-06), ('component', 'NOUN', 0.007111385432072678), ('denial', 'NOUN', 0.027810168644691644), ('service', 'NOUN', 3.351913857074105e-05), ('code', 'NOUN', 6.637879428757772e-06), ('execution', 'NOUN', 1.1612273901101247e-05)]
[('Awstats', 'NOUN', 0.1383584472827663), ('version', 'NOUN', 0.003513585248318367), ('path', 'NOUN', 4.827652612209872e-05), ('traversal', 'NOUN', 0.0036445772245455436), ('flaw', 'NOUN', 0.01796104446833434), ('handling', 'NOUN', 4.427907193267174e-05), ('config', 'NOUN', 1.4449621094946244e-05), ('migrate', 'NOUN', 0.0002167004790668032), ('parameters', 'NOUN', 0.00022841134297055227), ('code', 'NOUN', 2.2572112976414933e-05), ('execution', 'NOUN', 1.1612273901101247e-05)]
[('Odoo', 'NOUN', 0.2970504960048405), ('Odoo', 'NOUN', 0.0030896251699885054), ('Community', 'NOUN', 0.0029261306234129466), (

[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('attacker', 'NOUN', 4.0374914321279336e-05), ('XSS', 'NOUN', 5.875165938388624e-05), ('payload', 'NOUN', 0.0009164576152997895), ('HTTP', 'NOUN', 0.0002921694276054659), ('Host', 'NOUN', 0.0004190697343014633), ('header', 'NOUN', 0.006059572321393881), ('request', 'NOUN', 0.0003929733456514727), ('conjunction', 'NOUN', 0.027810168644691644), ('issues', 'NOUN', 0.0008508788733684004), ('Cache', 'NOUN', 0.0004975349943239279), ('Poisoning', 'NOUN', 0.0005865992976173321)]
[('SimpleXML', 'NOUN', 0.1383584472827663), ('version', 'NOUN', 0.0092807000906801), ('XXE', 'NOUN', 1.0930255355112198e-05), ('vulnerability', 'NOUN', 0.006059572321393881), ('SSRF', 'NOUN', 0.007111385432072678), ('information', 'NOUN', 4.044406031405286e-05), ('disclosure', 'NOUN', 0.006059572321393881), ('DoS', 'NOUN', 1.5971619986685366e-07)]
[('vulnerability', 'NOUN', 0.00017681130304014192), ('Swagger-Parser', 'NOUN', 0.4221820

[('vulnerability', 'NOUN', 0.00017681130304014192), ('Swagger-Parser', 'NOUN', 0.42218200657197497), ('version', 'NOUN', 5.9109446323917724e-05), ('<', 'NOUN', 0.0004499048330045003), ('=', 'NOUN', 0.00015001327213226494), ('yaml', 'NOUN', 5.9109446323917724e-05), ('functionality', 'NOUN', 0.007111385432072678), ('results', 'NOUN', 0.006059572321393881), ('code', 'NOUN', 0.00022842679300050383), ('specification', 'NOUN', 0.0092807000906801), ("'generate", 'NOUN', 0.0008759964040804655), ("'validate", 'NOUN', 0.0003871023864606765), ('command', 'NOUN', 0.0024998823244899923), ('swagger-codegen', 'NOUN', 0.8199333123997553), ('=', 'NOUN', 7.684023936459937e-05), ('code', 'NOUN', 7.615386138838881e-05), ('commands', 'NOUN', 0.019879282955218562), ('yaml', 'NOUN', 0.0092807000906801), ('specification', 'NOUN', 0.01796104446833434)]
[('Java', 'NOUN', 2.712132446090393e-05), ('WebSocket', 'NOUN', 0.0009772833135586845), ('client', 'NOUN', 0.01796104446833434), ('nv-websocket-client', 'NOUN',

[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('user', 'NOUN', 0.00010191164410204333), ('file', 'NOUN', 0.013552329928513204), ('permissions', 'NOUN', 0.0092807000906801), ('code', 'NOUN', 6.527928349359658e-06), ('file', 'NOUN', 0.0009164576152997895), ('name', 'NOUN', 0.0008759964040804655), ('.htaccess', 'NOUN', 0.0008701727959038898)]
[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('user', 'NOUN', 0.00010191164410204333), ('resource', 'NOUN', 0.013552329928513204), ('edit', 'NOUN', 0.01796104446833434), ('permissions', 'NOUN', 0.01796104446833434), ('payload', 'NOUN', 4.569370873397793e-05), ('title', 'NOUN', 0.0002921694276054659), ('post', 'NOUN', 0.00671542622378454), ('parameter', 'NOUN', 0.0092807000906801)]
[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('attacker', 'NOUN', 4.0374914321279336e-05), ('XSS', 'NOUN', 5.875165938388624e-05), ('payload', 'NOUN', 0

[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('attacker', 'NOUN', 4.0374914321279336e-05), ('XSS', 'NOUN', 5.875165938388624e-05), ('payload', 'NOUN', 0.0009164576152997895), ('HTTP', 'NOUN', 0.0002921694276054659), ('Host', 'NOUN', 0.0004190697343014633), ('header', 'NOUN', 0.006059572321393881), ('request', 'NOUN', 0.0003929733456514727), ('conjunction', 'NOUN', 0.027810168644691644), ('issues', 'NOUN', 0.0008508788733684004), ('Cache', 'NOUN', 0.0004975349943239279), ('Poisoning', 'NOUN', 0.0005865992976173321)]
[('SimpleXML', 'NOUN', 0.1383584472827663), ('version', 'NOUN', 0.0092807000906801), ('XXE', 'NOUN', 1.0930255355112198e-05), ('vulnerability', 'NOUN', 0.006059572321393881), ('SSRF', 'NOUN', 0.007111385432072678), ('information', 'NOUN', 4.044406031405286e-05), ('disclosure', 'NOUN', 0.006059572321393881), ('DoS', 'NOUN', 1.5971619986685366e-07)]
[('vulnerability', 'NOUN', 0.00017681130304014192), ('Swagger-Parser', 'NOUN', 0.4221820

[('MODX', 'NOUN', 0.7247875512600701), ('Revolution', 'NOUN', 0.0029261306234129466), ('name', 'NOUN', 0.0002167004790668032), ('parameters', 'NOUN', 0.00022841134297055227), ('System', 'NOUN', 7.105230259507718e-06), ('Settings', 'NOUN', 0.0029261306234129466), ('module', 'NOUN', 0.01796104446833434), ('XSS', 'NOUN', 5.925251279002041e-07), ('A', 'NOUN', 1.5880341460458938e-06), ('payload', 'NOUN', 0.0003745651816579684), ('user', 'NOUN', 0.05735740086299299), ('module', 'NOUN', 0.0022485420199589157)]
[('Directory', 'NOUN', 0.0003361392097251651), ('traversal', 'NOUN', 0.0011739449198078355), ('setup/processors/url_search.php', 'NOUN', 0.027810168644691644), ('search', 'NOUN', 0.0008759964040804655), ('page', 'NOUN', 0.01796104446833434), ('processor', 'NOUN', 0.0092807000906801), ('MODX', 'NOUN', 0.8199333123997553), ('Revolution', 'NOUN', 0.0029261306234129466), ('attackers', 'NOUN', 2.2572112976414933e-05), ('system', 'NOUN', 0.007111385432072678), ('information', 'NOUN', 0.003112

[('MODX', 'NOUN', 0.10893259565910848), ('Revolution', 'NOUN', 0.29509590080563497), ('version', 'NOUN', 0.08631345447731184), ('SQL', 'NOUN', 2.7947344539079148e-06), ('injection', 'NOUN', 0.0031675037655813526), ('sanitization', 'NOUN', 8.286675654510596e-05), ('escape', 'NOUN', 0.0006710062988714714), ('method', 'NOUN', 0.0031675037655813526), ('user', 'NOUN', 0.0013437509464937422), ('database', 'NOUN', 0.0024103948844549057), ('privileges', 'NOUN', 5.9706926656440816e-06)]
[('MODX', 'NOUN', 0.29509590080563497), ('Revolution', 'NOUN', 0.011188005003110553), ('name', 'NOUN', 0.003086241203467722), ('parameters', 'NOUN', 9.098222056070754e-06), ('System', 'NOUN', 2.365572858168102e-05), ('Settings', 'NOUN', 0.00021353209394324373), ('module', 'NOUN', 0.0030311914293807206), ('XSS', 'NOUN', 1.1903736995250876e-06), ('A', 'NOUN', 1.5418651528912578e-06), ('payload', 'NOUN', 0.0007341403113058306), ('user', 'NOUN', 0.15829801643721242), ('module', 'NOUN', 5.496598500049622e-05)]
Senten

[('vulnerability', 'NOUN', 9.202225882533639e-05), ('WebCalendar', 'NOUN', 0.9620868269327733), ('allows', 'NOUN', 2.3716432372455865e-06), ('attacker', 'NOUN', 0.00043764088540084313), ('script', 'NOUN', 0.0005625610012830959), ('HTML', 'NOUN', 0.00010276464127368764), ('vectors', 'NOUN', 2.7004228281147203e-06)]
[('Directory', 'NOUN', 0.0002559306320436742), ('traversal', 'NOUN', 0.00019398425854110147), ('vulnerability', 'NOUN', 0.0043084894330294495), ('WebCalendar', 'NOUN', 0.9620868269327733), ('allows', 'NOUN', 0.00016594527346164913), ('attackers', 'NOUN', 0.0016617403939050911), ('files', 'NOUN', 0.0010052329214746269), ('vectors', 'NOUN', 2.7004228281147203e-06)]
[('Heimdal', 'NOUN', 0.10893259565910848), ('allows', 'NOUN', 0.00032649881474860896), ('attackers', 'NOUN', 1.9582044434595214e-05), ('services', 'NOUN', 0.0004599419945216231), ('Orpheus', 'NOUN', 0.0692428135776676), ('Lyre', 'NOUN', 0.00017525035163416434), ('attacks', 'NOUN', 0.29706994038664447), ('names', 'NOU

Sentence:  Directory traversal in setup/processors/url_search.php (aka the search page of an unused processor) in MODX Revolution 2.5.7 might allow remote attackers to obtain system directory information.
Expected: `revolution` got: `{'traversal', 'setup/processors/url_search.php', 'modx'}`
[('MODX', 'NOUN', 0.03221961403967802), ('Revolution', 'NOUN', 0.0034959080659452925), ('PHP', 'NOUN', 0.043522873055704306), ('attacker', 'NOUN', 0.001351185925049807), ('files', 'NOUN', 1.026043864780332e-05), ('web', 'NOUN', 0.0003462225138978494), ('server', 'NOUN', 0.0004228607934083374), ('validation', 'NOUN', 1.140059136428659e-06), ('action', 'NOUN', 6.977953111294995e-05), ('parameter', 'NOUN', 0.003120724226917038), ('directory', 'NOUN', 0.0009662463551024282), ('traversal', 'NOUN', 1.1229225321254348e-05)]
[('MODX', 'NOUN', 0.03221961403967802), ('Revolution', 'NOUN', 0.0034959080659452925), ('attacker', 'NOUN', 0.001351185925049807), ('XSS', 'NOUN', 6.972097880875238e-06), ('payloads', '

[('MODX', 'NOUN', 0.03221961403967802), ('Revolution', 'NOUN', 0.0034959080659452925), ('attacker', 'NOUN', 0.001351185925049807), ('XSS', 'NOUN', 6.972097880875238e-06), ('payloads', 'NOUN', 0.0022198963452342924), ('fields', 'NOUN', 0.0003589901172066429), ('setup', 'NOUN', 0.00882568299695354), ('page', 'NOUN', 0.0005679649607967249), ('database_type', 'NOUN', 0.0020506176929166774), ('parameter', 'NOUN', 1.1229225321254348e-05)]
[('MODX', 'NOUN', 0.03221961403967802), ('Revolution', 'NOUN', 0.0034959080659452925), ('user', 'NOUN', 0.002348010341705939), ('file', 'NOUN', 0.018044944937290055), ('permissions', 'NOUN', 0.0021886673054414456), ('code', 'NOUN', 9.168845343280683e-05), ('file', 'NOUN', 0.007257051131518352), ('name', 'NOUN', 0.026017799293180665), ('.htaccess', 'NOUN', 3.7431031285170803e-06)]
[('MODX', 'NOUN', 0.03221961403967802), ('Revolution', 'NOUN', 0.0034959080659452925), ('user', 'NOUN', 0.002348010341705939), ('resource', 'NOUN', 0.01730748196503853), ('edit', '

[('SimpleXML', 'NOUN', 0.10893259565910848), ('version', 'NOUN', 0.06380395442281121), ('XXE', 'NOUN', 2.5728896846122122e-05), ('vulnerability', 'NOUN', 0.002524250600682266), ('SSRF', 'NOUN', 0.0003140715500348153), ('information', 'NOUN', 0.0034127870147270655), ('disclosure', 'NOUN', 0.004909050515781275), ('DoS', 'NOUN', 2.521635888802398e-05)]
[('vulnerability', 'NOUN', 0.003509830408821605), ('Swagger-Parser', 'NOUN', 0.9221314780000263), ('version', 'NOUN', 0.010829512277664396), ('<', 'NOUN', 0.0014836005197826866), ('Swagger', 'NOUN', 0.30807545752533805), ('version', 'NOUN', 0.0028363419037733075), ('<', 'NOUN', 0.0014836005197826866), ('yaml', 'NOUN', 0.004875197773104956), ('functionality', 'NOUN', 0.0003800726812641851), ('results', 'NOUN', 0.00416688757025837), ('code', 'NOUN', 0.0016816538113062277), ('specification', 'NOUN', 0.0016816538113062277), ("'generate", 'NOUN', 0.674135328186933), ("'validate", 'NOUN', 0.0797545955495612), ('command', 'NOUN', 0.008777876370383

[('ELabftw', 'NOUN', 0.10893259565910848), ('version', 'NOUN', 0.005998898162173517), ('scripting', 'NOUN', 1.045966838230853e-05), ('experiment', 'NOUN', 0.00019747093211557485), ('infos', 'NOUN', 0.00178001846076269), ('component', 'NOUN', 0.0031675037655813526), ('execution', 'NOUN', 0.0031120466663989866), ('JavaScript', 'NOUN', 0.048980893988009436), ('denial', 'NOUN', 0.00023346441453603485), ('service', 'NOUN', 4.343134190798721e-06)]
[('Primetek', 'NOUN', 0.10893259565910848), ('Primefaces', 'NOUN', 0.25660968415576874), ('encryption', 'NOUN', 0.0012848115354704257), ('flaw', 'NOUN', 0.0031675037655813526), ('code', 'NOUN', 0.2336530901293256), ('execution', 'NOUN', 4.4456694568468124e-07)]
[('Plexus-utils', 'NOUN', 0.10893259565910848), ('injection', 'NOUN', 0.1834790775096097), ('contents', 'NOUN', 0.020071635535461613), ('strings', 'NOUN', 5.9706926656440816e-06)]
[('AndroidSVG', 'NOUN', 0.10893259565910848), ('version', 'NOUN', 0.005998898162173517), ('attacks', 'NOUN', 7.7

[('Java', 'NOUN', 7.788030434343275e-05), ('WebSocket', 'NOUN', 9.567688440360151e-05), ('client', 'NOUN', 0.00178001846076269), ('nv-websocket-client', 'NOUN', 0.45370947572615783), ('server', 'NOUN', 0.0029012048726302065), ('hostname', 'NOUN', 0.0002745293177159735), ('domain', 'NOUN', 0.0034882271692738276), ('name', 'NOUN', 1.4498285135213274e-05), ('subject', 'NOUN', 0.006917334217148869), ('Common', 'NOUN', 0.0013358315455840194), ('Name', 'NOUN', 0.0005905216790409834), ('CN', 'NOUN', 1.336283407221593e-06), ('field', 'NOUN', 3.078068429676454e-05), ('X.509', 'NOUN', 9.140489478883136e-05), ('certificate', 'NOUN', 4.712799415058792e-05), ('attackers', 'NOUN', 0.0022533705773505343), ('SSL/TLS', 'NOUN', 4.5537546679931956e-05), ('servers', 'NOUN', 0.0004975340099776192), ('certificate', 'NOUN', 8.101224730879775e-06)]
[('ELabftw', 'NOUN', 0.10893259565910848), ('version', 'NOUN', 0.005998898162173517), ('scripting', 'NOUN', 1.045966838230853e-05), ('experiment', 'NOUN', 0.000197

[('Odoo', 'NOUN', 0.814749833639779), ('Odoo', 'NOUN', 0.009467762409213517), ('Community', 'NOUN', 0.0010758182794864566), ('Edition', 'NOUN', 0.0071792168325470395), ('Odoo', 'NOUN', 0.007343524448237127), ('Enterprise', 'NOUN', 0.0010758182794864566), ('Edition', 'NOUN', 0.0071792168325470395), ('insecure', 'NOUN', 0.0004485406628192573), ('handling', 'NOUN', 0.0043084894330294495), ('anonymization', 'NOUN', 0.015519986288050179), ('data', 'NOUN', 1.4498285135213274e-05), ('Database', 'NOUN', 2.365572858168102e-05), ('Anonymization', 'NOUN', 0.00021353209394324373), ('module', 'NOUN', 0.00013732811292724715), ('users', 'NOUN', 0.0003225330423081569), ('Python', 'NOUN', 1.3472233818584324e-05), ('code', 'NOUN', 0.004337972589895182), ('unpickle', 'NOUN', 0.005686879111684807)]
[('Odoo', 'NOUN', 0.814749833639779), ('Odoo', 'NOUN', 0.009467762409213517), ('Community', 'NOUN', 0.0010758182794864566), ('Edition', 'NOUN', 0.0071792168325470395), ('Odoo', 'NOUN', 0.007343524448237127), ('

[('MODX', 'NOUN', 0.8683135290829723), ('Revolution', 'NOUN', 0.9952149400277436), ('version', 'NOUN', 0.9444030599552373), ('SQL', 'NOUN', 8.639233761843758e-06), ('injection', 'NOUN', 0.0006251595543015433), ('sanitization', 'NOUN', 2.2613849268823193e-05), ('escape', 'NOUN', 6.336555446547773e-06), ('method', 'NOUN', 0.0006251595543015433), ('user', 'NOUN', 4.5288222641374016e-05), ('database', 'NOUN', 0.0008938050525954044), ('privileges', 'NOUN', 2.210656276593673e-06)]
[('MODX', 'NOUN', 0.8850984604482687), ('Revolution', 'NOUN', 0.9444030599552373), ('name', 'NOUN', 2.490157471573966e-05), ('parameters', 'NOUN', 7.714160744863847e-08), ('System', 'NOUN', 2.8388825964896435e-06), ('Settings', 'NOUN', 0.00035092420514156754), ('module', 'NOUN', 0.0005981903921491989), ('XSS', 'NOUN', 2.0874205710364934e-08), ('A', 'NOUN', 3.082819199009126e-08), ('payload', 'NOUN', 0.00020043748637982355), ('user', 'NOUN', 0.007506698982012778), ('module', 'NOUN', 3.158086379228234e-07)]
[('Direct

[('vulnerability', 'NOUN', 0.03420475251835801), ('WebCalendar', 'NOUN', 0.998201087233925), ('allows', 'NOUN', 5.476050794934793e-07), ('attacker', 'NOUN', 9.26740805225302e-07), ('script', 'NOUN', 0.0001535731493125963), ('HTML', 'NOUN', 5.383924478049321e-06), ('vectors', 'NOUN', 7.622968948578074e-08)]
[('Directory', 'NOUN', 0.003148680163852031), ('traversal', 'NOUN', 0.014449944846556038), ('vulnerability', 'NOUN', 0.15044062517985407), ('WebCalendar', 'NOUN', 0.998201087233925), ('allows', 'NOUN', 3.8321070772421914e-05), ('attackers', 'NOUN', 5.60226073470477e-05), ('files', 'NOUN', 6.697067995498348e-06), ('vectors', 'NOUN', 7.622968948578074e-08)]
[('Heimdal', 'NOUN', 0.8864184851585389), ('allows', 'NOUN', 0.0038209813126459185), ('attackers', 'NOUN', 0.0005269250136893873), ('services', 'NOUN', 0.05573694199260519), ('Orpheus', 'NOUN', 0.07710781250235733), ('Lyre', 'NOUN', 0.003200482508608205), ('attacks', 'NOUN', 0.9542818288385793), ('names', 'NOUN', 0.00054437977349764

[('MODX', 'NOUN', 0.34105765041160346), ('Revolution', 'NOUN', 0.5865545346253866), ('user', 'NOUN', 1.4603084572856297e-05), ('resource', 'NOUN', 7.489207831797759e-06), ('edit', 'NOUN', 0.00035092420514156754), ('permissions', 'NOUN', 0.0004594475561741344), ('payload', 'NOUN', 2.390129967471317e-06), ('title', 'NOUN', 5.306590768922719e-06), ('post', 'NOUN', 2.96955369349606e-06), ('parameter', 'NOUN', 0.0006159036355421464)]
[('MODX', 'NOUN', 0.34105765041160346), ('Revolution', 'NOUN', 0.5865545346253866), ('attacker', 'NOUN', 4.838489696918067e-06), ('XSS', 'NOUN', 7.184215370984359e-06), ('payload', 'NOUN', 1.6799211130909758e-06), ('HTTP', 'NOUN', 2.8388825964896435e-06), ('Host', 'NOUN', 2.8154277667055393e-05), ('header', 'NOUN', 3.266412064985544e-05), ('request', 'NOUN', 3.7113059892891135e-07), ('conjunction', 'NOUN', 8.81606392618633e-05), ('issues', 'NOUN', 0.00012004075860506163), ('Cache', 'NOUN', 3.704330671309343e-05), ('Poisoning', 'NOUN', 2.210656276593673e-06)]
[(

[('Plexus-utils', 'NOUN', 0.8864184851585389), ('injection', 'NOUN', 0.011745856907681965), ('contents', 'NOUN', 1.4868820072335067e-05), ('strings', 'NOUN', 2.210656276593673e-06)]
[('AndroidSVG', 'NOUN', 0.8864184851585389), ('version', 'NOUN', 0.5714050164356131), ('attacks', 'NOUN', 9.514123267404741e-07), ('SVG', 'NOUN', 2.2130997615505878e-07), ('component', 'NOUN', 0.0006251595543015433), ('denial', 'NOUN', 1.59960398640227e-06), ('service', 'NOUN', 5.924499037559489e-06), ('code', 'NOUN', 0.0010260847463531367), ('execution', 'NOUN', 4.556047624705796e-06)]
[('Awstats', 'NOUN', 0.8864184851585389), ('version', 'NOUN', 0.9156552226751903), ('path', 'NOUN', 2.6961277422382364e-06), ('traversal', 'NOUN', 2.8154277667055393e-05), ('flaw', 'NOUN', 2.854231549119261e-06), ('handling', 'NOUN', 1.5161766069715953e-07), ('config', 'NOUN', 1.5801883799333377e-05), ('migrate', 'NOUN', 2.490157471573966e-05), ('parameters', 'NOUN', 1.6906487767200193e-05), ('code', 'NOUN', 7.80170553384601

[('SimpleXML', 'NOUN', 0.9083486844914204), ('version', 'NOUN', 0.9444030599552373), ('XXE', 'NOUN', 9.094008013883306e-08), ('vulnerability', 'NOUN', 0.00016603676432517346), ('SSRF', 'NOUN', 0.0009702433116337953), ('information', 'NOUN', 5.14014434502173e-06), ('disclosure', 'NOUN', 0.0003236237664078284), ('DoS', 'NOUN', 3.160233745121413e-07)]
[('vulnerability', 'NOUN', 0.0018605400055953754), ('Swagger-Parser', 'NOUN', 0.9309462393688022), ('version', 'NOUN', 0.030326219893654444), ('<', 'NOUN', 0.06713478147019014), ('Swagger', 'NOUN', 0.09562203017032266), ('version', 'NOUN', 0.1796033881918692), ('<', 'NOUN', 0.06713478147019014), ('yaml', 'NOUN', 0.19461691240126708), ('functionality', 'NOUN', 0.06391769411946426), ('results', 'NOUN', 0.10515763059902429), ('code', 'NOUN', 0.004775244577621033), ('specification', 'NOUN', 0.13737561892184097), ("'generate", 'NOUN', 0.1367054820086479), ("'validate", 'NOUN', 0.0031592905637227416), ('command', 'NOUN', 0.003767222836249654), ('s

[('AndroidSVG', 'NOUN', 0.8864184851585389), ('version', 'NOUN', 0.5714050164356131), ('attacks', 'NOUN', 9.514123267404741e-07), ('SVG', 'NOUN', 2.2130997615505878e-07), ('component', 'NOUN', 0.0006251595543015433), ('denial', 'NOUN', 1.59960398640227e-06), ('service', 'NOUN', 5.924499037559489e-06), ('code', 'NOUN', 0.0010260847463531367), ('execution', 'NOUN', 4.556047624705796e-06)]
[('Awstats', 'NOUN', 0.8864184851585389), ('version', 'NOUN', 0.9156552226751903), ('path', 'NOUN', 2.6961277422382364e-06), ('traversal', 'NOUN', 2.8154277667055393e-05), ('flaw', 'NOUN', 2.854231549119261e-06), ('handling', 'NOUN', 1.5161766069715953e-07), ('config', 'NOUN', 1.5801883799333377e-05), ('migrate', 'NOUN', 2.490157471573966e-05), ('parameters', 'NOUN', 1.6906487767200193e-05), ('code', 'NOUN', 7.801705533846014e-07), ('execution', 'NOUN', 3.62403456309922e-08)]
[('Odoo', 'NOUN', 0.9966825814086888), ('Odoo', 'NOUN', 0.024295547738791314), ('Community', 'NOUN', 0.3033919865071487), ('Editi

[('Java', 'NOUN', 2.215129492197324e-06), ('WebSocket', 'NOUN', 5.242209922260807e-05), ('client', 'NOUN', 0.00035092420514156754), ('nv-websocket-client', 'NOUN', 0.009526114535855678), ('server', 'NOUN', 6.336555446547773e-06), ('hostname', 'NOUN', 2.861914407639645e-05), ('domain', 'NOUN', 2.413285920556187e-06), ('name', 'NOUN', 2.854231549119261e-06), ('subject', 'NOUN', 0.0001001081878074313), ('Common', 'NOUN', 0.0001534472632015623), ('Name', 'NOUN', 0.0009702433116337953), ('CN', 'NOUN', 2.4475716220341628e-08), ('field', 'NOUN', 8.399549123135223e-06), ('X.509', 'NOUN', 1.9391242928082182e-05), ('certificate', 'NOUN', 9.278182331206033e-06), ('attackers', 'NOUN', 0.0006159036355421464), ('SSL/TLS', 'NOUN', 0.0001407555368534055), ('servers', 'NOUN', 9.798596069908157e-05), ('certificate', 'NOUN', 2.210656276593673e-06)]
[('ELabftw', 'NOUN', 0.8864184851585389), ('version', 'NOUN', 0.5714050164356131), ('scripting', 'NOUN', 9.514123267404741e-07), ('experiment', 'NOUN', 2.8388

[('vulnerability', 'NOUN', 0.0018605400055953754), ('Swagger-Parser', 'NOUN', 0.9309462393688022), ('version', 'NOUN', 0.030326219893654444), ('<', 'NOUN', 0.06713478147019014), ('Swagger', 'NOUN', 0.09562203017032266), ('version', 'NOUN', 0.1796033881918692), ('<', 'NOUN', 0.06713478147019014), ('yaml', 'NOUN', 0.19461691240126708), ('functionality', 'NOUN', 0.06391769411946426), ('results', 'NOUN', 0.10515763059902429), ('code', 'NOUN', 0.004775244577621033), ('specification', 'NOUN', 0.13737561892184097), ("'generate", 'NOUN', 0.1367054820086479), ("'validate", 'NOUN', 0.0031592905637227416), ('command', 'NOUN', 0.003767222836249654), ('swagger-codegen', 'NOUN', 0.9574839300441667), ('=', 'NOUN', 0.12230447475664998), ('code', 'NOUN', 1.1211063219465053e-05), ('commands', 'NOUN', 0.00015319610911000917), ('yaml', 'NOUN', 0.00107314281744077), ('specification', 'NOUN', 2.210656276593673e-06)]
[('vulnerability', 'NOUN', 0.0020789720050082617), ('Swagger-Parser', 'NOUN', 0.985758423623

[('Java', 'NOUN', 2.215129492197324e-06), ('WebSocket', 'NOUN', 5.242209922260807e-05), ('client', 'NOUN', 0.00035092420514156754), ('nv-websocket-client', 'NOUN', 0.009526114535855678), ('server', 'NOUN', 6.336555446547773e-06), ('hostname', 'NOUN', 2.861914407639645e-05), ('domain', 'NOUN', 2.413285920556187e-06), ('name', 'NOUN', 2.854231549119261e-06), ('subject', 'NOUN', 0.0001001081878074313), ('Common', 'NOUN', 0.0001534472632015623), ('Name', 'NOUN', 0.0009702433116337953), ('CN', 'NOUN', 2.4475716220341628e-08), ('field', 'NOUN', 8.399549123135223e-06), ('X.509', 'NOUN', 1.9391242928082182e-05), ('certificate', 'NOUN', 9.278182331206033e-06), ('attackers', 'NOUN', 0.0006159036355421464), ('SSL/TLS', 'NOUN', 0.0001407555368534055), ('servers', 'NOUN', 9.798596069908157e-05), ('certificate', 'NOUN', 2.210656276593673e-06)]
[('ELabftw', 'NOUN', 0.8864184851585389), ('version', 'NOUN', 0.5714050164356131), ('scripting', 'NOUN', 9.514123267404741e-07), ('experiment', 'NOUN', 2.8388

[('Odoo', 'NOUN', 0.9966825814086888), ('Odoo', 'NOUN', 0.024295547738791314), ('Community', 'NOUN', 0.3033919865071487), ('Edition', 'NOUN', 0.7834888393006215), ('Odoo', 'NOUN', 0.12733293682804228), ('Enterprise', 'NOUN', 0.3033919865071487), ('Edition', 'NOUN', 0.9156552226751903), ('access', 'NOUN', 1.5641461092381227e-05), ('control', 'NOUN', 9.464204646333983e-05), ('OAuth', 'NOUN', 2.8154277667055393e-05), ('tokens', 'NOUN', 2.854231549119261e-06), ('OAuth', 'NOUN', 6.336555446547773e-06), ('module', 'NOUN', 2.7038018647034964e-05), ('users', 'NOUN', 5.60226073470477e-05), ('OAuth', 'NOUN', 0.0001407555368534055), ('sessions', 'NOUN', 0.0008230668829909114), ('users', 'NOUN', 2.009691563028131e-07)]
[('Directory', 'NOUN', 0.003620451670542673), ('traversal', 'NOUN', 0.0022861083924746012), ('vulnerability', 'NOUN', 0.03893160445107227), ('tools.file_open', 'NOUN', 0.018601042156985582), ('Odoo', 'NOUN', 0.9988495720146102), ('allows', 'NOUN', 9.192329838266498e-05), ('users', '

[('vulnerability', 'NOUN', 0.0020789720050082617), ('Swagger-Parser', 'NOUN', 0.9857584236232351), ('version', 'NOUN', 0.0006854085734517694), ('<', 'NOUN', 0.010604707649251905), ('=', 'NOUN', 0.12230447475664998), ('yaml', 'NOUN', 0.0016869754884054574), ('functionality', 'NOUN', 0.06391769411946426), ('results', 'NOUN', 0.10515763059902429), ('code', 'NOUN', 0.004775244577621033), ('specification', 'NOUN', 0.13737561892184097), ("'generate", 'NOUN', 0.1367054820086479), ("'validate", 'NOUN', 0.0031592905637227416), ('command', 'NOUN', 0.003767222836249654), ('swagger-codegen', 'NOUN', 0.9574839300441667), ('=', 'NOUN', 0.12230447475664998), ('code', 'NOUN', 1.1211063219465053e-05), ('commands', 'NOUN', 0.00015319610911000917), ('yaml', 'NOUN', 0.00107314281744077), ('specification', 'NOUN', 2.210656276593673e-06)]
[('Java', 'NOUN', 2.215129492197324e-06), ('WebSocket', 'NOUN', 5.242209922260807e-05), ('client', 'NOUN', 0.00035092420514156754), ('nv-websocket-client', 'NOUN', 0.00952

[('MODX', 'NOUN', 0.8683135290829723), ('Revolution', 'NOUN', 0.9952149400277436), ('version', 'NOUN', 0.9444030599552373), ('SQL', 'NOUN', 8.639233761843758e-06), ('injection', 'NOUN', 0.0006251595543015433), ('sanitization', 'NOUN', 2.2613849268823193e-05), ('escape', 'NOUN', 6.336555446547773e-06), ('method', 'NOUN', 0.0006251595543015433), ('user', 'NOUN', 4.5288222641374016e-05), ('database', 'NOUN', 0.0008938050525954044), ('privileges', 'NOUN', 2.210656276593673e-06)]
[('MODX', 'NOUN', 0.8850984604482687), ('Revolution', 'NOUN', 0.9444030599552373), ('name', 'NOUN', 2.490157471573966e-05), ('parameters', 'NOUN', 7.714160744863847e-08), ('System', 'NOUN', 2.8388825964896435e-06), ('Settings', 'NOUN', 0.00035092420514156754), ('module', 'NOUN', 0.0005981903921491989), ('XSS', 'NOUN', 2.0874205710364934e-08), ('A', 'NOUN', 3.082819199009126e-08), ('payload', 'NOUN', 0.00020043748637982355), ('user', 'NOUN', 0.007506698982012778), ('module', 'NOUN', 3.158086379228234e-07)]
[('Direct

[('Odoo', 'NOUN', 0.9966825814086888), ('Odoo', 'NOUN', 0.024295547738791314), ('Community', 'NOUN', 0.3033919865071487), ('Edition', 'NOUN', 0.7834888393006215), ('Odoo', 'NOUN', 0.12733293682804228), ('Enterprise', 'NOUN', 0.3033919865071487), ('Edition', 'NOUN', 0.9156552226751903), ('attackers', 'NOUN', 0.0005359184085893051), ('authentication', 'NOUN', 0.008931949512865793), ('circumstances', 'NOUN', 0.050542353944247484), ('parameters', 'NOUN', 0.21001167748716784), ('characters', 'NOUN', 0.10339731584235361), ('database', 'NOUN', 0.0010596774007850269), ('layer', 'NOUN', 0.0010560154835249954), ('Psycopg', 'NOUN', 0.24753908758521723)]
[('Odoo', 'NOUN', 0.9966825814086888), ('Odoo', 'NOUN', 0.024295547738791314), ('Community', 'NOUN', 0.3033919865071487), ('Edition', 'NOUN', 0.7834888393006215), ('Odoo', 'NOUN', 0.12733293682804228), ('Enterprise', 'NOUN', 0.3033919865071487), ('Edition', 'NOUN', 0.9156552226751903), ('access', 'NOUN', 1.5641461092381227e-05), ('control', 'NOUN'

In [49]:
trace_names = ['vanilla_extractor'] + ['extract_features_v%d' % i for i in range(len(accuracy_list))]
data = [go.Scatter(x=x, y=ac, name=trace_names[i]) for i, ac in enumerate(accuracy_list)]

layout = go.Layout(
    yaxis=dict(
        title='Accuracy',
        titlefont=dict(
            color='grey'
        )
    ),
    xaxis=dict(
        title='Candidates',
        titlefont=dict(
            color='grey'
        )
    ),
    shapes=[
        {
            'type': 'line',
            'x0': mean_guess,
            'x1': mean_guess,
            'y0': -0.05,
            'y1': 1.1,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        },
        {
            'type': 'line',
            'x0': -0.5,
            'x1': 20,
            'y0': old_accuracy,
            'y1': old_accuracy,
            'opacity': 0.2,
            'line': {
                'dash': 'dash'
            }
        }
    ]
)

fig = go.Figure(data=data, layout=layout)

In [50]:
iplot(fig, show_link=False)

## Try predictions on df

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1125 entries, (Microsoft, ChakraCore) to (memcached, memcached)
Data columns (total 7 columns):
cve_id           1125 non-null object
url              1125 non-null object
description      1125 non-null object
version_range    1125 non-null object
Java             1125 non-null int64
JavaScript       1125 non-null int64
Python           1125 non-null int64
dtypes: int64(3), object(4)
memory usage: 81.9+ KB


In [52]:
def predict(sent:str, classifier=None, extractor=None, cve_id: list = None, n=1) -> float:
    tagged = nltk.pos_tag(nltk.word_tokenize(sent), tagset='universal')
    
    prob_dist = [classifier.prob_classify(extractor(tagged, j, cve_id)) for j in range(len(tagged))]
    
    probs = [(word, tag, prob.prob(True)) for (word, tag), prob in zip(tagged, prob_dist) if word not in stopwords and tag == 'NOUN' and not re.match(u"version[s]?", word)]
    probs = sorted(probs, key=lambda x: x[2], reverse=True)

    return probs[:n]

In [53]:
# Choose a classifier and train it on the whole df
feature_list = create_feature_list_long(extract_features_v3, descriptions, labels, cve_ids)
classifier = nltk.NaiveBayesClassifier.train(feature_list)

In [54]:
predictions = [None] * len(df)
for i, desc in enumerate(df.description.values):
    probs = predict(desc, classifier, extract_features_v3, cve_id=df.cve_id.values[i], n=3)
    predictions[i] = probs

In [55]:
# get just the names
pred_proj_names = [tuple(zip(*p))[0] for p in predictions]

In [56]:
pred_df = pd.Series(pred_proj_names, name='prediction')

In [57]:
df['prediction'] = pred_proj_names
df[(df.Java == 0) | (df.JavaScript == 0)][['description', 'prediction']].reset_index().style

Unnamed: 0,username,project,description,prediction
0,Microsoft,ChakraCore,"A remote code execution vulnerability exists when Microsoft scripting engine improperly accesses objects in memory. The vulnerability could corrupt memory in a way that enables an attacker to execute arbitrary code in the context of the current user. An attacker who successfully exploited the vulnerability could gain the same user rights as the current user, aka ""Scripting Engine Memory Corruption Vulnerability.""","('Microsoft', 'attacker', 'Vulnerability')"
1,Microsoft,ChakraCore,"A remote code execution vulnerability exists in the way affected Microsoft scripting engine render when handling objects in memory in Microsoft browsers. The vulnerability could corrupt memory in such a way that an attacker could execute arbitrary code in the context of the current user. An attacker who successfully exploited the vulnerability could gain the same user rights as the current user, aka ""Scripting Engine Memory Corruption Vulnerability.""","('Microsoft', 'Microsoft', 'attacker')"
2,Microsoft,ChakraCore,"An information disclosure vulnerability in Microsoft scripting engine allows remote attackers to obtain sensitive information from process memory via a crafted web site, aka ""Microsoft Browser Information Disclosure Vulnerability.""","('Microsoft', 'Microsoft', 'Disclosure')"
3,Microsoft,ChakraCore,"A remote code execution vulnerability exists in Microsoft Chakra Core in the way JavaScript engines render when handling objects in memory. aka ""Scripting Engine Memory Corruption Vulnerability"". This vulnerability is unique from CVE-2017-0252.","('Microsoft', 'Vulnerability', 'render')"
4,Microsoft,ChakraCore,"A remote code execution vulnerability exists in Microsoft Chakra Core in the way JavaScript engines render when handling objects in memory. aka ""Scripting Engine Memory Corruption Vulnerability"". This vulnerability is unique from CVE-2017-0223.","('Microsoft', 'Vulnerability', 'render')"
5,torproject,tor,The hidden-service feature in Tor before 0.3.0.8 allows a denial of service (assertion failure and daemon exit) in the relay_send_end_cell_from_edge_ function via a malformed BEGIN cell.,"('Tor', 'feature', 'BEGIN')"
6,torproject,tor,The hidden-service feature in Tor before 0.3.0.8 allows a denial of service (assertion failure and daemon exit) in the connection_edge_process_relay_cell function via a BEGIN_DIR cell on a rendezvous circuit.,"('Tor', 'feature', 'failure')"
7,torproject,tor,"Tor 0.3.x before 0.3.0.9 has a guard-selection algorithm that only considers the exit relay (not the exit relay's family), which might allow remote attackers to defeat intended anonymity properties by leveraging the existence of large families.","('Tor', 'relay', 'algorithm')"
8,torproject,tor,"The rend_service_intro_established function in or/rendservice.c in Tor before 0.2.8.15, 0.2.9.x before 0.2.9.12, 0.3.0.x before 0.3.0.11, 0.3.1.x before 0.3.1.7, and 0.3.2.x before 0.3.2.1-alpha, when SafeLogging is disabled, allows attackers to obtain sensitive information by leveraging access to the log files of a hidden service, because uninitialized stack data is included in an error message about construction of an introduction point circuit.","('Tor', 'function', 'or/rendservice.c')"
9,lota,phamm,"XSS exists in the login_form function in views/helpers.php in Phamm before 0.6.7, exploitable via the PATH_INFO to main.php.","('Phamm', 'function', 'views/helpers.php')"
