In [1]:
import json
import numpy as np
import pandas as pd
import spacy

In [2]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of being left open for the life of the kernel.
    """
    with open(path, "r") as handle:
        return json.load(handle)


# Loaded here but not referenced by any other cell visible in this notebook.
w_mitigation = _load_json("cwe_mitigation_ids_temp.json")
ap_mitigation = _load_json("capec_mitigation_temp.json")
cwe = _load_json("cwe_temp.json")
capec = _load_json("capec_temp.json")

# Collections of names that are measured in the cells below.
ap_names = _load_json("ap_names.json")
cwe_names = _load_json("cwe_names.json")
technique_names = _load_json("technique_names.json")
tactic_names = _load_json("tactic_names.json")

# CVE records; a later cell reads "original_id" and "metadata" -> "description".
cve = _load_json("cve.json")

# Loaded here but not referenced by any other cell visible in this notebook.
w_dict = _load_json("w_dict.json")
ap_dict = _load_json("ap_dict.json")

# Mitigation texts are de-duplicated through set(); the resulting order is
# arbitrary, which is harmless because only means are computed from them.
# Assumes each file holds a JSON array of strings — TODO confirm.
ap_mitigation_descriptions = list(set(_load_json("ap_mitigation_descriptions.json")))
cwe_mitigation_descriptions = list(
    set(_load_json("cwe_mitigation_descriptions.json"))
)
tech_mitigation_descriptions = list(set(_load_json("tech_mitigation_names.json")))

In [3]:
# Load the spaCy "en_core_web_lg" pipeline once; process_text calls it as
# `encode(text)` to tokenise every input string.
encode = spacy.load("en_core_web_lg")

In [4]:
def process_text(text, stop_words=False, punct=False, lemma=False):
    """Tokenise ``text`` with the ``encode`` pipeline and return token strings.

    Args:
        text: String to tokenise.
        stop_words: When true, tokens flagged ``is_stop`` are left out.
        punct: When true, tokens flagged ``is_punct`` or ``is_space`` are
            left out.
        lemma: When true, each kept token contributes its ``lemma_``;
            otherwise its ``text``.

    Returns:
        A list of strings, one per kept token, in document order.
    """
    kept = []
    for tok in encode(text):
        # The punct flag filters whitespace tokens as well as punctuation.
        if punct and (tok.is_punct or tok.is_space):
            continue
        if stop_words and tok.is_stop:
            continue
        kept.append(tok.lemma_ if lemma else tok.text)
    return kept

In [5]:
# Maps each dataset label to its dict of mean statistics. Entries are written
# by compute_and_aggregate_data; the whole mapping is later turned into a
# DataFrame and dumped to descriptive_stats.json.
aggregated_data = {}

In [6]:
def compute_and_aggregate_data(key, input):
    """Store mean length statistics for ``input`` under ``aggregated_data[key]``.

    Every text is run through the ``encode`` pipeline exactly once and all
    token-based measurements are derived from that single parse. The previous
    version parsed each text four times (once per ``process_text`` call),
    which quadrupled the cost of the most expensive step without changing
    any result. The filters below mirror ``process_text``:

    * stop words: tokens with ``is_stop`` are not counted;
    * punctuation: tokens with ``is_punct`` or ``is_space`` are not counted;
    * lemmatisation: every token's ``lemma_`` is joined with single spaces.

    Args:
        key: Label under which the statistics are stored in the module-level
            ``aggregated_data`` dict. Any existing entry is replaced.
        input: Iterable of strings to measure. (The name shadows the builtin
            but is kept so existing callers are unaffected.)

    Returns:
        None. The result is written to ``aggregated_data[key]`` as a dict of
        five means.

    Note:
        For an empty ``input`` each ``np.mean`` call is given an empty list
        and yields ``nan`` (with a NumPy RuntimeWarning).
    """
    token_counts = []
    non_stop_token_counts = []
    non_punct_token_counts = []
    char_counts = []
    lemma_char_counts = []

    for text in input:
        doc = encode(text)  # single parse shared by all measurements below

        token_counts.append(len(doc))
        non_stop_token_counts.append(sum(1 for token in doc if not token.is_stop))
        non_punct_token_counts.append(
            sum(1 for token in doc if not (token.is_punct or token.is_space))
        )
        char_counts.append(len(text))
        lemma_char_counts.append(len(" ".join(token.lemma_ for token in doc)))

    # Keys and their order are part of the saved output; keep them unchanged.
    aggregated_data[key] = {
        "# of tokens": np.mean(token_counts),
        "# of tokens after removing stop words": np.mean(non_stop_token_counts),
        "# of tokens after removing punctuation": np.mean(non_punct_token_counts),
        "# of chars": np.mean(char_counts),
        "# of chars after lemmatization": np.mean(lemma_char_counts),
    }

In [7]:
# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["CWE Names"] itself.
compute_and_aggregate_data("CWE Names", cwe_names)

In [8]:
# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["CAPEC Names"] itself.
compute_and_aggregate_data("CAPEC Names", ap_names)

In [9]:
# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["CWE Mitigations"] itself.
compute_and_aggregate_data("CWE Mitigations", cwe_mitigation_descriptions)

In [10]:
# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["CAPEC Mitigations"] itself.
compute_and_aggregate_data("CAPEC Mitigations", ap_mitigation_descriptions)

In [11]:
# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["Technique Names"] itself.
compute_and_aggregate_data("Technique Names", technique_names)

In [12]:
# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["Technique Mitigations"] itself.
compute_and_aggregate_data("Technique Mitigations", tech_mitigation_descriptions)

In [13]:
# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["Tactic Names"] itself.
compute_and_aggregate_data("Tactic Names", tactic_names)

In [14]:
# Only CVE records whose identifier carries this year field are measured.
CVE_YEAR = "2021"

# The id is split on "-" and its second field is compared with CVE_YEAR
# (presumably ids look like "CVE-YYYY-NNNN" — confirm against cve.json).
# Note that the collected values are the records' descriptions, even though
# they are stored under the "CVE Names" label.
cve_names = [
    record["metadata"]["description"]
    for record in cve
    if record["original_id"].split("-")[1] == CVE_YEAR
]

# No placeholder dict is needed: compute_and_aggregate_data assigns
# aggregated_data["CVE Names"] itself.
compute_and_aggregate_data("CVE Names", cve_names)

In [15]:
# Build the frame from the nested dict, then flip it so that each dataset
# label becomes a row and each statistic a column.
aggregated_df = pd.DataFrame(aggregated_data).transpose()
aggregated_df

Unnamed: 0,# of tokens,# of tokens after removing stop words,# of tokens after removing punctuation,# of chars,# of chars after lemmatization
CWE Names,6.806277,5.436147,6.12987,45.454545,45.69697
CAPEC Names,4.064103,3.591575,3.846154,28.653846,28.128205
CWE Mitigations,42.518929,25.731302,36.602955,245.99723,244.262235
CAPEC Mitigations,25.239437,16.034571,22.18822,146.505762,145.057618
Technique Names,2.830389,2.657244,2.729682,20.40636,20.367491
Technique Mitigations,3.023256,2.813953,2.906977,23.511628,23.604651
Tactic Names,2.142857,2.071429,1.571429,13.714286,14.857143
CVE Names,59.166632,41.176008,50.513994,330.769247,333.20646


In [16]:
# Persist the aggregated statistics as JSON next to the notebook.
serialized_stats = json.dumps(aggregated_data)
with open("descriptive_stats.json", "w") as out_file:
    out_file.write(serialized_stats)