In [1]:
import json
import pickle

import numpy as np
import pandas as pd
import spacy
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM

In [2]:
# Negative examples: (attack pattern id, CWE id) pairs read from the "ap" and "cwe"
# fields of each record in the JSON file.
# The file is opened in a `with` block so the handle is closed deterministically;
# the previous `json.load(open(...))` left that to the garbage collector.
with open("negative_examples_100_cwe_capec.json", "r") as f:
    negative_example_json = json.load(f)

negative_example_ids = [
    (example["ap"], example["cwe"]) for example in negative_example_json
]

In [3]:
def _read_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path, "r") as handle:
        return json.load(handle)


# Lookup tables indexed by id (entries are read below via keys such as "name",
# "cwes", "techniques", "tactics", "mitigations" and "detections").
w_dict = _read_json("w_dict.json")
ap_dict = _read_json("ap_dict.json")
technique_dict = _read_json("technique_dict.json")
tactic_dict = _read_json("tactic_dict.json")

# Text corpora used to fit the bag-of-words vectorizers: names ...
cwe_names = _read_json("cwe_names.json")
ap_names = _read_json("ap_names.json")
technique_names = _read_json("technique_names.json")
tactic_names = _read_json("tactic_names.json")

# ... mitigation texts ...
ap_mitigation_descriptions = _read_json("ap_mitigation_descriptions.json")
cwe_mitigation_descriptions = _read_json("cwe_mitigation_descriptions.json")
tech_mitigation_names = _read_json("tech_mitigation_names.json")

# ... and detection texts.
tech_detection_names = _read_json("tech_detection_names.json")
ap_detection_descriptions = _read_json("ap_detection_descriptions.json")
cwe_detection_descriptions = _read_json("cwe_detection_descriptions.json")

In [4]:
# Mitigation / detection record lists. The feature builders scan these lists and
# match each record's "_id" against the ids stored in w_dict / ap_dict / technique_dict.
# Every file is read inside a `with` block so its handle is closed right away
# (previously six handles were opened and never closed).
with open("cwe_mitigation_ids_temp.json") as f:
    w_mitigation = json.load(f)

with open("capec_mitigation_temp.json") as f:
    ap_mitigation = json.load(f)

with open("technique_mitigation_temp.json") as f:
    technique_mitigation = json.load(f)

with open("technique_detection_temp.json") as f:
    technique_detection = json.load(f)

with open("capec_detection_temp.json") as f:
    ap_detection = json.load(f)

with open("cwe_detection_temp.json") as f:
    w_detection = json.load(f)

In [5]:
# Positive examples: every (attack pattern, CWE) pair listed under an attack
# pattern's "cwes" entry in ap_dict.
positive_example_ids = [
    (ap, cwe) for ap in ap_dict for cwe in ap_dict[ap]["cwes"]
]

In [6]:
# Positives first, then negatives; the labelling step later depends on this order.
example_ids = [*positive_example_ids, *negative_example_ids]

In [7]:
# One bag-of-words vocabulary per text field. CountVectorizer.fit returns the
# fitted estimator itself, so construction and fitting are chained.

# Names
ap_name_vectorizer = CountVectorizer().fit(ap_names)
technique_name_vectorizer = CountVectorizer().fit(technique_names)
cwe_name_vectorizer = CountVectorizer().fit(cwe_names)
tactic_name_vectorizer = CountVectorizer().fit(tactic_names)

# Mitigation texts
ap_mitigation_vectorizer = CountVectorizer().fit(ap_mitigation_descriptions)
cwe_mitigation_vectorizer = CountVectorizer().fit(cwe_mitigation_descriptions)
tech_mitigation_vectorizer = CountVectorizer().fit(tech_mitigation_names)

# Detection texts
ap_detection_vectorizer = CountVectorizer().fit(ap_detection_descriptions)
cwe_detection_vectorizer = CountVectorizer().fit(cwe_detection_descriptions)
tech_detection_vectorizer = CountVectorizer()

# Kept as the cell's final bare expression so the fitted estimator is still displayed.
tech_detection_vectorizer.fit(tech_detection_names)

CountVectorizer()

In [8]:
# spaCy pipeline "en_core_web_lg"; spaCy_vector() calls it on a text and reads the
# resulting document's `.vector`.
encode = spacy.load("en_core_web_lg")

In [9]:
# Both BERT models are moved to this device.
device = "cpu"

# Stock "bert-base-uncased": the tokenizer plus the bare encoder.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
pretrained_model = AutoModel.from_pretrained("bert-base-uncased")
pretrained_model = pretrained_model.to(device)

# Checkpoint resolved from "bert_base", loaded with a masked-LM head
# (presumably the fine-tuned weights, going by the variable name — TODO confirm).
model_path = "bert_base"
finetuned_model = AutoModelForMaskedLM.from_pretrained(model_path)
finetuned_model = finetuned_model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def vector_encoding(
    encoding_type, text, vectorizer=None, bert_output_type=None, bert_finetuned=False
):
    """Encode ``text`` with the requested scheme.

    Parameters
    ----------
    encoding_type : str
        ``"None"`` (return ``text`` unchanged), ``"BoW"``, ``"spaCy"`` or ``"BERT"``.
    text : str
        Text to encode.
    vectorizer : fitted vectorizer, optional
        Only used for ``"BoW"``; passed to ``vectorizer_transform``.
    bert_output_type : str, optional
        Only used for ``"BERT"``: ``"pooler_output"`` or ``"hidden_state"``.
    bert_finetuned : bool
        Only used for ``"BERT"``: use ``finetuned_model`` instead of
        ``pretrained_model``.

    Returns
    -------
    The unchanged text for ``"None"``, otherwise the vector produced by the
    selected encoder.

    Raises
    ------
    ValueError
        If ``encoding_type`` is not one of the four supported values, or if
        ``encoding_type`` is ``"BERT"`` and ``bert_output_type`` is not supported.
        Previously these cases silently returned ``None``, which only surfaced
        later as a confusing failure when the features were stacked.
    """
    if encoding_type == "None":
        return text
    if encoding_type == "BoW":
        return vectorizer_transform(text, vectorizer)
    if encoding_type == "spaCy":
        return spaCy_vector(text)
    if encoding_type == "BERT":
        # Validate before touching either model so a typo fails fast.
        if bert_output_type not in ("pooler_output", "hidden_state"):
            raise ValueError(
                "bert_output_type must be 'pooler_output' or 'hidden_state', "
                f"got {bert_output_type!r}"
            )
        model = finetuned_model if bert_finetuned else pretrained_model
        if bert_output_type == "pooler_output":
            return get_pooler_output(model, text)
        return get_hidden_state(model, text)

    raise ValueError(
        "encoding_type must be one of 'None', 'BoW', 'spaCy', 'BERT', "
        f"got {encoding_type!r}"
    )


def vectorizer_transform(input_to_BoW, vectorizer):
    """Return the 1-D dense count vector of ``input_to_BoW`` under ``vectorizer``.

    The text is wrapped in a one-element list, transformed, and the single
    resulting row is densified and flattened.
    """
    transformed = vectorizer.transform([input_to_BoW])
    only_row = transformed[0]
    dense_row = only_row.toarray()
    return dense_row.flatten()


def spaCy_vector(text):
    """Run ``text`` through the module-level spaCy pipeline ``encode`` and return its ``.vector``."""
    doc = encode(text)
    return doc.vector


def get_pooler_output(model, text):
    """Return ``model``'s ``pooler_output`` for ``text`` as a flat NumPy array.

    Parameters
    ----------
    model : loaded transformers model
        Called with the tokenized inputs; its output must expose ``pooler_output``.
        NOTE(review): ``finetuned_model`` is loaded with a masked-LM head, whose
        output may not have a ``pooler_output`` field — confirm before using this
        with ``bert_finetuned=True``.
    text : str
        Input text; lower-cased before tokenization and truncated by the tokenizer.

    Returns
    -------
    numpy.ndarray
        The pooled output, detached, moved to CPU and flattened.
    """
    inputs = tokenizer(text.lower(), truncation=True, return_tensors="pt")
    # Keep the inputs on the model's device so this still works if `device`
    # is changed from "cpu".
    inputs = inputs.to(model.device)
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        outputs = model(**inputs)
    pooled_output = outputs.pooler_output
    return pooled_output.detach().cpu().numpy().flatten()


def get_hidden_state(model, text):
    """Return the last-layer hidden state at token position 0 as a flat NumPy array.

    Parameters
    ----------
    model : loaded transformers model
        Called with ``output_hidden_states=True``; its output must expose
        ``hidden_states``.
    text : str
        Input text; lower-cased before tokenization and truncated by the tokenizer.

    Returns
    -------
    numpy.ndarray
        ``hidden_states[-1][:, 0, :]`` (last entry of the hidden-state tuple,
        first token position), detached, moved to CPU and flattened.
    """
    inputs = tokenizer(text.lower(), truncation=True, return_tensors="pt")
    # Keep the inputs on the model's device so this still works if `device`
    # is changed from "cpu".
    inputs = inputs.to(model.device)
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states
    return hidden_states[-1][:, 0, :].detach().cpu().numpy().flatten()


# Fitted CountVectorizers used by append_data, keyed by data_combo. The corpora are
# module-level lists, so each vocabulary only needs fitting once; re-running this
# cell resets the cache (do that after reloading the corpora).
_APPEND_DATA_VECTORIZERS = {}


def _append_data_vectorizer(data_combo):
    """Return the CountVectorizer fitted on the corpus that belongs to ``data_combo``."""
    if data_combo in _APPEND_DATA_VECTORIZERS:
        return _APPEND_DATA_VECTORIZERS[data_combo]

    if data_combo == "A0":
        # NOTE(review): the "A0" vocabulary does not include cwe_names although the
        # CWE name is part of the encoded text — confirm this is intended.
        corpus = ap_names + technique_names
    elif data_combo in ("A1", "A1 + MI", "A1 + D", "A1 + MI + D"):
        corpus = ap_names + technique_names + cwe_names + tactic_names
        if data_combo in ("A1 + MI", "A1 + MI + D"):
            corpus = (
                corpus
                + cwe_mitigation_descriptions
                + ap_mitigation_descriptions
                + tech_mitigation_names
            )
        if data_combo in ("A1 + D", "A1 + MI + D"):
            corpus = (
                corpus
                + cwe_detection_descriptions
                + ap_detection_descriptions
                + tech_detection_names
            )
    else:
        # Unknown combination: hand back an unfitted vectorizer, as before.
        return CountVectorizer()

    fitted = CountVectorizer().fit(corpus)
    _APPEND_DATA_VECTORIZERS[data_combo] = fitted
    return fitted


def append_data(
    encoding_type, data_combo, ap, cwe, bert_output_type=None, bert_finetuned=False
):
    """Concatenate all texts for an (attack pattern, CWE) pair and encode them once.

    Parameters
    ----------
    encoding_type : str
        Passed to ``vector_encoding`` ("None", "BoW", "spaCy" or "BERT").
    data_combo : str
        Which texts to include: "A0" (attack pattern and CWE names), "A1" (adds
        technique and tactic names), optionally extended with " + MI"
        (mitigations) and/or " + D" (detections).
    ap, cwe : str
        Keys into ``ap_dict`` and ``w_dict``.
    bert_output_type, bert_finetuned
        Forwarded to ``vector_encoding``.

    Returns
    -------
    Whatever ``vector_encoding`` returns for the space-joined text.

    Notes
    -----
    * Technique detections are now looked up in ``technique_detection`` and
      contribute their ``"name"``, matching ``handle_data`` / ``encode_data`` and
      the ``tech_detection_names`` vocabulary. The previous code scanned
      ``technique_mitigation`` and read ``"metadata"``.
    * A bag-of-words vectorizer is only obtained for ``encoding_type == "BoW"``,
      and it is fitted once per ``data_combo`` instead of on every call.
    """
    output = [ap_dict[ap]["name"], w_dict[cwe]["name"]]

    techniques = ap_dict[ap]["techniques"]

    if "A1" in data_combo:
        # NOTE(review): the CWE name is appended a second time here (it is already
        # the second element of `output`); kept as-is so existing features do not
        # change — confirm whether the duplication is intended.
        output.append(w_dict[cwe]["name"])

        for technique in techniques:
            output.append(technique_dict[technique]["name"])

        for technique in techniques:
            for tac in technique_dict[technique]["tactics"]:
                output.append(tactic_dict[tac]["name"])

    if data_combo in ["A1 + MI", "A1 + MI + D"]:
        for mitigation in w_dict[cwe]["mitigations"]:
            for cwe_mit in w_mitigation:
                if mitigation == cwe_mit["_id"]:
                    output.append(cwe_mit["metadata"]["Description"])

        for mitigation in ap_dict[ap]["mitigations"]:
            for ap_mit in ap_mitigation:
                if mitigation == ap_mit["_id"]:
                    output.append(ap_mit["metadata"])

        for technique in techniques:
            for mitigation in technique_dict[technique]["mitigations"]:
                for tech_mit in technique_mitigation:
                    if mitigation == tech_mit["_id"]:
                        output.append(tech_mit["name"])

    if data_combo in ["A1 + D", "A1 + MI + D"]:
        for detection in w_dict[cwe]["detections"]:
            for cwe_det in w_detection:
                if detection == cwe_det["_id"]:
                    output.append(cwe_det["metadata"]["Description"])

        for detection in ap_dict[ap]["detections"]:
            for ap_det in ap_detection:
                if detection == ap_det["_id"]:
                    output.append(ap_det["metadata"])

        for technique in techniques:
            for detection in technique_dict[technique]["detections"]:
                # Fixed: search the detection records (not the mitigation records).
                for tech_det in technique_detection:
                    if detection == tech_det["_id"]:
                        output.append(tech_det["name"])

    # Only the bag-of-words encoder needs a vectorizer.
    vectorizer = _append_data_vectorizer(data_combo) if encoding_type == "BoW" else None

    output = " ".join(output)
    return vector_encoding(
        encoding_type, output, vectorizer, bert_output_type, bert_finetuned
    )


def handle_data(
    encoding_type, data_combo, ap, cwe, bert_output_type=None, bert_finetuned=False
):
    """Encode an (attack pattern, CWE) pair field by field and stack the results.

    For every field selected by ``data_combo`` the matching texts are joined with
    spaces into one string, encoded once with that field's vectorizer via
    ``vector_encoding``, and the encodings are combined with ``np.hstack``.

    Parameters
    ----------
    encoding_type : str
        Passed to ``vector_encoding``.
    data_combo : str
        "A0" encodes only the two names; any value containing "A1" adds technique
        and tactic names; "A1 + MI" / "A1 + MI + D" add mitigations;
        "A1 + D" / "A1 + MI + D" add detections.
    ap, cwe : str
        Keys into ``ap_dict`` and ``w_dict``.
    bert_output_type, bert_finetuned
        Forwarded to ``vector_encoding``.

    Returns
    -------
    The result of ``np.hstack`` over the per-field encodings.
    """

    def encoded(text, field_vectorizer):
        # Every field shares the same encoder settings; only text and vectorizer vary.
        return vector_encoding(
            encoding_type, text, field_vectorizer, bert_output_type, bert_finetuned
        )

    parts = [
        encoded(ap_dict[ap]["name"], ap_name_vectorizer),
        encoded(w_dict[cwe]["name"], cwe_name_vectorizer),
    ]

    techniques = ap_dict[ap]["techniques"]

    if "A1" in data_combo:
        technique_text = " ".join(
            technique_dict[technique]["name"] for technique in techniques
        )
        parts.append(encoded(technique_text, technique_name_vectorizer))

        tactic_text = " ".join(
            tactic_dict[tac]["name"]
            for technique in techniques
            for tac in technique_dict[technique]["tactics"]
        )
        parts.append(encoded(tactic_text, tactic_name_vectorizer))

    if data_combo in ["A1 + MI", "A1 + MI + D"]:
        cwe_mitigation_text = " ".join(
            record["metadata"]["Description"]
            for wanted_id in w_dict[cwe]["mitigations"]
            for record in w_mitigation
            if wanted_id == record["_id"]
        )
        parts.append(encoded(cwe_mitigation_text, cwe_mitigation_vectorizer))

        capec_mitigation_text = " ".join(
            record["metadata"]
            for wanted_id in ap_dict[ap]["mitigations"]
            for record in ap_mitigation
            if wanted_id == record["_id"]
        )
        parts.append(encoded(capec_mitigation_text, ap_mitigation_vectorizer))

        tech_mitigation_text = " ".join(
            record["name"]
            for technique in techniques
            for wanted_id in technique_dict[technique]["mitigations"]
            for record in technique_mitigation
            if wanted_id == record["_id"]
        )
        parts.append(encoded(tech_mitigation_text, tech_mitigation_vectorizer))

    if data_combo in ["A1 + D", "A1 + MI + D"]:
        cwe_detection_text = " ".join(
            record["metadata"]["Description"]
            for wanted_id in w_dict[cwe]["detections"]
            for record in w_detection
            if wanted_id == record["_id"]
        )
        parts.append(encoded(cwe_detection_text, cwe_detection_vectorizer))

        capec_detection_text = " ".join(
            record["metadata"]
            for wanted_id in ap_dict[ap]["detections"]
            for record in ap_detection
            if wanted_id == record["_id"]
        )
        parts.append(encoded(capec_detection_text, ap_detection_vectorizer))

        tech_detection_text = " ".join(
            record["name"]
            for technique in techniques
            for wanted_id in technique_dict[technique]["detections"]
            for record in technique_detection
            if wanted_id == record["_id"]
        )
        parts.append(encoded(tech_detection_text, tech_detection_vectorizer))

    return np.hstack(parts)


def encode_data(
    encoding_type, data_combo, ap, cwe, bert_output_type=None, bert_finetuned=False
):
    """Encode every text item of an (attack pattern, CWE) pair separately and stack them.

    Unlike ``handle_data``, each technique name, tactic name, mitigation and
    detection text gets its own ``vector_encoding`` call, so the number of stacked
    encodings depends on how many items the pair is linked to.

    Parameters
    ----------
    encoding_type : str
        Passed to ``vector_encoding``.
    data_combo : str
        "A0" encodes only the two names; any value containing "A1" adds technique
        and tactic names; "A1 + MI" / "A1 + MI + D" add mitigations;
        "A1 + D" / "A1 + MI + D" add detections.
    ap, cwe : str
        Keys into ``ap_dict`` and ``w_dict``.
    bert_output_type, bert_finetuned
        Forwarded to ``vector_encoding``.

    Returns
    -------
    The result of ``np.hstack`` over all individual encodings.
    """

    def encoded(text, field_vectorizer):
        # Every item shares the same encoder settings; only text and vectorizer vary.
        return vector_encoding(
            encoding_type, text, field_vectorizer, bert_output_type, bert_finetuned
        )

    parts = [
        encoded(ap_dict[ap]["name"], ap_name_vectorizer),
        encoded(w_dict[cwe]["name"], cwe_name_vectorizer),
    ]

    techniques = ap_dict[ap]["techniques"]

    if "A1" in data_combo:
        parts.extend(
            encoded(technique_dict[technique]["name"], technique_name_vectorizer)
            for technique in techniques
        )
        parts.extend(
            encoded(tactic_dict[tac]["name"], tactic_name_vectorizer)
            for technique in techniques
            for tac in technique_dict[technique]["tactics"]
        )

    if data_combo in ["A1 + MI", "A1 + MI + D"]:
        parts.extend(
            encoded(record["metadata"]["Description"], cwe_mitigation_vectorizer)
            for wanted_id in w_dict[cwe]["mitigations"]
            for record in w_mitigation
            if wanted_id == record["_id"]
        )
        parts.extend(
            encoded(record["metadata"], ap_mitigation_vectorizer)
            for wanted_id in ap_dict[ap]["mitigations"]
            for record in ap_mitigation
            if wanted_id == record["_id"]
        )
        parts.extend(
            encoded(record["name"], tech_mitigation_vectorizer)
            for technique in techniques
            for wanted_id in technique_dict[technique]["mitigations"]
            for record in technique_mitigation
            if wanted_id == record["_id"]
        )

    if data_combo in ["A1 + D", "A1 + MI + D"]:
        parts.extend(
            encoded(record["metadata"]["Description"], cwe_detection_vectorizer)
            for wanted_id in w_dict[cwe]["detections"]
            for record in w_detection
            if wanted_id == record["_id"]
        )
        parts.extend(
            encoded(record["metadata"], ap_detection_vectorizer)
            for wanted_id in ap_dict[ap]["detections"]
            for record in ap_detection
            if wanted_id == record["_id"]
        )
        parts.extend(
            encoded(record["name"], tech_detection_vectorizer)
            for technique in techniques
            for wanted_id in technique_dict[technique]["detections"]
            for record in technique_detection
            if wanted_id == record["_id"]
        )

    return np.hstack(parts)

In [11]:
# Feature matrix: one spaCy-based handle_data vector per (attack pattern, CWE) pair.
# Bag-of-words alternative: append_data("BoW", "A0", ap, cwe)
examples = [handle_data("spaCy", "A0", ap, cwe) for ap, cwe in example_ids]

# example_ids is positive_example_ids followed by negative_example_ids, so the
# labels follow the size of each list. The previous `i < len(example_ids) / 2`
# rule was only correct when both lists happened to have the same length.
labels = [1] * len(positive_example_ids) + [0] * len(negative_example_ids)
assert len(labels) == len(examples), "example_ids must be positives + negatives"

X_train, X_test, y_train, y_test = train_test_split(
    examples, labels, test_size=0.3, random_state=0
)

In [12]:
# Random forest with equal weight on both classes and a fixed seed.
clf = RandomForestClassifier(
    class_weight={0: 1, 1: 1},
    random_state=0,
)
clf.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 1, 1: 1}, random_state=0)

In [13]:
# Persist the trained classifier; the same file is loaded further down as spaCy_clf.
# (`pickle` is imported in the notebook's import cell.)
filename = "link_probabilities_model_spaCy_cwe_capec.pkl"
with open(filename, "wb") as f:
    pickle.dump(clf, f)

In [14]:
def gather_text_data(ap, cwe, data_combo):
    """Collect the human-readable texts behind an (attack pattern, CWE) pair.

    Returns a dict with the attack pattern name under "AP" and the CWE name under
    "CWE". When ``data_combo`` contains "A1" it also holds "Techniques" (names of
    the attack pattern's techniques) and "Tactics" (names of those techniques'
    tactics), both as lists.
    """
    text_fields = {
        "AP": ap_dict[ap]["name"],
        "CWE": w_dict[cwe]["name"],
    }
    if "A1" not in data_combo:
        return text_fields

    text_fields["Techniques"] = [
        technique_dict[technique]["name"] for technique in ap_dict[ap]["techniques"]
    ]
    text_fields["Tactics"] = [
        tactic_dict[tac]["name"]
        for technique in ap_dict[ap]["techniques"]
        for tac in technique_dict[technique]["tactics"]
    ]
    return text_fields

In [15]:
# Reload classifiers that were trained earlier (`pickle` is imported in the
# notebook's import cell).
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load pickles that you created yourself.
# The BoW pickle is not written anywhere in this notebook, so it must already exist
# on disk (presumably from an earlier run that trained on
# append_data("BoW", "A0", ...) features — TODO confirm).
filename = "link_probabilities_model_BoW_cwe_capec.pkl"
with open(filename, "rb") as f:
    BoW_clf = pickle.load(f)

filename = "link_probabilities_model_spaCy_cwe_capec.pkl"
with open(filename, "rb") as f:
    spaCy_clf = pickle.load(f)

# BERT classifier (disabled). Kept as comments rather than a string literal so the
# cell no longer displays the dead code as its output.
# filename = 'link_probabilities_model_BERT.pkl'
# with open(filename, "rb") as f:
#     BERT_clf = pickle.load(f)

'filename = \'link_probabilities_model_BERT.pkl\'\nwith open(filename, "rb") as f:\n    BERT_clf = pickle.load(f)'

In [16]:
# Candidate pairs: every (attack pattern, CWE) combination that is not already a
# labelled example. Built in dict order rather than via a set difference, so the
# row order of the results is the same on every run.
known_pairs = set(example_ids)
examples = [
    (ap, cwe) for ap in ap_dict for cwe in w_dict if (ap, cwe) not in known_pairs
]

BoW_negative_input = []
spaCy_negative_input = []
BERT_negative_input = []

all_relevant_text = []
for capec, cwe in examples:
    # Previously BoW_negative_input was never filled, so BoW_clf.predict_proba was
    # handed an empty list. The features mirror the bag-of-words option noted in the
    # training cell; NOTE(review): confirm BoW_clf was trained on these features.
    BoW_negative_input.append(append_data("BoW", "A0", capec, cwe))
    spaCy_negative_input.append(handle_data("spaCy", "A0", capec, cwe))
    # BERT_negative_input.append(handle_data("BERT", "A1", capec, technique, bert_output_type="hidden_state", bert_finetuned=True))
    all_relevant_text.append(gather_text_data(capec, cwe, "A0"))

# Column 1 of predict_proba is the probability of the positive ("linked") class.
BoW_y_probs = BoW_clf.predict_proba(BoW_negative_input)[:, 1]
spaCy_y_probs = spaCy_clf.predict_proba(spaCy_negative_input)[:, 1]
# BERT_y_probs = BERT_clf.predict_proba(BERT_negative_input)[:, 1]

In [None]:
# One record per candidate pair, keyed by the (capec, cwe) tuple. The ids are split
# on "/" and the second piece is kept for the "CAPEC" / "CWE" columns.
dataframe_dict = {
    (capec, cwe): {
        "CAPEC": capec.split("/")[1],
        "CWE": cwe.split("/")[1],
        "CAPEC name": ap_dict[capec]["name"],
        "CWE name": w_dict[cwe]["name"],
        "BoW link probability": BoW_y_probs[i],
        "spaCy link probability": spaCy_y_probs[i],
    }
    for i, (capec, cwe) in enumerate(examples)
}

In [None]:
# Transpose so each pair becomes a row, drop the (capec, cwe) tuple index, and export.
df = pd.DataFrame(dataframe_dict).T.reset_index(drop=True)
df.to_csv("link_probs_spaCy_cwe_capec.csv", index=False)