In [1]:
import json
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset

In [2]:
def _read_json(path):
    """Open ``path`` in text mode, parse its contents as JSON, and return the result."""
    with open(path, "r") as handle:
        return json.load(handle)


# Parsed contents of the four dictionary files in the working directory.
w_dict = _read_json("w_dict.json")
ap_dict = _read_json("ap_dict.json")
technique_dict = _read_json("technique_dict.json")
tactic_dict = _read_json("tactic_dict.json")

In [3]:
# Load the mitigation / detection JSON files from the working directory.
# Each file is opened in a `with` block so its handle is closed as soon as
# parsing finishes; rebinding `f` to a new `open()` call without closing the
# previous one left every handle open for the life of the kernel.
with open("cwe_mitigation_ids_temp.json") as f:
    w_mitigation = json.load(f)

with open("capec_mitigation_temp.json") as f:
    ap_mitigation = json.load(f)

with open("technique_mitigation_temp.json") as f:
    technique_mitigation = json.load(f)

with open("technique_detection_temp.json") as f:
    technique_detection = json.load(f)

with open("capec_detection_temp.json") as f:
    ap_detection = json.load(f)

with open("cwe_detection_temp.json") as f:
    w_detection = json.load(f)

In [4]:
# Tokenizer loaded from the "bert-base-uncased" checkpoint; shared by the
# tokenization function below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Reads the module-level ``tokenizer`` and returns whatever it produces
    for the given text.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


device = "cpu"

# Fixed seed for the train/test split so the held-out examples are the same
# on every run and eval_loss values are comparable between runs.
SPLIT_SEED = 42

# Collect every distinct text snippet; using a set drops exact duplicates.
data = set()

# Names taken from the four dictionary files.
for w in w_dict:
    data.add(w_dict[w]["name"])
for ap in ap_dict:
    data.add(ap_dict[ap]["name"])
for tech in technique_dict:
    data.add(technique_dict[tech]["name"])
for tac in tactic_dict:
    data.add(tactic_dict[tac]["name"])

# Mitigation / detection text. The field holding the text differs per source:
# metadata["Description"], metadata itself, or "name".
for mitigation in w_mitigation:
    data.add(mitigation["metadata"]["Description"])
for detection in w_detection:
    data.add(detection["metadata"]["Description"])
for mitigation in ap_mitigation:
    data.add(mitigation["metadata"])
for detection in ap_detection:
    data.add(detection["metadata"])
for mitigation in technique_mitigation:
    data.add(mitigation["name"])
for detection in technique_detection:
    data.add(detection["name"])

# Sort before building the dataset: iteration order of a set of strings can
# change between interpreter runs (hash randomization), which would reorder
# the rows and therefore change which examples land in each split.
dataset = Dataset.from_dict({"text": sorted(data)})
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Tokenize every split in batches; the raw "text" column is removed from the
# mapped result.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Masked-language-model weights loaded from the "bert-base-uncased" checkpoint.
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# Language-modeling collator built around the same tokenizer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

# Evaluation / checkpointing options: both happen once per epoch, checkpoint
# retention is capped via save_total_limit, and the checkpoint with the best
# "eval_loss" is reloaded when training ends.
_schedule_options = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Batch sizes and run length.
_run_size_options = dict(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=50.0,
)

# "bert_base" is passed positionally, exactly as before.
training_args = TrainingArguments(
    "bert_base",
    **_schedule_options,
    **_run_size_options,
)

# Wire the model, arguments, tokenized splits, and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

trainer.train()
trainer.save_model()
# The Trainer above is not given the tokenizer, so save_model() does not write
# tokenizer files. Save them into the same output directory so the saved
# model can be reloaded together with its tokenizer from one path.
tokenizer.save_pretrained(training_args.output_dir)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running training *****
  Num examples = 3809
  Num Epochs = 50
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 95250


Epoch,Training Loss,Validation Loss
1,3.0378,3.07146
2,2.8187,2.728717
3,2.5232,2.731335
4,2.4297,2.810951
5,2.3668,2.610152
6,2.2371,2.765339
7,1.9951,2.579362
8,2.0046,2.766116
9,1.9,2.929121
10,1.7477,2.64627


***** Running Evaluation *****
  Num examples = 424
  Batch size = 2
Saving model checkpoint to bert_base/checkpoint-1905
Configuration saved in bert_base/checkpoint-1905/config.json
Model weights saved in bert_base/checkpoint-1905/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 424
  Batch size = 2
Saving model checkpoint to bert_base/checkpoint-3810
Configuration saved in bert_base/checkpoint-3810/config.json
Model weights saved in bert_base/checkpoint-3810/pytorch_model.bin
Deleting older checkpoint [bert_base/checkpoint-1905] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 424
  Batch size = 2
Saving model checkpoint to bert_base/checkpoint-5715
Configuration saved in bert_base/checkpoint-5715/config.json
Model weights saved in bert_base/checkpoint-5715/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 424
  Batch size = 2
Saving model checkpoint to bert_base/checkpoint-7620
Configuration saved in bert_base/checkpoint-7