In [71]:
import os
import pickle
import numpy as np
from tqdm import tqdm
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset

import torch
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from multiprocessing import Pool, cpu_count
from transformers import BertTokenizer, BertForSequenceClassification

os.environ['CUDA_VISIBLE_DEVICES'] = "1"

RELATION_INPUT_PATH = './data/3_openKE/synthesize/relation2id.txt'
RELATION_INPUT_LABEL_PATH = './data/3_openKE/synthesize/relation2id_type.txt'

RELATION_OUTPUT_PATH = './data/4_embedding/synthesize/secureBERT/relation.npy'
MODEL_SAVE_PATH = './data/4_embedding/synthesize/model/secureBERT/relation/'
if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH)

In [72]:
X = list()
Y = list()
with open(RELATION_INPUT_LABEL_PATH, 'r') as f:
    for row in f.readlines()[1:]:
        cat, rid = row.split('\t')
        Y.append(cat)
        
with open(RELATION_INPUT_PATH, 'r') as f:
    for row in f.readlines()[1:]:
        value, rid = row.split('\t')
        X.append(value)

print(f'Sentence Cnt: {len(X)}' , X[:5])
# print(f'Label Cnt: {len(Y)}', Y[:5])
print(f'Label Cnt: {len(Y)}', Y)

Sentence Cnt: 23 ['RegDeleteKey', 'Process Create', 'ReadFile', 'TCP Accept', 'RegOpenKey']
Label Cnt: 23 ['registry', 'process', 'file', 'network', 'registry', 'file', 'network', 'file', 'file', 'registry', 'file', 'registry', 'file', 'network', 'file', 'file', 'unknown', 'process', 'network', 'file', 'file', 'file', 'network']


In [73]:
# Convert category labels to tensor
# label_dict = {'file':0, 'registry':1, 'network':2, 'process':3}
label_dict = {'file':0, 'registry':1, 'network':2, 'process':3, 'unknown':4}

labels = torch.tensor([label_dict[y] for y in Y])

In [74]:
# SecureBERT
tokenizer = RobertaTokenizer.from_pretrained("ehsanaghaei/SecureBERT")
encoded_inputs = tokenizer.batch_encode_plus(
    X,
    padding='max_length',
    truncation=True,
    max_length=32,
    return_tensors='pt'
)

input_ids = encoded_inputs['input_ids']
attention_mask = encoded_inputs['attention_mask']

In [75]:
print(input_ids.shape)
print(attention_mask.shape)

torch.Size([23, 32])
torch.Size([23, 32])


In [76]:
# Move data to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

In [77]:
batch_size = 8

In [78]:
# splitter_tmp = int(0.9 * len(input_ids))
splitter1 = int(0.8 * len(input_ids))
train_inputs, val_inputs= input_ids, input_ids[splitter1:]
train_masks, val_masks= attention_mask, attention_mask[splitter1:]
train_labels, val_labels = labels, labels[splitter1:]
print(train_inputs.shape, Counter(Y))
print(val_inputs.shape, Counter(Y[splitter1:]))

# Create data loaders
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

torch.Size([23, 32]) Counter({'file': 11, 'network': 5, 'registry': 4, 'process': 2, 'unknown': 1})
torch.Size([5, 32]) Counter({'file': 3, 'network': 2})


In [79]:
# num_labels is important -> remember to modify
model = BertForSequenceClassification.from_pretrained("ehsanaghaei/SecureBERT", num_labels=5, output_hidden_states=True)
# Move the model to GPU
model.to('cuda')

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at ehsanaghaei/SecureBERT were not used when initializing BertForSequenceClassification: ['roberta.embeddings.token_type_embeddings.weight', 'roberta.encoder.layer.8.attention.self.query.weight', 'roberta.encoder.layer.7.attention.self.key.bias', 'lm_head.layer_norm.bias', 'roberta.encoder.layer.6.attention.self.key.bias', 'roberta.encoder.layer.10.attention.self.query.weight', 'roberta.encoder.layer.6.intermediate.dense.bias', 'roberta.encoder.layer.8.attention.self.value.weight', 'roberta.encoder.layer.6.attention.self.value.weight', 'roberta.encoder.layer.5.intermediate.dense.weight', 'roberta.encoder.layer.11.attention.self.key.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.self.value.weight', 'lm_head.layer_norm.weight', 'roberta.encoder.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ehsanaghaei/SecureBERT and are newly initialized: ['encoder.layer.6.attention.self.key.bias', 'encoder.layer.8.attention.output.LayerNorm.weight', 'encoder.layer.9.output.dense.bias', 'encoder.layer.2.attention.self.value.weight', 'encoder.layer.3.attention.output.dense.bias', 'encoder.layer.5.attention.self.value.bias', 'embeddings.LayerNorm.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.5.attention.self.key.weight', 'embeddings.LayerNorm.weight', 'encoder.layer.1.attention.output.dense.weight', 'encoder.layer.6.attention.self.value.weight', 'encoder.layer.10.attention.self.query.bias', 'encoder.layer.9.attention.self.key.weight', 'encoder.layer.3.attention.self.value.weight', 'encoder.layer.8.output.LayerNorm.bias', 'encoder.layer.11.attention.output.LayerNorm.weight', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.7.output.dense.weight', 'encoder.layer.9.att

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, element

In [81]:
# Fine-tune the BERT model on my task: 
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 50
batch_chunk_size = 10000  # Number of batches to process before saving to disk

# Training loop
min_val_loss = 999999
for epoch in range(epochs):
    model.train()
    embeddings = []
#     print(f'Epoch {epoch+1} start!!!!!!!')
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
#         batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        
        # Extract the embeddings from the model's hidden states
        hidden_states = model.bert(input_ids, attention_mask=attention_mask)[2]
        last_hidden_layer_embeddings = hidden_states[-1].detach().cpu().numpy()
        embeddings.append(last_hidden_layer_embeddings)
    
    embeddings = np.concatenate(embeddings)
    
    # embeddings[sequence number, sequence length(26), hidden dimension(768)]
    embeddings = embeddings[:, 0, :]  # Take the embedding of the [CLS] token -> output = [26, 768]
    
    embeddings = np.array(embeddings)
    np.save(RELATION_OUTPUT_PATH, embeddings)
    print(f'{RELATION_OUTPUT_PATH}, shape={embeddings.shape}')
        
    with torch.no_grad():
        model.eval()
        
        val_loss = 0
        val_correct = 0
        val_total = 0

        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

            val_loss += loss.item()
            _, predicted_labels = torch.max(logits, dim=1)
            val_correct += (predicted_labels == labels).sum().item()
            val_total += labels.size(0)

    val_accuracy = val_correct / val_total
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch: {epoch + 1}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
    
    if avg_val_loss < min_val_loss:
        # Save the trained model
        model.save_pretrained(MODEL_SAVE_PATH)
        min_val_loss = avg_val_loss
        
#     print(f'Epoch {epoch+1} end!!!!!!!')

100%|██████████| 3/3 [00:00<00:00, 12.08it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 1, Val Loss: 0.4458, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 11.54it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 2, Val Loss: 0.5651, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 12.30it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 3, Val Loss: 0.3430, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 11.80it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 4, Val Loss: 0.4620, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 12.08it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 5, Val Loss: 0.3214, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.05it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 6, Val Loss: 0.3277, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 11.80it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 7, Val Loss: 0.2673, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.60it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 8, Val Loss: 0.2737, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 11.97it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 9, Val Loss: 0.2657, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 12.88it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 10, Val Loss: 0.1874, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.04it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 11, Val Loss: 0.2155, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 13.31it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 12, Val Loss: 0.1533, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.50it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 13, Val Loss: 0.1651, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.38it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 14, Val Loss: 0.1283, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.14it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 15, Val Loss: 0.1092, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.70it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 16, Val Loss: 0.0969, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.38it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 17, Val Loss: 0.1019, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.00it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 18, Val Loss: 0.0724, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.77it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 19, Val Loss: 0.2306, Val Accuracy: 0.8000


100%|██████████| 3/3 [00:00<00:00, 12.53it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 20, Val Loss: 0.0769, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.71it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 21, Val Loss: 0.1263, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.91it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 22, Val Loss: 0.0610, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.84it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 23, Val Loss: 0.0907, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.87it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 24, Val Loss: 0.0633, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 25, Val Loss: 0.0418, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.23it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 26, Val Loss: 0.0420, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.54it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 27, Val Loss: 0.0266, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.81it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 28, Val Loss: 0.0346, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.80it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 29, Val Loss: 0.0203, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.88it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 30, Val Loss: 0.0193, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 12.51it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 31, Val Loss: 0.0296, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.18it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 32, Val Loss: 0.0104, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00,  9.77it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 33, Val Loss: 0.0197, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00,  9.89it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 34, Val Loss: 0.0128, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.18it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 35, Val Loss: 0.0070, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.59it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 36, Val Loss: 0.0096, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.58it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 37, Val Loss: 0.0105, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.73it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 38, Val Loss: 0.0062, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.62it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 39, Val Loss: 0.0052, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.50it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 40, Val Loss: 0.0055, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.34it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 41, Val Loss: 0.0058, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.55it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 42, Val Loss: 0.0058, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 11.24it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 43, Val Loss: 0.0053, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.70it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 44, Val Loss: 0.0047, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.47it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 45, Val Loss: 0.0042, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.59it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 46, Val Loss: 0.0040, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.05it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 47, Val Loss: 0.0039, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.17it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 48, Val Loss: 0.0038, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.19it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 49, Val Loss: 0.0037, Val Accuracy: 1.0000


100%|██████████| 3/3 [00:00<00:00, 10.61it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(23, 768)
Epoch: 50, Val Loss: 0.0037, Val Accuracy: 1.0000


#### The complete code

In [None]:
import os
import pickle
import numpy as np
from tqdm import tqdm
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset

import torch
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from multiprocessing import Pool, cpu_count
from transformers import BertTokenizer, BertForSequenceClassification

os.environ['CUDA_VISIBLE_DEVICES'] = "1"

RELATION_INPUT_PATH = './data/3_openKE/synthesize/relation2id.txt'
RELATION_INPUT_LABEL_PATH = './data/3_openKE/synthesize/relation2id_type.txt'
RELATION_OUTPUT_PATH = './data/4_embedding/synthesize/secureBERT/relation.npy'
MODEL_SAVE_PATH = './data/4_embedding/synthesize/model/secureBERT/relation/'
if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH)
    

X = list()
Y = list()
with open(RELATION_INPUT_LABEL_PATH, 'r') as f:
    for row in f.readlines()[1:]:
        cat, rid = row.split('\t')
        Y.append(cat)
        
with open(RELATION_INPUT_PATH, 'r') as f:
    for row in f.readlines()[1:]:
        value, rid = row.split('\t')
        X.append(value)

print(f'Sentence Cnt: {len(X)}' , X[:5])
# print(f'Label Cnt: {len(Y)}', Y[:5])
print(f'Label Cnt: {len(Y)}', Y)

label_dict = {'file':0, 'registry':1, 'network':2, 'process':3, 'unknown':4}


labels = torch.tensor([label_dict[y] for y in Y])

# SecureBERT
tokenizer = RobertaTokenizer.from_pretrained("ehsanaghaei/SecureBERT")
encoded_inputs = tokenizer.batch_encode_plus(
    X,
    padding='max_length',
    truncation=True,
    max_length=32,
    return_tensors='pt'
)

input_ids = encoded_inputs['input_ids']
attention_mask = encoded_inputs['attention_mask']



print(input_ids.shape)
print(attention_mask.shape)

# Move data to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

batch_size = 8

# splitter_tmp = int(0.9 * len(input_ids))
splitter1 = int(0.8 * len(input_ids))
train_inputs, val_inputs= input_ids, input_ids[splitter1:]
train_masks, val_masks= attention_mask, attention_mask[splitter1:]
train_labels, val_labels = labels, labels[splitter1:]
print(train_inputs.shape, Counter(Y))
print(val_inputs.shape, Counter(Y[splitter1:]))

# Create data loaders
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# num_labels is important -> remember to modify
model = BertForSequenceClassification.from_pretrained("ehsanaghaei/SecureBERT", num_labels=5, output_hidden_states=True)
# Move the model to GPU
model.to('cuda')


# Fine-tune the BERT model on my task: 
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 50
batch_chunk_size = 10000  # Number of batches to process before saving to disk

# Training loop
min_val_loss = 999999
for epoch in range(epochs):
    model.train()
    embeddings = []
#     print(f'Epoch {epoch+1} start!!!!!!!')
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
#         batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        
        # Extract the embeddings from the model's hidden states
        hidden_states = model.bert(input_ids, attention_mask=attention_mask)[2]
        last_hidden_layer_embeddings = hidden_states[-1].detach().cpu().numpy()
        embeddings.append(last_hidden_layer_embeddings)
    
    embeddings = np.concatenate(embeddings)
    
    # embeddings[sequence number, sequence length(26), hidden dimension(768)]
    embeddings = embeddings[:, 0, :]  # Take the embedding of the [CLS] token -> output = [26, 768]
    
    embeddings = np.array(embeddings)
    np.save(RELATION_OUTPUT_PATH, embeddings)
    print(f'{RELATION_OUTPUT_PATH}, shape={embeddings.shape}')
        
    with torch.no_grad():
        model.eval()
        
        val_loss = 0
        val_correct = 0
        val_total = 0

        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

            val_loss += loss.item()
            _, predicted_labels = torch.max(logits, dim=1)
            val_correct += (predicted_labels == labels).sum().item()
            val_total += labels.size(0)

    val_accuracy = val_correct / val_total
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch: {epoch + 1}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
    
    if avg_val_loss < min_val_loss:
        # Save the trained model
        model.save_pretrained(MODEL_SAVE_PATH)
        min_val_loss = avg_val_loss
        
#     print(f'Epoch {epoch+1} end!!!!!!!')

#### the complete code

In [12]:
import os
import pickle
import numpy as np
from tqdm import tqdm
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset

import torch
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from multiprocessing import Pool, cpu_count
from transformers import BertTokenizer, BertForSequenceClassification

os.environ['CUDA_VISIBLE_DEVICES'] = "3"

RELATION_INPUT_PATH = './data/3_openKE/synthesize/relation2id.txt'
RELATION_INPUT_LABEL_PATH = './data/3_openKE/synthesize/relation2id_type.txt'
RELATION_OUTPUT_PATH = './data/4_embedding/synthesize/secureBERT/relation.npy'
MODEL_SAVE_PATH = './data/4_embedding/synthesize/model/secureBERT/relation/'
if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH)

    
X = list()
Y = list()
with open(RELATION_INPUT_LABEL_PATH, 'r') as f:
    for row in f.readlines()[1:]:
        cat, rid = row.split('\t')
        Y.append(cat)
with open(RELATION_INPUT_PATH, 'r') as f:
    for row in f.readlines()[1:]:
        value, rid = row.split('\t')
        X.append(value)

print(f'Sentence Cnt: {len(X)}' , X[:5])
print(f'Label Cnt: {len(Y)}', Y[:5])


# Convert category labels to tensor
label_dict = {'file':0, 'registry':1, 'network':2, 'process':3}
labels = torch.tensor([label_dict[y] for y in Y])


# SecureBERT
tokenizer = RobertaTokenizer.from_pretrained("ehsanaghaei/SecureBERT")
encoded_inputs = tokenizer.batch_encode_plus(
    X,
    padding='max_length',
    truncation=True,
    max_length=32,
    return_tensors='pt'
)

input_ids = encoded_inputs['input_ids']
attention_mask = encoded_inputs['attention_mask']


print(input_ids.shape)
print(attention_mask.shape)


# Move data to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)


batch_size = 8


# splitter_tmp = int(0.9 * len(input_ids))
splitter1 = int(0.8 * len(input_ids))
train_inputs, val_inputs= input_ids, input_ids[splitter1:]
train_masks, val_masks= attention_mask, attention_mask[splitter1:]
train_labels, val_labels = labels, labels[splitter1:]
print(train_inputs.shape, Counter(Y))
print(val_inputs.shape, Counter(Y[splitter1:]))

# Create data loaders
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)


model = BertForSequenceClassification.from_pretrained("ehsanaghaei/SecureBERT", num_labels=4, output_hidden_states=True)
# Move the model to GPU
model.to('cuda')

# Fine-tune the BERT model on my task:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 10
batch_chunk_size = 10000  # Number of batches to process before saving to disk

# Training loop
min_val_loss = 999999
for epoch in range(epochs):
    model.train()
    embeddings = []
#     print(f'Epoch {epoch+1} start!!!!!!!')
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        
        # Extract the embeddings from the model's hidden states
        hidden_states = model.bert(input_ids, attention_mask=attention_mask)[2]
        last_hidden_layer_embeddings = hidden_states[-1].detach().cpu().numpy()
        embeddings.append(last_hidden_layer_embeddings)
    
    embeddings = np.concatenate(embeddings)
    embeddings = embeddings[:, 0, :]  # Take the embedding of the [CLS] token
    embeddings = np.array(embeddings)
    np.save(RELATION_OUTPUT_PATH, embeddings)
    print(f'{RELATION_OUTPUT_PATH}, shape={embeddings.shape}')
        
    with torch.no_grad():
        model.eval()
        
        val_loss = 0
        val_correct = 0
        val_total = 0

        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

            val_loss += loss.item()
            _, predicted_labels = torch.max(logits, dim=1)
            val_correct += (predicted_labels == labels).sum().item()
            val_total += labels.size(0)

    val_accuracy = val_correct / val_total
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch: {epoch + 1}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
    
    if avg_val_loss < min_val_loss:
        # Save the trained model
        model.save_pretrained(MODEL_SAVE_PATH)
        min_val_loss = avg_val_loss
        
#     print(f'Epoch {epoch+1} end!!!!!!!')





Sentence Cnt: 26 ['QueryAllInformationFile', 'QueryNetworkOpenInformationFile', 'RegSetValue', 'CreateFile', 'SetDispositionInformationFile']
Label Cnt: 26 ['file', 'network', 'registry', 'file', 'file']
torch.Size([26, 32])
torch.Size([26, 32])
torch.Size([26, 32]) Counter({'file': 11, 'network': 7, 'registry': 7, 'process': 1})
torch.Size([6, 32]) Counter({'file': 3, 'network': 2, 'process': 1})


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at ehsanaghaei/SecureBERT were not used when initializing BertForSequenceClassification: ['roberta.encoder.layer.11.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.5.output.LayerNorm.bias', 'roberta.encoder.layer.11.attention.output.LayerNorm.weight', 'roberta.encoder.layer.5.attention.output.LayerNorm.bias', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.9.attention.self.key.bias', 'roberta.encoder.layer.9.attention.self.value.weight', 'roberta.encoder.layer.3.attention.self.key.bias', 'roberta.encoder.layer.3.attention.output.dense.weight', 'roberta.encoder.layer.8.attention.output.dense.weight', 'roberta.encoder.layer.3.output.dense.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.e

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ehsanaghaei/SecureBERT and are newly initialized: ['encoder.layer.9.output.dense.weight', 'encoder.layer.9.attention.self.query.weight', 'encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.2.attention.self.value.bias', 'pooler.dense.bias', 'encoder.layer.4.attention.self.query.bias', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.5.attention.output.dense.bias', 'encoder.layer.9.attention.self.value.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.5.attention.self.value.bias', 'encoder.layer.4.attention.self.value.weight', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.5.attention.self.value.weight', 'encoder.layer.2.attention.output.dense.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.7.attention.output.dense.bias', 'encoder.layer.11.output.dense.bias', 'encoder.layer.9.a

100%|██████████| 4/4 [00:00<00:00, 11.58it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 1, Val Loss: 1.2403, Val Accuracy: 0.8333


100%|██████████| 4/4 [00:00<00:00, 12.35it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 2, Val Loss: 1.0358, Val Accuracy: 0.6667


100%|██████████| 4/4 [00:00<00:00, 12.49it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 3, Val Loss: 0.8378, Val Accuracy: 0.8333


100%|██████████| 4/4 [00:00<00:00, 11.44it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 4, Val Loss: 0.5766, Val Accuracy: 0.8333


100%|██████████| 4/4 [00:00<00:00, 11.94it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 5, Val Loss: 0.3496, Val Accuracy: 0.8333


100%|██████████| 4/4 [00:00<00:00, 11.89it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 6, Val Loss: 0.1853, Val Accuracy: 1.0000


100%|██████████| 4/4 [00:00<00:00, 12.30it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 7, Val Loss: 0.1194, Val Accuracy: 1.0000


100%|██████████| 4/4 [00:00<00:00, 12.53it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 8, Val Loss: 0.0854, Val Accuracy: 1.0000


100%|██████████| 4/4 [00:00<00:00, 11.99it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 9, Val Loss: 0.0480, Val Accuracy: 1.0000


100%|██████████| 4/4 [00:00<00:00, 12.37it/s]


./data/4_embedding/synthesize/secureBERT/relation.npy, shape=(26, 768)
Epoch: 10, Val Loss: 0.0362, Val Accuracy: 1.0000
