In [3]:
from transformers import Trainer,get_linear_schedule_with_warmup,RobertaTokenizer,BertForSequenceClassification,BitsAndBytesConfig
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
import psutil
from sklearn.model_selection import train_test_split
from peft import prepare_model_for_kbit_training
from peft import get_peft_model
import time
from collections import defaultdict

In [4]:
class SecurityBERT(nn.Module):
  """Classification head on top of a (possibly PEFT-wrapped) BERT backbone.

  The backbone's sentence representation is passed through dropout and a
  linear layer that emits one logit per class.

  Args:
    myTunedBERT: backbone called as ``backbone(input_ids=..., attention_mask=...)``.
      It must expose ``config.hidden_size`` and ``gradient_checkpointing_enable``.
    n_classes: number of output classes (size of the last logits dimension).
  """

  def __init__(self,myTunedBERT,n_classes):
    # Zero-argument super() keeps working when this notebook cell is re-run
    # and the class object is rebound.
    super().__init__()
    self.bert = myTunedBERT
    self.dropout = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.bert.config.hidden_size,n_classes)
    # Re-exported from the backbone; presumably so Hugging Face style tooling
    # that expects these attributes on the model can find them.
    self.config = self.bert.config
    self.gradient_checkpointing_enable = self.bert.gradient_checkpointing_enable
    # Set on first use when the backbone's output carries no `pooler_output`.
    self._use_cls_fallback = False

  def forward(self,input_ids,attention_mask):
    """Return class logits for a batch of token ids and attention masks.

    Backbones that return ``pooler_output`` are used exactly as before.
    Task-head backbones (e.g. ``BertForSequenceClassification``) do not
    return it; for those the final-layer hidden state of the first token is
    used instead, requested via ``output_hidden_states=True``.

    Raises:
      AttributeError: if the backbone output offers neither ``pooler_output``
        nor ``hidden_states``.
    """
    extra = {"output_hidden_states": True} if self._use_cls_fallback else {}
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        **extra
    )

    pooled_output = getattr(outputs,"pooler_output",None)
    if pooled_output is None:
      if not self._use_cls_fallback:
        # Remember the decision so only the very first batch pays for a
        # second backbone pass.
        self._use_cls_fallback = True
        return self.forward(input_ids,attention_mask)
      hidden_states = getattr(outputs,"hidden_states",None)
      if hidden_states is None:
        raise AttributeError(
            "backbone output has neither `pooler_output` nor `hidden_states`"
        )
      pooled_output = hidden_states[-1][:,0]

    output = self.dropout(pooled_output)

    return self.out(output)
  
def print_trainable_parameters(model):
    """Print trainable and total parameter counts, plus their ratio in percent.

    For a ``SecurityBERT`` wrapper the trainable count looks only at the
    wrapped backbone (``model.bert``); the total always covers every
    parameter of ``model``.
    """
    counted_module = model.bert if isinstance(model,SecurityBERT) else model

    trainable = 0
    for param in counted_module.parameters():
        if param.requires_grad:
            trainable += param.numel()

    total = 0
    for param in model.parameters():
        total += param.numel()

    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Percentage of trainable params: {100 * trainable / total:.2f}%")

## Set up model and tokenizer

In [5]:
# Pre-trained checkpoint used as the backbone.
model_name = "gaunernst/bert-tiny-uncased"
# NOTE(review): this loads the sequence-classification variant with a 2-label head, while the
# notebook later wraps `model` in SecurityBERT, which adds its own 15-class `out` layer and reads
# `pooler_output` from the backbone output — confirm this model class is the intended backbone.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
print(model.config)
# Tokenizer files are read from the local ./tokenizer directory.
# NOTE(review): a RoBERTa tokenizer class is paired with a BERT checkpoint — confirm this is intended.
tokenizer = RobertaTokenizer.from_pretrained('./tokenizer')
print(f"Tokenizer length: {len(tokenizer.get_vocab())}")
model.resize_token_embeddings(len(tokenizer)) # Resize the vocabulary without invalidating pre-trained weights
# Parameter counts before any adapter is attached.
print_trainable_parameters(model)
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(f"Number of available GPUs: {torch.cuda.device_count()}")
device

config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at gaunernst/bert-tiny-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Tokenizer length: 30522
Trainable parameters: 4,386,178
Total parameters: 4,386,178
Percentage of trainable params: 100.00%
Number of available GPUs: 1


device(type='cuda')

In [6]:
def memory_check():
  """Return the system memory currently in use, in GiB, rounded to 3 decimals."""
  used_bytes = psutil.virtual_memory().used
  bytes_per_gib = 1024**3
  return round(used_bytes/bytes_per_gib,3)

## PEFT Configuration

In [7]:
from peft import get_peft_model, VeraConfig, TaskType

# Define VeRA configuration
# Adapters are attached only to the modules named `query` and `value`.
config = VeraConfig(
    r=8,  # Low-rank decomposition size
    target_modules=["query", "value"],  # Specify target modules
    vera_dropout=0.2,  # dropout probability for the VeRA layers
    bias="none"  # NOTE(review): presumably leaves bias terms untouched — confirm in the PEFT docs
)

# Apply VeRA adapter
#model = model.prepare_model_for_kbit_training()
# `model` is rebound to the PEFT-wrapped model; the output recorded below reports
# 544 trainable parameters afterwards.
model = get_peft_model(model, config)
print_trainable_parameters(model)

Trainable parameters: 544
Total parameters: 4,386,722
Percentage of trainable params: 0.01%


## Dataset

In [8]:
from datasets import load_dataset
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load the pre-encoded dataset from a local pickle file.
# NOTE: unpickling can execute arbitrary code — only load this file if it comes from a trusted source.
data = pd.read_pickle('./saved_data/encoded_data.pck')
# Encode the attack-type names as integer class ids in a new `target` column.
le = LabelEncoder()
data['target'] = le.fit_transform(data['Attack_type'])

In [9]:
# Peek at three randomly chosen rows (no random_state is given, so the rows differ between runs).
data.sample(3)

Unnamed: 0,encoded_PPFLE,Attack_type,Attack_label,target
80984,1091d7634bb290943eddba5b408ce4d81fcda2a2 0bc16...,Vulnerability_scanner,1,13
50499,a0801f4dbf48dae5beb48d0458f4dc7e2d8fb7d5 0bc16...,DDoS_TCP,1,3
72969,8582326c3f4225dfc21cab8323295eb6c8c82646 6df8d...,Port_Scanning,1,9


In [10]:
# Show one full encoded_PPFLE string.
# NOTE(review): `[0]` is presumably a label lookup (the frame's index looks integer-valued) — confirm.
data['encoded_PPFLE'][0]

'689e1d00f39f485bd2da5098239199df85c39b4d 2ab932d3aadb7887eda7302e0b33df13dc5ca645 f2c86566785eaad29f5c4b244d058b0cb7deb97e 5ea155b76662c1b381a0526e87d733a2ae68bfce b3625a98258ceacc69889389baca3b048a4923e4 8d419a4b06c3bd95d00fe2ee5cf1146851cf9893 c71f6d1e2daf2f598381088226df35ae68552f0b b7036347f54fa5aef0dd3f66b4e44c56bbf54cf4 f97d947902327761a4b1472d76b71bf4406e4e18 79ae7b95cace51fb0079d9c5ef1641f4639ce3ed ea25a11eb9c83b7d70252710098ded6bc81eb62b ceae604760aaa4fd8b60791ae1839b754c2dd9f1 23197a0ba8a217f2bc8711ec3c4be39dc19236d7 e2df55df19acd4423bafccbb0b8a385ec5e914eb f086df09fafdda401160bc766631e3d5785a6257 5329d5d159837e103352035068b3e14ce570d7eb 3eb6e4b9dd1c085a3ad0b18484d48482c935a4f7 943315f7b6cb0fc45604f5cf2ac307894e59b990 89b029338e8dd73f49369cd01c0052663c8c949e f91e2919e7494d796d1148066124d4b4939c86fa a5ce077c8900f85e073032ae975d95491645ddce effcfb2aa80217f507218ea88726c9325a0d68b9 bf12d74b6502b4fc1e29df164d3bf6f6ce51ffe3 7d69c36dbc4a37fc6643a32e914ea4e7ee0f0fc2 128f4c3fdc8539e

In [11]:
# Distinct encoded class ids, in ascending order.
sorted(data['target'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [12]:
# System memory in use (GiB, as reported by memory_check) at this point in the notebook.
memory_check()

20.934

In [13]:
# Dump every encoded sequence to a plain-text file, one record per line.
with open("encoded_data.txt","w") as corpus_file:
  corpus_file.writelines(str(value)+'\n' for value in tqdm(data['encoded_PPFLE']))

  0%|          | 0/157800 [00:00<?, ?it/s]

In [14]:
# Hand-rolled 70/15/15 random split.
# Rows are removed from the pool by index label, so each index may only be
# reset AFTER it has been used for the drop. Resetting first (as this cell
# used to do) drops the first N rows instead of the sampled ones and makes
# the splits overlap.
# Assumes the index labels of `data` are unique — TODO confirm.
train_part = data.sample(frac=0.7,random_state=42)

remaining = data.drop(train_part.index)

test_part = remaining.sample(frac=0.5,random_state=42)

val_part = remaining.drop(test_part.index)

# Only now switch every frame to a clean 0..n-1 index.
train_set = train_part.reset_index(drop=True)
test_set = test_part.reset_index(drop=True)
val_set = val_part.reset_index(drop=True)
remaining = remaining.reset_index(drop=True)

print(train_set.shape,val_set.shape,test_set.shape)

(110460, 4) (23670, 4) (23670, 4)


In [15]:
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Step 1: hold out the test share, stratified on the last column of the frame.
train_set, test_set = train_test_split(
    data,
    test_size=test_ratio,
    stratify=data.iloc[:,-1],
    random_state=42,
)

# Step 2: split what is left into train and validation. The validation share
# is expressed relative to the remaining (train + validation) rows.
holdout_fraction = val_ratio/(val_ratio+train_ratio)
train_set, val_set = train_test_split(
    train_set,
    test_size=holdout_fraction,
    stratify=train_set.iloc[:,-1],
    random_state=42,
)

In [16]:
# Shapes of the train, validation and test splits.
train_set.shape,val_set.shape,test_set.shape

((110460, 4), (23670, 4), (23670, 4))

In [17]:
# Class names of the classification task. Their count sets the size of the
# classifier's output layer; the order matches the encoded `target` ids shown
# in the sample rows above (e.g. 'DDoS_TCP' -> 3, 'Uploading' -> 12).
TARGET_LIST = [
    'Backdoor',
    'DDoS_HTTP',
    'DDoS_ICMP',
    'DDoS_TCP',
    'DDoS_UDP',
    'Fingerprinting',
    'MITM',
    'Normal',
    'Password',
    'Port_Scanning',
    'Ransomware',
    'SQL_injection',
    'Uploading',
    'Vulnerability_scanner',
    'XSS',
]

In [18]:
# First five rows labelled 'Uploading', to inspect a single class.
data[data['Attack_type']=='Uploading'].head(5)

Unnamed: 0,encoded_PPFLE,Attack_type,Attack_label,target
13140,05293357ba325a590027981a9b59eb748a968bb8 0bc16...,Uploading,1,12
13141,da0992898db14d190c3b6f7c2c6c658259a2b3a4 0bc16...,Uploading,1,12
13142,4d5b89f0a6797f41a5d0ee5b7b1cfe1327a291f4 0bc16...,Uploading,1,12
13143,732fcefab7fb269fd8ae7bd4d3b3e28ec3310e1f 6df8d...,Uploading,1,12
13144,9a7d459b7d8a0e5c070cb85c3b1bd0824a7b010f 0bc16...,Uploading,1,12


In [19]:
class CustomDataset(Dataset):
  """Torch dataset that tokenizes one encoded sequence per item.

  Args:
    df: frame with an ``encoded_PPFLE`` column (sequence text) and a
      ``target`` column (class id).
    tokenizer: object exposing ``encode_plus``.
    max_len: length every sequence is padded / truncated to.
  """

  def __init__(self,df,tokenizer,max_len):
    self.df = df
    self.tokenizer = tokenizer
    self.max_len = max_len
    # Plain lists, so items are looked up by position rather than by the
    # frame's index labels.
    self.sequence = df['encoded_PPFLE'].to_list()
    self.targets = df['target'].to_list()

  def __len__(self):
    return len(self.df)

  def __getitem__(self,idx):
    """Return ``input_ids``, ``attention_mask`` and ``targets`` for item ``idx``."""
    text = str(self.sequence[idx])
    label = self.targets[idx]

    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=self.max_len,
        padding='max_length',
        return_token_type_ids=False,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(text,**tokenizer_options)

    # The tokenizer output is flattened to 1-D so that the DataLoader can
    # stack items into (batch, max_len).
    item = {key: encoded[key].flatten() for key in ('input_ids','attention_mask')}
    item['targets'] = torch.tensor(label,dtype=torch.long)
    return item

In [20]:
MAX_LEN=512
BATCH_SIZE=32

# One dataset per split, all sharing the tokenizer and sequence length.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_frame,tokenizer=tokenizer,max_len=MAX_LEN)
    for split_frame in (train_set,val_set,test_set)
)

# Settings common to the three loaders; only the training loader shuffles.
loader_options = dict(batch_size=BATCH_SIZE,num_workers=0)

train_loader = DataLoader(train_dataset,shuffle=True,**loader_options)
val_loader = DataLoader(val_dataset,shuffle=False,**loader_options)
test_loader = DataLoader(test_dataset,shuffle=False,**loader_options)

In [21]:
# Pull one batch from the training loader as a shape sanity check.
# NOTE: despite its name, `test_data` holds a training batch, not test-set data.
test_data = next(iter(train_loader))

# With padding='max_length' every row has MAX_LEN tokens, so this prints (BATCH_SIZE, MAX_LEN).
print(test_data['input_ids'].shape)

torch.Size([32, 512])


In [22]:
def load_chkpt(model,version):
  """Load the saved weights for ``version`` into ``model`` (mapped to the CPU).

  Reads ``./saved_model/securityBert{version}.0.pt`` and returns whatever
  ``model.load_state_dict`` returns.
  """
  checkpoint_path = f"./saved_model/securityBert{version}.0.pt"
  state_dict = torch.load(checkpoint_path,map_location=torch.device('cpu'))
  return model.load_state_dict(state_dict)

In [None]:
import numpy as np

def train_model_peft(trainer,data_loader,loss_fn,optimizer,device,scheduler,n_examples):
  """Run one training epoch over ``data_loader``.

  Args:
    trainer: object whose ``model`` attribute is the network to train; it is
      called as ``trainer.model(input_ids, attention_mask)`` and must return
      per-class logits.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``targets`` tensors.
    loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar loss.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: number of examples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)`` where accuracy is correct predictions divided
    by ``n_examples`` and mean_loss is the average of the per-batch losses.
  """
  # Switch the wrapped network to training mode. Calling `trainer.train()`
  # here would instead start the Trainer's own complete training run.
  trainer.model.train()

  losses = []
  correct_predictions = 0

  for d in data_loader:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    targets = d['targets'].to(device)

    outputs = trainer.model(input_ids,attention_mask)
    _,preds = torch.max(outputs,dim=1)
    loss = loss_fn(outputs,targets)

    correct_predictions+=torch.sum(preds==targets).cpu()

    losses.append(loss.item())

    loss.backward()
    # Clip the global gradient norm before the update.
    nn.utils.clip_grad_norm_(trainer.model.parameters(),max_norm=1.0)

    optimizer.step()
    scheduler.step()

    optimizer.zero_grad()

  return correct_predictions/n_examples,np.mean(losses)

In [None]:
def evaluation_model_peft(trainer,data_loader,loss_fn,device,n_examples):
  """Evaluate ``trainer.model`` on ``data_loader`` without updating it.

  Args:
    trainer: object whose ``model`` attribute is the network to evaluate; it
      is called as ``trainer.model(input_ids, attention_mask)`` and must
      return per-class logits.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``targets`` tensors.
    loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar loss.
    device: device the batch tensors are moved to.
    n_examples: number of examples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)`` where accuracy is correct predictions divided
    by ``n_examples`` and mean_loss is the average of the per-batch losses.
  """
  # Evaluation mode turns dropout off. `trainer.train()` must not be called
  # here: it would start the Trainer's own training run.
  trainer.model.eval()

  losses = []
  correct_predictions = 0

  # No gradients are needed for evaluation.
  with torch.no_grad():
    for d in data_loader:
      input_ids = d['input_ids'].to(device)
      attention_mask = d['attention_mask'].to(device)
      targets = d['targets'].to(device)

      outputs = trainer.model(input_ids,attention_mask)
      _,preds = torch.max(outputs,dim=1)

      loss = loss_fn(outputs,targets)

      correct_predictions+=torch.sum(preds==targets).cpu()

      losses.append(loss.item())

  return correct_predictions/n_examples,np.mean(losses)

In [25]:
# NOTE(review): SecurityBERT is also defined in an earlier cell of this notebook;
# this later definition is the one in effect from here on. Keep a single copy.
class SecurityBERT(nn.Module):
  """Classification head on top of a (possibly PEFT-wrapped) BERT backbone.

  The backbone's sentence representation is passed through dropout and a
  linear layer that emits one logit per class.

  Args:
    myTunedBERT: backbone called as ``backbone(input_ids=..., attention_mask=...)``.
      It must expose ``config.hidden_size`` and ``gradient_checkpointing_enable``.
    n_classes: number of output classes (size of the last logits dimension).
  """

  def __init__(self,myTunedBERT,n_classes):
    # Zero-argument super() keeps working when this notebook cell is re-run
    # and the class object is rebound.
    super().__init__()
    self.bert = myTunedBERT
    self.dropout = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.bert.config.hidden_size,n_classes)
    # Re-exported from the backbone; presumably so Hugging Face style tooling
    # that expects these attributes on the model can find them.
    self.config = self.bert.config
    self.gradient_checkpointing_enable = self.bert.gradient_checkpointing_enable
    # Set on first use when the backbone's output carries no `pooler_output`.
    self._use_cls_fallback = False

  def forward(self,input_ids,attention_mask):
    """Return class logits for a batch of token ids and attention masks.

    Backbones that return ``pooler_output`` are used exactly as before.
    Task-head backbones (e.g. ``BertForSequenceClassification``) do not
    return it; for those the final-layer hidden state of the first token is
    used instead, requested via ``output_hidden_states=True``.

    Raises:
      AttributeError: if the backbone output offers neither ``pooler_output``
        nor ``hidden_states``.
    """
    extra = {"output_hidden_states": True} if self._use_cls_fallback else {}
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        **extra
    )

    pooled_output = getattr(outputs,"pooler_output",None)
    if pooled_output is None:
      if not self._use_cls_fallback:
        # Remember the decision so only the very first batch pays for a
        # second backbone pass.
        self._use_cls_fallback = True
        return self.forward(input_ids,attention_mask)
      hidden_states = getattr(outputs,"hidden_states",None)
      if hidden_states is None:
        raise AttributeError(
            "backbone output has neither `pooler_output` nor `hidden_states`"
        )
      pooled_output = hidden_states[-1][:,0]

    output = self.dropout(pooled_output)

    return self.out(output)

In [None]:
# Wrap the VeRA-adapted backbone with a classification head sized to the
# number of attack classes, and move it to the selected device.
securityBertTinyVera = SecurityBERT(myTunedBERT=model,n_classes=len(TARGET_LIST)).to(device)
EPOCHS=3
optimizer_vera = torch.optim.AdamW(securityBertTinyVera.parameters(),lr=1e-5)
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_loader)*EPOCHS

# Linear decay of the learning rate to zero, without warm-up.
scheduler_vera = get_linear_schedule_with_warmup(
    optimizer_vera,
    num_warmup_steps= 0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

# The GLUE/MRPC loading and tokenisation that used to follow was removed on
# purpose: MRPC is an unrelated two-class sentence-pair dataset, and the code
# rebound `train_dataset` / `val_dataset`, replacing the CustomDataset objects
# built from this notebook's own data with it.

In [27]:
from transformers import TrainingArguments

batch_size = 32
gradient_accumulation_steps = 4

# One label is shared by the run name and the output directory.
model_version = "securityBert_TinyVeRA_"
model_dir = model_version

training_args = TrainingArguments(
    # where results and logs go
    output_dir=model_dir,
    run_name=model_version,
    logging_dir=model_dir + "/logs",
    report_to="none",  # no external experiment tracker
    # optimisation
    learning_rate=2e-5,
    lr_scheduler_type="constant",
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # evaluate once per epoch
    eval_strategy="epoch",
    # speed / memory settings
    fp16=True,  # mixed precision training
    dataloader_num_workers=4,  # adjust to the CPU at hand
    gradient_checkpointing=True,  # trade compute for memory
)

In [None]:
# Potentially remove

#securityBertTinyVera.bert.gradient_checkpointing_enable()
#securityBertTinyVera = prepare_model_for_kbit_training(securityBertTinyVera)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix
# Parameter counts after applying the VeRA adapter (configured with VeraConfig above).
print_trainable_parameters(model)

# Metrics for our use case (macro-F1 is essential; log-loss is optional and not computed).
def compute_metrics(p):
    """Compute accuracy and macro-F1 for one evaluation pass.

    Args:
        p: evaluation result exposing ``predictions`` (per-class scores, or a
            tuple whose first element holds them) and ``label_ids`` (the true
            class ids).

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    # Score against the true labels of the evaluated examples. TARGET_LIST
    # only holds the class names and must not be used as the label vector.
    predictions, labels = p.predictions, p.label_ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    macro_f1 = f1_score(labels, predictions, average='macro')

    return {"accuracy": accuracy, "macro_f1": macro_f1}

# configure Trainer
# The Trainer wraps the SecurityBERT classifier; the notebook's own train/eval helpers
# reach the network through `trainer.model`.
# NOTE(review): newer transformers releases reportedly prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer_vera = Trainer(
    model=securityBertTinyVera,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Trainable parameters: 544
Total parameters: 4,386,722
Percentage of trainable params: 0.01%


from transformers import Trainer, TrainingArguments
from collections import defaultdict

# NOTE(review): this is a second Trainer, built around the bare PEFT backbone `model`
# (without the SecurityBERT head) and without compute_metrics. It is distinct from
# `trainer_vera` — confirm whether it is still needed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [30]:
import json

In [None]:
%%time
import os

history_tiny_vera = defaultdict(list)
best_accuracy_tiny_vera=0

# Checkpoints and the history file are written here; create the folder up front.
os.makedirs("./saved_model", exist_ok=True)

# Record start time
start_time = time.time()

for epoch in tqdm(range(EPOCHS)):
  print(f"Epoch {epoch+1}/{EPOCHS}")
  train_acc_tiny_vera,train_loss_tiny_vera = train_model_peft(trainer_vera,train_loader,loss_fn,optimizer_vera,device,scheduler_vera,len(train_set))
  val_acc_tiny_vera,val_loss_tiny_vera = evaluation_model_peft(trainer_vera,val_loader,loss_fn,device,len(val_set))

  # Plain Python floats: 0-dim tensors cannot be written by json.dump below.
  train_acc_tiny_vera = float(train_acc_tiny_vera)
  train_loss_tiny_vera = float(train_loss_tiny_vera)
  val_acc_tiny_vera = float(val_acc_tiny_vera)
  val_loss_tiny_vera = float(val_loss_tiny_vera)

  history_tiny_vera['train_acc'].append(train_acc_tiny_vera)
  history_tiny_vera['train_loss'].append(train_loss_tiny_vera)
  history_tiny_vera['val_acc'].append(val_acc_tiny_vera)
  history_tiny_vera['val_loss'].append(val_loss_tiny_vera)
  print(f"Train Loss {train_loss_tiny_vera} | Validation Loss {val_loss_tiny_vera} | Training Accuracy {train_acc_tiny_vera} | Validation Accuracy {val_acc_tiny_vera}")

  if val_acc_tiny_vera>best_accuracy_tiny_vera:
    # Save the weights of the network that was actually trained (the one held
    # by trainer_vera) as a single state-dict file, the format load_chkpt reads.
    torch.save(trainer_vera.model.state_dict(),f"./saved_model/{model_version}{epoch+1}.0.pt")
    best_accuracy_tiny_vera = val_acc_tiny_vera

# Record end time
end_time = time.time()

# Calculate training time
history_tiny_vera['training_time'].append(end_time - start_time)

# Convert to regular dict and save as JSON
with open("./saved_model/history_tiny_vera.txt", "w") as f:
    json.dump(dict(history_tiny_vera), f)

1


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/3


Stopped the attempt to fine-tune the pre-trained BERT-mini-uncased model with QDoRA after 987 minutes; it did not get past the first epoch.

In [None]:
# Load JSON and convert back to defaultdict
with open("./saved_model/history_tiny_vera.txt", "r") as f:
    history_tiny_vera_json = json.load(f)