https://medium.com/rahasak/fine-tune-llm-for-real-time-network-attach-detection-with-apple-mlx-b6c70f5c843a
Also used code from https://gitlab.com/rahasak-labs/mlxa/-/blob/master/data/prepare.py?ref_type=heads

In [1]:
from transformers import Trainer,get_linear_schedule_with_warmup,RobertaTokenizer,BertForSequenceClassification,BitsAndBytesConfig
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
import psutil
from sklearn.model_selection import train_test_split
from peft import prepare_model_for_kbit_training
from peft import get_peft_model
import time
from collections import defaultdict

In [2]:
class SecurityBERT(nn.Module):
  def __init__(self,myTunedBERT,n_classes):
    super(SecurityBERT,self).__init__()
    self.bert = myTunedBERT
    self.dropout = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.bert.config.hidden_size,n_classes)
    self.config = self.bert.config
    self.gradient_checkpointing_enable = self.bert.gradient_checkpointing_enable

  def forward(self,input_ids,attention_mask,token_type_ids=None,labels=None):
    pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        return_dict=True
    ).pooler_output

    output = pooled_output.last_hidden_state[:, 0, :]
    output = self.dropout(output)

    return self.out(output)
  
def print_trainable_parameters(model):
    if isinstance(model,SecurityBERT):
        trainable = sum(p.numel() for p in model.bert.parameters() if p.requires_grad)
    else:
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Percentage of trainable params: {100 * trainable / total:.2f}%")

## Set up model and tokenizer

In [3]:
model_name = "gaunernst/bert-tiny-uncased"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
print(model.config)
print_trainable_parameters(model)
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(f"Number of available GPUs: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU available.")
device

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at gaunernst/bert-tiny-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Trainable parameters: 4,386,178
Total parameters: 4,386,178
Percentage of trainable params: 100.00%
Number of available GPUs: 1
GPU Name: NVIDIA GeForce RTX 3080


device(type='cuda')

In [4]:
def memory_check():
  return round(psutil.virtual_memory().used/1024**3,3)

## PEFT Configuration

In [5]:
from peft import get_peft_model, VeraConfig, TaskType

# Define VeRA configuration
config = VeraConfig(
    r=8,  # Low-rank decomposition size
    target_modules=["query", "value"],  # Specify target modules
    vera_dropout=0.2,
    bias="none"
)

# Apply VeRA adapter
#model = model.prepare_model_for_kbit_training()
model = get_peft_model(model, config)
print_trainable_parameters(model)

Trainable parameters: 544
Total parameters: 4,386,722
Percentage of trainable params: 0.01%


## Dataset

In [6]:
from datasets import load_dataset
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

data = pd.read_pickle('./saved_data/encoded_data.pck')
le = LabelEncoder()
data['target'] = le.fit_transform(data['Attack_type'])

In [7]:
data.sample(15)

Unnamed: 0,encoded_PPFLE,Attack_type,Attack_label,target
19455,002d2b854e822b14a12d5748f884d26516b85a8d 0bc16...,Uploading,1,12
50405,93541bde5bc09c562e19d577ab3146c0f1956135 a9183...,DDoS_TCP,1,3
70956,696d286c6af9e7a938c62b922c37fa35dad90b5a e8d5c...,Port_Scanning,1,9
120219,9857f0b348da8ad969462c44942d888a2c76d396 0bc16...,Normal,0,7
94423,d484978978cb10a79b018792df9f798df384ddf0 0bc16...,Backdoor,1,0
90226,918f3468eb2e4c373133963eb914f7c312c68984 0bc16...,Backdoor,1,0
136853,4ed8c519f7a3e1c0c1cac2d67ec601835bf84ca1 5f363...,DDoS_UDP,1,4
34485,e70b2a9bec51fc94af8bfeffb1bb233b37dcb03c 6df8d...,DDoS_HTTP,1,1
4585,1337f298a85ff9993a639c3a133ba5da8c0e4a2e 0bc16...,Ransomware,1,10
96105,924313e8b7c056ff4114f456abda1df03f49971f 0bc16...,XSS,1,14


In [8]:
data['encoded_PPFLE'][0]

'689e1d00f39f485bd2da5098239199df85c39b4d 2ab932d3aadb7887eda7302e0b33df13dc5ca645 f2c86566785eaad29f5c4b244d058b0cb7deb97e 5ea155b76662c1b381a0526e87d733a2ae68bfce b3625a98258ceacc69889389baca3b048a4923e4 8d419a4b06c3bd95d00fe2ee5cf1146851cf9893 c71f6d1e2daf2f598381088226df35ae68552f0b b7036347f54fa5aef0dd3f66b4e44c56bbf54cf4 f97d947902327761a4b1472d76b71bf4406e4e18 79ae7b95cace51fb0079d9c5ef1641f4639ce3ed ea25a11eb9c83b7d70252710098ded6bc81eb62b ceae604760aaa4fd8b60791ae1839b754c2dd9f1 23197a0ba8a217f2bc8711ec3c4be39dc19236d7 e2df55df19acd4423bafccbb0b8a385ec5e914eb f086df09fafdda401160bc766631e3d5785a6257 5329d5d159837e103352035068b3e14ce570d7eb 3eb6e4b9dd1c085a3ad0b18484d48482c935a4f7 943315f7b6cb0fc45604f5cf2ac307894e59b990 89b029338e8dd73f49369cd01c0052663c8c949e f91e2919e7494d796d1148066124d4b4939c86fa a5ce077c8900f85e073032ae975d95491645ddce effcfb2aa80217f507218ea88726c9325a0d68b9 bf12d74b6502b4fc1e29df164d3bf6f6ce51ffe3 7d69c36dbc4a37fc6643a32e914ea4e7ee0f0fc2 128f4c3fdc8539e

In [9]:
sorted(data['target'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [10]:
memory_check()

18.144

In [11]:
with open("encoded_data.txt","w") as f:
  for value in tqdm(data['encoded_PPFLE']):
    f.write(str(value)+'\n')

  0%|          | 0/157800 [00:00<?, ?it/s]

In [12]:
with open("prompt_encoded_data.txt","w") as f:
    for _, row in data.iterrows():
        f.write(f"You are an expert in network traffic classification. Based on the provided network traffic attributes, you must determine whether the traffic is " +
                f"'Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP', 'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware', 'SQL_injection', 'Uploading', " + 
                f"'Vulnerability_scanner', or 'XSS'. Here are the encoded attributes, 'encoded_PPFLE: {row['encoded_PPFLE']}'. " + '\n')
#'<s>','<pad>','</s>','<unk>','<mask>'

In [13]:
with open("prompt_encoded_data.txt", "r") as f:
    prompt_lines = f.readlines()

# Strip newline characters
prompt_lines = [prompt_line.strip() for prompt_line in prompt_lines]
prompt_data = {
    "encoded_PPFLE_prompt": prompt_lines, 'Attack_type': data['Attack_type'], 'Attack_label': data['Attack_label'], 'target': data['target']
}
prompt_df = pd.DataFrame(prompt_data)
prompt_df.sample(15)

Unnamed: 0,encoded_PPFLE_prompt,Attack_type,Attack_label,target
133191,You are an expert in network traffic classific...,DDoS_UDP,1,4
53735,You are an expert in network traffic classific...,DDoS_TCP,1,3
156917,You are an expert in network traffic classific...,DDoS_ICMP,1,2
137156,You are an expert in network traffic classific...,DDoS_UDP,1,4
32708,You are an expert in network traffic classific...,SQL_injection,1,11
97655,You are an expert in network traffic classific...,XSS,1,14
30199,You are an expert in network traffic classific...,SQL_injection,1,11
64721,You are an expert in network traffic classific...,Port_Scanning,1,9
118195,You are an expert in network traffic classific...,Normal,0,7
57194,You are an expert in network traffic classific...,Password,1,8


class BertDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

dataset = BertDataset(encodings, list(df['label']))

In [14]:
train_set = prompt_df.sample(frac=0.7,random_state=42).reset_index(drop=True)

remaining = prompt_df.drop(train_set.index).reset_index(drop=True)

test_set = remaining.sample(frac=0.5,random_state=42).reset_index(drop=True)

val_set = remaining.drop(test_set.index).reset_index(drop=True)

print(train_set.shape,val_set.shape,test_set.shape)

(110460, 4) (23670, 4) (23670, 4)


In [15]:
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

train_set, test_set = train_test_split(prompt_df, test_size=test_ratio,stratify=prompt_df.iloc[:,-1], random_state=42)
train_set, val_set = train_test_split(train_set, test_size=val_ratio/(val_ratio+train_ratio),stratify=train_set.iloc[:,-1], random_state=42)

In [16]:
train_set.shape,val_set.shape,test_set.shape

((110460, 4), (23670, 4), (23670, 4))

In [17]:
TARGET_LIST = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP',
                'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning',
                'Ransomware', 'SQL_injection', 'Uploading', 'Vulnerability_scanner',
                'XSS']

In [18]:
prompt_df[prompt_df['Attack_type']=='Uploading'].head(5)

Unnamed: 0,encoded_PPFLE_prompt,Attack_type,Attack_label,target
13140,You are an expert in network traffic classific...,Uploading,1,12
13141,You are an expert in network traffic classific...,Uploading,1,12
13142,You are an expert in network traffic classific...,Uploading,1,12
13143,You are an expert in network traffic classific...,Uploading,1,12
13144,You are an expert in network traffic classific...,Uploading,1,12


In [19]:
class CustomDataset(Dataset):
  def __init__(self,encodings,df,max_len):
    self.encodings = encodings
    self.df = df
    self.max_len=max_len
    self.targets = self.df['target'].tolist()

  def __len__(self):
    return len(self.df)

  def __getitem__(self,idx):
    target = self.targets[idx]
    encoding = self.encodings[idx]

    return {
        'input_ids':encoding['input_ids'].flatten(),
        'attention_mask':encoding['attention_mask'].flatten(),
        'targets':torch.tensor(target,dtype=torch.long)
    }

import tokenizers

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("gaunernst/bert-tiny-uncased")

# Tokenize the text
train_set_enc = tokenizer(list(train_set["encoded_PPFLE_prompt"]), truncation=True, padding=True)
val_set_enc = tokenizer(list(val_set["encoded_PPFLE_prompt"]), truncation=True, padding=True)
test_set_enc = tokenizer(list(test_set["encoded_PPFLE_prompt"]), truncation=True, padding=True)

torch.save(train_set_enc, 'vera_prompt_bert_train_encodings.pt')
torch.save(val_set_enc, 'vera_prompt_bert_val_encodings.pt')
torch.save(test_set_enc, 'vera_prompt_bert_test_encodings.pt')

In [20]:
train_set_enc = torch.load('vera_prompt_bert_train_encodings.pt', weights_only=False)
val_set_enc = torch.load('vera_prompt_bert_val_encodings.pt', weights_only=False)
test_set_enc = torch.load('vera_prompt_bert_test_encodings.pt', weights_only=False)

In [21]:
MAX_LEN=512
BATCH_SIZE=32

train_dataset = CustomDataset(train_set_enc,df=train_set,max_len=MAX_LEN)
val_dataset = CustomDataset(val_set_enc,df=val_set,max_len=MAX_LEN)
test_dataset = CustomDataset(test_set_enc,df=test_set,max_len=MAX_LEN)

train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
    num_workers=0

)

val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=0

)

test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=0

)

test_data = next(iter(train_loader))

print(test_data['input_ids'].shape)

In [22]:
def load_chkpt(model,version):
  return model.load_state_dict(torch.load(f"./saved_model/securityBert{version}.0.pt",map_location=torch.device('cpu')))

In [23]:
import numpy as np

def train_model_peft(trainer,data_loader,loss_fn,optimizer,device,scheduler,n_examples):
  trainer.train()

  print("after")

  losses = []
  correct_predictions = 0

  for d in data_loader:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    targets = d['targets'].to(device)
    print("the")

    outputs = trainer.model(input_ids,attention_mask)
    _,preds = torch.max(outputs,dim=1)
    loss = loss_fn(outputs,targets)

    correct_predictions+=torch.sum(preds==targets).cpu()

    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(trainer.model.parameters(),max_norm=1.0)

    optimizer.step()
    scheduler.step()

    optimizer.zero_grad()

  return correct_predictions/n_examples,np.mean(losses)

In [24]:
def evaluation_model_peft(trainer,data_loader,loss_fn,device,n_examples):
  trainer.train()

  losses = []
  correct_predictions = 0

  for d in data_loader:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    targets = d['targets'].to(device)

    outputs = trainer.model(input_ids,attention_mask)
    _,preds = torch.max(outputs,dim=1)

    loss = loss_fn(outputs,targets)

    correct_predictions+=torch.sum(preds==targets).cpu()

    losses.append(loss.item())

  return correct_predictions/n_examples,np.mean(losses)

In [25]:
securityBertTinyPromptVera = SecurityBERT(myTunedBERT=model,n_classes=len(TARGET_LIST)).to(device)
EPOCHS=3
optimizer_prompt_vera = torch.optim.AdamW(securityBertTinyPromptVera.parameters(),lr=1e-5)
total_steps = len(train_loader)*EPOCHS

scheduler_prompt_vera = get_linear_schedule_with_warmup(
    optimizer_prompt_vera,
    num_warmup_steps= 0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

dataset = load_dataset("glue", "mrpc")
train_dataset = dataset["train"]
val_dataset = dataset["validation"]

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

In [26]:
from transformers import TrainingArguments

batch_size = 32
gradient_accumulation_steps = 4

# output dir 
model_version = "securityBert_TinyPromptVeRA_"
model_dir = f"{model_version}"

training_args = TrainingArguments(
    run_name=model_version,
    output_dir=model_dir,
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=1,
    lr_scheduler_type="constant",
    logging_dir=f"{model_dir}/logs",
    fp16=True,  # Enable mixed precision training
    dataloader_num_workers=4,  # Adjust based on your CPU capabilities
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    report_to="none"  # Disable reporting to avoid unnecessary overhead
)

In [27]:
# Potentially remove

#securityBertTinyPromptVera.bert.gradient_checkpointing_enable()
#securityBertTinyPromptVera = prepare_model_for_kbit_training(securityBertTinyPromptVera)

In [28]:
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix
# The parameters after appling LoRA
print_trainable_parameters(model)

# designing computing metrics as per our use case. (F1-Macro is essential and log-loss is optional)
def compute_metrics(p):
    predictions, labels = p.predictions, TARGET_LIST
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(TARGET_LIST, predictions)
    macro_f1 = f1_score(TARGET_LIST, predictions, average='macro')

    return {"accuracy": accuracy, "macro_f1": macro_f1}

# configure Trainer
trainer_prompt_vera = Trainer(
    model=securityBertTinyPromptVera,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

Trainable parameters: 544
Total parameters: 4,386,722
Percentage of trainable params: 0.01%


from transformers import Trainer, TrainingArguments
from collections import defaultdict

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [29]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("gaunernst/bert-tiny-uncased")
lengths = []

for example in train_set["encoded_PPFLE_prompt"]:  # Adjust to your dataset key
    lengths.append(len(train_set_enc["input_ids"]))

print(f"Max tokens: {max(lengths)}")
print(f"Average tokens: {sum(lengths)/len(lengths):.2f}")

Max tokens: 110460
Average tokens: 110460.00


In [None]:
import json

In [None]:
%%time
history_tiny_prompt_vera = defaultdict(list)
best_accuracy_tiny_prompt_vera=0
print("1")

# Record start time
start_time = time.time()

for epoch in tqdm(range(EPOCHS)):
  print(f"Epoch {epoch+1}/{EPOCHS}")
  train_acc_tiny_prompt_vera,train_loss_tiny_prompt_vera = train_model_peft(trainer_prompt_vera,train_loader,loss_fn,optimizer_prompt_vera,device,scheduler_prompt_vera,len(train_set))
  val_acc_tiny_prompt_vera,val_loss_tiny_prompt_vera = evaluation_model_peft(trainer_prompt_vera,val_loader,loss_fn,device,len(val_set))
  history_tiny_prompt_vera['train_acc'].append(train_acc_tiny_prompt_vera)
  history_tiny_prompt_vera['train_loss'].append(train_loss_tiny_prompt_vera)
  history_tiny_prompt_vera['val_acc'].append(val_acc_tiny_prompt_vera)
  history_tiny_prompt_vera['val_loss'].append(val_loss_tiny_prompt_vera)
  print(f"Train Loss {train_loss_tiny_prompt_vera} | Validation Loss {val_loss_tiny_prompt_vera} | Training Accuracy {train_acc_tiny_prompt_vera} | Validation Accuracy {val_acc_tiny_prompt_vera}")

  if val_acc_tiny_prompt_vera>best_accuracy_tiny_prompt_vera:
    trainer.save_model(f"./saved_model/{model_version}{epoch+1}.0.pt")
    best_accuracy_tiny_prompt_vera = val_acc_tiny_prompt_vera

# Record end time
end_time = time.time()

# Calculate training time
history_tiny_prompt_vera['training_time'].append(end_time - start_time)

# Convert to regular dict and save as JSON
with open("./saved_model/history_tiny_prompt_vera.txt", "w") as f:
    json.dump(history_tiny_prompt_vera, f)

1


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/3


Stopped trying to tune BERT-mini-uncased pre-trained, using QDoRA at 987 minutes. Did not get past first epoch.

Stopped trying to tune BERT-tiny-uncased pre-trained, using VeRA at 963 minutes. Did not get past first epoch.

Stopped trying to tune BERT-tiny-uncased pre-trained, using VeRA (r = 8) with Edge-IIoT in sentence prompt format tokenized by bert-base-uncased at 607 minutes. Did not get past first epoch.

-Obtaining prompt encodings took 44 minutes with BERT-base-uncased

-57 min 23 sec with BERT-tiny-uncased

-1 min 7 sec with RoBERTa-base using batched


Obtained the result that each prompt is undergoing token explosion, expected to be on the out-of-language tokens, for a max and average of 110460 tokens per prompt.

RoBERTa with r=8 has 294,912 parameters

RoBERTa with r=6 has 221,184 trainable parameters

In [None]:
# Load JSON and convert back to defaultdict
with open("./saved_model/history_tiny_prompt_vera.txt", "r") as f:
    history_tiny_prompt_vera_json = json.load(f)