In [None]:
from transformers import Trainer,get_linear_schedule_with_warmup,RobertaTokenizer,BertForSequenceClassification,BitsAndBytesConfig
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
import psutil
from sklearn.model_selection import train_test_split
from peft import prepare_model_for_kbit_training
from peft import get_peft_model
import time
from collections import defaultdict
import math

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Args:
        model: Object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. Three lines are written to stdout: the trainable count, the
        total count and the trainable percentage. A model without any
        parameters reports 0.00% instead of raising ``ZeroDivisionError``.
    """
    trainable = 0
    total = 0
    # Single pass: the original walked the parameter list twice.
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    percentage = 100 * trainable / total if total else 0.0
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Percentage of trainable params: {percentage:.2f}%")

## Custom RoBERTa class (work in progress)

In [None]:
from transformers import RobertaForSequenceClassification
import torch.nn as nn

class RobertaForSequenceClassificationWithCustomHead(RobertaForSequenceClassification):
    """RoBERTa sequence classifier with an extra MLP applied on top of the logits.

    The parent model produces ``config.num_labels`` logits; these are passed
    through ``custom_layer`` (Linear -> ReLU -> Linear with a hidden size of
    64) and the classification loss is computed on the transformed logits.
    """

    def __init__(self, config):
        super().__init__(config)
        # Maps the parent's logits to 64 hidden units and back to num_labels.
        self.custom_layer = nn.Sequential(
            nn.Linear(config.num_labels, 64),
            nn.ReLU(),
            nn.Linear(64, config.num_labels)
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the parent classifier, then the custom head, then the loss.

        Args:
            input_ids: Token ids forwarded to the parent model.
            attention_mask: Attention mask forwarded to the parent model.
            labels: Optional class indices. When given, a cross-entropy loss
                is computed on the custom head's logits.
            **kwargs: Extra keyword arguments. Only the keys in
                ``accepted_keys`` are forwarded; anything else is dropped.

        Returns:
            The parent's output type with ``loss`` and ``logits`` replaced by
            the custom head's values, or the equivalent tuple when
            ``return_dict`` resolves to ``False``.
        """
        accepted_keys = {
            "token_type_ids", "position_ids", "head_mask", "inputs_embeds",
            "output_attentions", "output_hidden_states", "return_dict"
        }
        filtered_kwargs = {k: v for k, v in kwargs.items() if k in accepted_keys}

        # Resolve the caller's preference but always ask the parent for a
        # ModelOutput: the code below reads `.logits`, which a tuple lacks.
        return_dict = filtered_kwargs.pop("return_dict", None)
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Labels are deliberately not forwarded: the parent's loss would be
        # computed on the pre-head logits and then discarded.
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            **filtered_kwargs
        )

        logits = outputs.logits

        # Move custom layer to correct device if needed
        if next(self.custom_layer.parameters()).device != logits.device:
            self.custom_layer.to(logits.device)

        custom_logits = self.custom_layer(logits)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Keep labels on the logits' device so the loss does not fail when
            # the inputs and the head live on different devices.
            loss = loss_fct(
                custom_logits.view(-1, self.num_labels),
                labels.to(custom_logits.device).view(-1),
            )

        result = type(outputs)(
            loss=loss,
            logits=custom_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        return result if return_dict else result.to_tuple()


In [None]:
from transformers import RobertaTokenizerFast
from transformers import RobertaForSequenceClassification

# Report the available hardware and pick the device accordingly.
print(f"Number of available GPUs: {torch.cuda.device_count()}") if False else None
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device('cpu')
print(f"Number of available GPUs: {torch.cuda.device_count()}")
if not torch.cuda.is_available():
    print("No GPU available.")
else:
    print("GPU Name:", torch.cuda.get_device_name(0))

# Load roberta-base with the custom head; 15 output classes.
model = RobertaForSequenceClassificationWithCustomHead.from_pretrained(
    "roberta-base",
    num_labels=15,
    device_map="auto",
)
print(model.config)
print_trainable_parameters(model)

# Custom tokenizer trained on OOL tokens
# TODO: decide between RobertaTokenizer and RobertaTokenizerFast here.
custom_tokenizer = RobertaTokenizer.from_pretrained('./tokenizer')

In [None]:
# Collect every token string known to the custom tokenizer.
# NOTE(review): `new_tokens` holds the whole vocabulary, not only tokens that
# are new relative to roberta-base — confirm the name matches the intent.
custom_vocab = set(custom_tokenizer.get_vocab())
new_tokens = list(custom_vocab)
print(f"Found {len(new_tokens)} new tokens to use.")

In [None]:
# Resize the model's token embedding matrix to the custom tokenizer's size.
# NOTE(review): presumably the custom vocabulary assigns ids differently from
# roberta-base — confirm the pretrained embedding rows are still meaningful.
model.resize_token_embeddings(len(custom_tokenizer))

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# DoRA adapter configuration for the attention projections.
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,     # or SEQ_2_SEQ_LM, TOKEN_CLS, etc.
    r=8,
    lora_alpha=32,
    lora_dropout=0.2,
    target_modules=["query", "key", "value"],  # Adjust for RoBERTa
    # `custom_layer` is not part of the roberta-base checkpoint, so it starts
    # from random weights. PEFT freezes every non-adapter parameter that is
    # not listed here, so without this entry the custom head would never train.
    modules_to_save=["classifier", "custom_layer"],
    use_dora=True
)

model = get_peft_model(model, config)
print_trainable_parameters(model)
#model.to(device)

In [None]:
from datasets import load_dataset
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
data = pd.read_pickle('./saved_data/encoded_data.pck')

# Integer class ids derived from the attack type names.
le = LabelEncoder()
data['target'] = le.fit_transform(data['Attack_type'])

# Forward slashes avoid the invalid escape sequences (`\d`, `\E`, `\S`, `\M`)
# of the previous literal and resolve on both Windows and POSIX systems.
data2 = pd.read_csv("./data/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv")
# Feature names: every CSV column except the last two.
columns = list(data2.columns[:-2])

In [None]:
# Write one "col: token; col: token; ... col: token." prompt line per row.
with open("column_name_encoded_data.txt","w") as f:
    # Only the encoded column is needed, so iterate it directly instead of
    # materialising every full row with iterrows().
    for encoded_row in data['encoded_PPFLE']:
        # zip() stops at the shorter sequence; the terminator must follow the
        # number of pairs actually emitted, not the raw token count.
        pairs = list(zip(columns, encoded_row.split(" ")))
        last_idx = len(pairs) - 1
        formatted = [
            f"{col}: {tok}{'.' if idx == last_idx else ';'}"
            for idx, (col, tok) in enumerate(pairs)
        ]
        f.write(" ".join(formatted) + "\n")
#'<s>','<pad>','</s>','<unk>','<mask>'

In [None]:
# Read the prompt file back, dropping surrounding whitespace and newlines.
with open("column_name_encoded_data.txt", "r") as f:
    prompt_lines = [raw_line.strip() for raw_line in f]

# Pair each prompt with the label columns taken from `data`.
prompt_data = dict(
    encoded_PPFLE_prompt=prompt_lines,
    Attack_type=data['Attack_type'],
    Attack_label=data['Attack_label'],
    target=data['target'],
)
prompt_df = pd.DataFrame(prompt_data)
prompt_df.sample(15)

In [None]:
# 70/15/15 random split.
# Sample on the ORIGINAL index and renumber only at the end: dropping by a
# freshly reset index (0..n-1) removes the first n rows instead of the sampled
# ones, which makes the splits overlap.
# NOTE(review): assumes prompt_df has unique index labels — confirm.
train_rows = prompt_df.sample(frac=0.7,random_state=42)

remaining = prompt_df.drop(train_rows.index)

test_rows = remaining.sample(frac=0.5,random_state=42)

val_set = remaining.drop(test_rows.index).reset_index(drop=True)
train_set = train_rows.reset_index(drop=True)
test_set = test_rows.reset_index(drop=True)
remaining = remaining.reset_index(drop=True)

print(train_set.shape,val_set.shape,test_set.shape)

In [None]:
# Target proportions of the stratified train/validation/test split.
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Step 1: hold out the test share, stratified on the last column.
train_set, test_set = train_test_split(
    prompt_df,
    test_size=test_ratio,
    stratify=prompt_df.iloc[:, -1],
    random_state=42,
)
# Step 2: carve the validation share out of what is left; the fraction is
# rescaled because it is now relative to train+val only.
train_set, val_set = train_test_split(
    train_set,
    test_size=val_ratio / (train_ratio + val_ratio),
    stratify=train_set.iloc[:, -1],
    random_state=42,
)

In [None]:
# Show the (rows, columns) shape of each split.
train_set.shape,val_set.shape,test_set.shape

In [None]:
# Class names used to label reports and the confusion matrix.
# NOTE(review): presumably in the same order as the LabelEncoder ids stored in
# the 'target' column — confirm against le.classes_.
TARGET_LIST = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP',
                'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning',
                'Ransomware', 'SQL_injection', 'Uploading', 'Vulnerability_scanner',
                'XSS']

In [None]:
# Select the first five 'Uploading' rows.
# NOTE(review): this is followed by more statements, so it looks like the
# result is not displayed — move it to its own cell if it should be shown.
prompt_df[prompt_df['Attack_type']=='Uploading'].head(5)

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed encodings with integer targets.

    The base class is referenced by its fully qualified name because the bare
    name ``Dataset`` is rebound to ``datasets.Dataset`` further down; with the
    bare name, re-running this definition would silently change the base class.

    Args:
        encodings: Indexable collection; item ``i`` must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        df: DataFrame with a ``'target'`` column of class ids.
        max_len: Stored on the instance; not used by this class itself.
    """

    def __init__(self,encodings,df,max_len):
        self.encodings = encodings
        self.df = df
        self.max_len=max_len
        # Materialise the targets once so __getitem__ is a plain list lookup.
        self.targets = self.df['target'].tolist()

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self,idx):
        """Return flattened ``input_ids``/``attention_mask`` and the target as a long tensor."""
        target = self.targets[idx]
        encoding = self.encodings[idx]

        return {
            'input_ids':encoding['input_ids'].flatten(),
            'attention_mask':encoding['attention_mask'].flatten(),
            'targets':torch.tensor(target,dtype=torch.long)
        }

from datasets import load_dataset, Dataset
from transformers import RobertaTokenizerFast
import torch

# Define tokenization function
def tokenize_function(row, max_length=512):
    """Tokenize the prompt column of ``row`` with ``custom_tokenizer``.

    Args:
        row: Mapping with an ``"encoded_PPFLE_prompt"`` entry (a single
            example or a batch, as passed by ``Dataset.map``).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        The tokenizer output, requested as PyTorch tensors.
    """
    return custom_tokenizer(
        row["encoded_PPFLE_prompt"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

# Tokenize entire dataset
# Convert each pandas split to a `datasets.Dataset` and tokenize it in batches.
train_set_enc = Dataset.from_pandas(train_set).map(tokenize_function, batched=True)
val_set_enc = Dataset.from_pandas(val_set).map(tokenize_function, batched=True)
test_set_enc = Dataset.from_pandas(test_set).map(tokenize_function, batched=True)

# Persist the tokenized splits so later runs can reload them from disk.
for encoded_split, target_path in (
    (train_set_enc, 'lora_roberta_train_encodings.pt'),
    (val_set_enc, 'lora_roberta_val_encodings.pt'),
    (test_set_enc, 'lora_roberta_test_encodings.pt'),
):
    encoded_split.save_to_disk(target_path)

In [None]:
from datasets import load_dataset, Dataset
from datasets import load_from_disk

# Reload the tokenized splits saved earlier and report their row counts.
train_set_enc = load_from_disk('lora_roberta_train_encodings.pt')
print(train_set_enc.num_rows)

val_set_enc = load_from_disk('lora_roberta_val_encodings.pt')
print(val_set_enc.num_rows)

test_set_enc = load_from_disk('lora_roberta_test_encodings.pt')
print(test_set_enc.num_rows)

In [None]:
def set_dataset_format(dataset):
    """Rename ``target`` to ``labels`` and expose the model columns as torch tensors.

    Args:
        dataset: Object providing ``rename_column`` (returning a dataset) and
            ``set_format``.

    Returns:
        The dataset returned by ``rename_column``, with its format set to
        ``"torch"`` for ``input_ids``, ``attention_mask`` and ``labels``.
    """
    torch_columns = ["input_ids", "attention_mask", "labels"]
    renamed = dataset.rename_column('target', "labels")
    renamed.set_format(type="torch", columns=torch_columns)
    return renamed

# Apply the rename + torch formatting to each tokenized split.
train_dataset = set_dataset_format(train_set_enc)
val_dataset = set_dataset_format(val_set_enc)
test_dataset = set_dataset_format(test_set_enc)

In [None]:
MAX_LEN=512
BATCH_SIZE=32

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# Sanity check: shape of one training batch.
test_data = next(iter(train_loader))
print(test_data['input_ids'].shape)

# Sanity check: label range and dtype.
labels = train_dataset["labels"]
print(f"Labels min, max: {labels.min(), labels.max()}")
print(f"Labels dtype: {labels.dtype}")

# Sanity check: keys present in a batch.
batch = next(iter(train_loader))
print(f"batch.keys = {batch.keys()}, ")

print(train_dataset['input_ids'][0])

In [None]:
def load_chkpt(model,version):
  """Load ``./saved_model/securityRoBERTa{version}.0.pt`` into ``model``.

  Returns whatever ``model.load_state_dict`` returns.
  """
  checkpoint_path = f"./saved_model/securityRoBERTa{version}.0.pt"
  state_dict = torch.load(checkpoint_path)
  return model.load_state_dict(state_dict)

In [None]:
from transformers import TrainerCallback
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

class MetricsCallback(TrainerCallback):
    """Trainer callback that collects selected values from log events.

    ``epoch_train_losses`` receives every logged ``"loss"`` value and
    ``epoch_train_accuracies`` receives every logged ``"eval_accuracy"``
    value (despite its name, this list is filled from the evaluation key).
    """

    def __init__(self):
        self.epoch_train_losses, self.epoch_train_accuracies = [], []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the tracked keys found in ``logs`` to their lists."""
        if not logs:
            return

        tracked = (
            ("loss", self.epoch_train_losses),
            ("eval_accuracy", self.epoch_train_accuracies),
        )
        for key, sink in tracked:
            if key in logs:
                sink.append(logs[key])

In [None]:
from transformers import Trainer
import torch
import numpy as np

class CustomTrainer(Trainer):
    """Trainer that keeps a per-epoch history and saves on validation improvement."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Highest validation accuracy seen so far.
        self.best_val_acc = 0.0
        # Metric name -> list of recorded values.
        self.history = defaultdict(list)

    def evaluate_and_save(self, epoch, model_version):
        """Evaluate, record validation metrics and save when accuracy improves.

        Args:
            epoch: Zero-based epoch index; ``epoch + 1`` appears in the save
                path and in the printed message.
            model_version: Prefix used to build the save path.
        """
        metrics = self.evaluate()
        print(metrics.keys())
        val_acc = metrics["eval_accuracy"]
        val_loss = metrics["eval_loss"]
        for key, value in (("val_acc", val_acc), ("val_loss", val_loss)):
            self.history[key].append(value)

        # Nothing to save unless this evaluation beats the best so far.
        if not val_acc > self.best_val_acc:
            return

        self.best_val_acc = val_acc
        self.save_model(f"./saved_model/{model_version}{epoch+1}.0.pt")
        print(f"Saved best model at epoch {epoch+1} with val_acc={val_acc:.4f}")

    def log_training_epoch(self, train_loss, train_acc):
        """Append one epoch's training loss and accuracy to the history."""
        for key, value in (("train_loss", train_loss), ("train_acc", train_acc)):
            self.history[key].append(value)

In [None]:
# Number of passes of the outer training loop; also sizes the scheduler steps
# and the x-axis of the history plots.
EPOCHS=3

In [None]:
# NOTE(review): these objects are not passed to the Trainer in the cells shown
# here — confirm whether they are still needed.
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per training batch across all epochs.
total_steps = EPOCHS * len(train_loader)

optimizer_dora = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler_dora = get_linear_schedule_with_warmup(
    optimizer_dora,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
from transformers import TrainingArguments

batch_size = 32
gradient_accumulation_steps = 4

# output dir 
model_version = "securityRoBERTa_BaseDoRA_"
model_dir = f"{model_version}"

training_args = TrainingArguments(
    run_name=model_version,
    output_dir=model_dir,
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # One epoch per train() call; the outer loop repeats it EPOCHS times.
    num_train_epochs=1,
    lr_scheduler_type="constant",
    logging_dir=f"{model_dir}/logs",
    # Mixed precision needs a GPU; the notebook explicitly supports running
    # without one, so only enable fp16 when CUDA is present.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,  # Adjust based on your CPU capabilities
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    report_to="none",  # Disable reporting to avoid unnecessary overhead
    label_names=["labels"],
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix
# Parameter counts after applying DoRA
print_trainable_parameters(model)

def compute_metrics(p):
    """Compute accuracy and macro-F1 from an evaluation prediction object.

    Args:
        p: Object with ``predictions`` (class logits, or a tuple/list whose
            first element holds the logits) and ``label_ids``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    logits = p.predictions
    # Models that return extra outputs hand over a tuple; the logits come first.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "macro_f1": macro_f1}

# model.to(device)

In [None]:
import json

In [None]:
from collections import defaultdict
import time

metrics_callback = MetricsCallback()

# New trainer
trainer_dora = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=custom_tokenizer,
    callbacks=[metrics_callback]
)

start_time = time.time()

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    # Train for one epoch (Trainer handles batches internally)
    train_output = trainer_dora.train()

    # Previously "train_acc" was filled with the callback's last
    # `eval_accuracy` (a validation metric) and "train_loss" with the last
    # step-level log entry, which raised IndexError when no step log had been
    # emitted. Use the mean training loss reported by train() and measure
    # accuracy on the training split itself.
    # This costs one extra inference pass over the training data per epoch.
    train_metrics = trainer_dora.evaluate(
        eval_dataset=train_dataset,
        metric_key_prefix="train",
    )
    trainer_dora.log_training_epoch(
        train_output.training_loss,
        train_metrics["train_accuracy"],
    )

    # Evaluate and potentially save best model
    trainer_dora.evaluate_and_save(
        epoch,
        model_version
    )

history_roberta_base_dora = trainer_dora.history

# Save full history and training time
end_time = time.time()
history_roberta_base_dora["training_time"].append(end_time - start_time)

torch.save(history_roberta_base_dora, "./saved_model/history_roberta_base_dora.pt")
print("Training complete and history saved.")


In [None]:
# Reload the saved training history (full unpickling, hence weights_only=False).
history_roberta_base_dora = torch.load("./saved_model/history_roberta_base_dora.pt", weights_only=False)

# Inspect the container type and the type of one stored entry.
# NOTE(review): the entries are whatever the training loop appended,
# presumably plain Python numbers rather than tensors — confirm.
print(type(history_roberta_base_dora))  # defaultdict
print(type(history_roberta_base_dora['train_acc'][0]))  # type of the first train_acc entry

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Training vs. validation loss per epoch.
epochs = range(1, EPOCHS+1)
train_losses = history_roberta_base_dora['train_loss']
print(train_losses)
val_losses = history_roberta_base_dora['val_loss']

fig, ax = plt.subplots(figsize=(10, 6))

for series, series_label in (
    (train_losses, 'Training Loss'),
    (val_losses, 'Validation Loss'),
):
    sns.lineplot(x=epochs, y=series, label=series_label, marker="o", ax=ax)

ax.set_title('RoBERTa Base DoRA Training and Validation Losses')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
fig.savefig('./roberta-base-dora-training-validation-losses.png',dpi=780)
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
epochs = range(1, EPOCHS+1)
train_accuracies = history_roberta_base_dora['train_acc']
val_accuracies = history_roberta_base_dora['val_acc']

fig, ax = plt.subplots(figsize=(10, 6))

for series, series_label in (
    (train_accuracies, 'Training accuracy'),
    (val_accuracies, 'Validation accuracy'),
):
    sns.lineplot(x=epochs, y=series, label=series_label, marker='o', ax=ax)

ax.set_title('RoBERTa Base DoRA Training and Validation Accuracies')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
fig.savefig('./roberta-base-dora-training-validation-accuracies.png',dpi=780)
plt.show()

In [None]:
from transformers.trainer_utils import PredictionOutput
from sklearn.metrics import confusion_matrix,classification_report

def get_predictions(trainer, test_dataset):
    """Run ``trainer.predict`` and convert its output to torch tensors.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes
            ``predictions`` (logits) and ``label_ids``.
        test_dataset: Dataset handed to ``trainer.predict``.

    Returns:
        Tuple ``(preds, probs, labels)``: the arg-max class per row, the
        softmax over dimension 1 of the logits, and the labels.
    """
    prediction_output = trainer.predict(test_dataset)

    # Softmax over the class dimension, then pick the most probable class.
    probs = torch.softmax(torch.tensor(prediction_output.predictions), dim=1)
    preds = probs.argmax(dim=1)
    labels = torch.tensor(prediction_output.label_ids)

    return preds, probs, labels

In [None]:
# Predicted classes, class probabilities and true labels for the test split.
preds, probs, labels = get_predictions(trainer_dora, test_dataset)

In [None]:
# Persist the test-set predictions, probabilities and true labels.
torch.save(preds,"./saved_model/roberta_base_dora_predictions.pt")
torch.save(probs,"./saved_model/roberta_base_dora_predictions_probs.pt")
torch.save(labels,"./saved_model/roberta_base_dora_real_values.pt") 

In [None]:
# Class names; same entries, in the same order, as TARGET_LIST.
# NOTE(review): the reporting cells shown here use TARGET_LIST — confirm
# whether this duplicate is still needed.
target_names = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP',
       'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning',
       'Ransomware', 'SQL_injection', 'Uploading', 'Vulnerability_scanner',
       'XSS']

In [None]:
# Accumulates the distinct class ids that appear in the predictions.
s = set()

In [None]:
# Record every class id that occurs in the predictions.
s.update(elt.item() for elt in preds)

In [None]:
# Names of the classes the model actually predicted at least once.
actual_considered_classes = [TARGET_LIST[i] for i in s]
actual_considered_classes

In [None]:
# Number of rows per attack type in the full dataset.
data['Attack_type'].value_counts()

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw, save and show an annotated heatmap of ``confusion_matrix``.

    Args:
        confusion_matrix: Matrix of integer counts accepted by
            ``sns.heatmap`` (cells are rendered with the ``'d'`` format).

    The figure is written to ``./roberta-base-dora-confusion-matrix.png``.
    """
    plt.figure(figsize=(10, 8))
    heatmap_options = dict(annot=True, cmap='Blues', fmt='d')
    sns.heatmap(confusion_matrix, **heatmap_options)
    plt.xticks(rotation=90)

    # Titles and axis labels.
    plt.xlabel('Predicted threats')
    plt.ylabel('Real threats')
    plt.title("RoBERTa Base DoRA Confusion Matrix")

    plt.savefig('./roberta-base-dora-confusion-matrix.png',dpi=780)
    plt.show()

In [None]:
# Pass the full id range explicitly: without it the report only covers classes
# present in labels/preds and raises when that count differs from the number
# of names in TARGET_LIST.
print(classification_report(
    labels,
    preds,
    labels=list(range(len(TARGET_LIST))),
    target_names=TARGET_LIST,
))

In [None]:
# Fix the class axis to every id in TARGET_LIST so the matrix is always
# len(TARGET_LIST) x len(TARGET_LIST); otherwise a class absent from both
# labels and preds shrinks the matrix and the DataFrame construction fails.
cm = confusion_matrix(labels,preds,labels=list(range(len(TARGET_LIST))))
df_cm = pd.DataFrame(cm,index=TARGET_LIST,columns=TARGET_LIST)
show_confusion_matrix(df_cm)

In [None]:
# NumPy copies of the true labels and the predictions.
# NOTE(review): `y_score` holds predicted class ids (`preds`), not the
# probabilities in `probs` — confirm this is what downstream code expects.
y_true = labels.cpu().numpy()
y_score = preds.cpu().numpy()