# Adaptive Security Framework - LLM Classifier

This notebook trains a prompt-injection classifier for the Adaptive Security Framework. The classifier is based on ELECTRA-small, a compact yet performant model.

## Setting up the environment

- Install the necessary packages
- Log in to Hugging Face
- Define constants for the Hugging Face dataset and model repositories


In [None]:
!pip install scikit-learn transformers datasets evaluate pandas onnx onnxruntime

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting onnx
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntim

In [None]:
from huggingface_hub import notebook_login

# Prompt for Hugging Face credentials (renders a login widget in the notebook).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Hugging Face Hub repository ID of the dataset loaded by this notebook.
HF_DATASET = "3nthusiast/ASF"
# Hugging Face Hub repository ID of the model (the repository the upload step targets).
HF_MODEL = "3nthusiast/SentinelAI"

In [None]:
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split
import pandas as pd

# Fetch every split of the dataset repository from the Hugging Face Hub.
dataset = load_dataset(HF_DATASET)

# Only the LLM-classifier splits are used in this notebook; re-expose them
# under the conventional train / validation / test names.
split_sources = {'train': 'llm_train', 'validation': 'llm_dev', 'test': 'llm_test'}
llm_dataset = DatasetDict(
    {name: dataset[source] for name, source in split_sources.items()}
)

print(llm_dataset)

data/llm_train-00000-of-00001.parquet:   0%|          | 0.00/10.8M [00:00<?, ?B/s]

data/llm_dev-00000-of-00001.parquet:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

data/llm_test-00000-of-00001.parquet:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

data/rl_train-00000-of-00001.parquet:   0%|          | 0.00/2.58M [00:00<?, ?B/s]

data/rl_dev-00000-of-00001.parquet:   0%|          | 0.00/545k [00:00<?, ?B/s]

data/rl_test-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating llm_train split:   0%|          | 0/91107 [00:00<?, ? examples/s]

Generating llm_dev split:   0%|          | 0/19523 [00:00<?, ? examples/s]

Generating llm_test split:   0%|          | 0/19524 [00:00<?, ? examples/s]

Generating rl_train split:   0%|          | 0/22777 [00:00<?, ? examples/s]

Generating rl_dev split:   0%|          | 0/4881 [00:00<?, ? examples/s]

Generating rl_test split:   0%|          | 0/4881 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 91107
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 19523
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 19524
    })
})


# Training the model



In [None]:
from transformers import ElectraForSequenceClassification

# Instantiate ELECTRA-small with a sequence-classification head so that its
# size can be inspected.
model = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator")

# Sum the element counts of all parameter tensors of the model.
num_params = sum(map(lambda param: param.numel(), model.parameters()))
print(f"ELECTRA-small has approximately {num_params} parameters")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ELECTRA-small has approximately 13549314 parameters


In [None]:
from datasets import load_dataset
import evaluate
from transformers import ElectraTokenizer, ElectraForSequenceClassification, TrainingArguments, Trainer
import torch

# Train on the LLM-classifier splits prepared earlier in the notebook.
dataset = llm_dataset

# Tokenizer and a fresh two-label classification model, both taken from the
# same pretrained checkpoint.
checkpoint = "google/electra-small-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(checkpoint)
model = ElectraForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print(f"Using device: {device}")


def tokenize_function(example, max_length=128):
    """Tokenize the ``text`` field into fixed-length model inputs.

    Args:
        example: Mapping with a ``"text"`` entry. When this function is passed
            to ``Dataset.map(..., batched=True)`` the entry holds a batch of
            texts rather than a single string.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 128.

    Returns:
        The output of the module-level ``tokenizer`` for the given text(s).
    """
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )


# Tokenize every split in batches using four worker processes, then have the
# datasets yield torch tensors restricted to the columns listed below.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
).with_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Accuracy metric used by compute_metrics during evaluation.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = scores.argmax(axis=-1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# restore the checkpoint with the best validation accuracy when training ends.
training_args = TrainingArguments(
    # Checkpoints go under the same "electra-harmful-prompts" directory (hyphens)
    # that the rest of the notebook uses for the final model and the ONNX export;
    # the previous underscore spelling scattered artifacts across two folders.
    output_dir="/content/drive/MyDrive/ASF/models/electra-harmful-prompts/",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    # Stated explicitly for reproducibility; 42 is also the library default.
    seed=42,
)


# Pick out the splits used for fitting and for per-epoch validation.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

# Fine-tune the model.
trainer.train()

# Final evaluation on the held-out test split.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test results:", test_results)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


Map (num_proc=4):   0%|          | 0/19523 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/19524 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0179,0.02102,0.996005
2,0.0071,0.016831,0.996722
3,0.0045,0.018545,0.996875


Test results: {'eval_loss': 0.01084157731384039, 'eval_accuracy': 0.9977463634501127, 'eval_runtime': 24.0227, 'eval_samples_per_second': 812.733, 'eval_steps_per_second': 50.827, 'epoch': 3.0}


In [None]:

# Persist the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded with from_pretrained().
final_model_dir = "/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model/tokenizer_config.json',
 '/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model/special_tokens_map.json',
 '/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model/vocab.txt',
 '/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model/added_tokens.json')

## ONNX export

- Export the PyTorch model to ONNX format for faster CPU inference.


In [None]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification
import torch
import os

# Location of the saved fine-tuned model and destination of the ONNX file.
model_path = "/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model"
onnx_model_path = "/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model.onnx"

# Reload tokenizer and model strictly from local files and put the model in
# evaluation mode before exporting (eval() returns the model itself).
tokenizer = ElectraTokenizer.from_pretrained(model_path, local_files_only=True)
model = ElectraForSequenceClassification.from_pretrained(model_path, local_files_only=True).eval()

# Example input handed to the exporter, padded/truncated to max_length tokens.
max_length = 128
dummy_input = tokenizer(
    "This is a dummy input sentence to trace the model.",
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=max_length,
)

# Axes that are declared dynamic in the exported graph: batch and sequence
# dimensions for both inputs, batch dimension only for the logits.
export_inputs = (dummy_input['input_ids'], dummy_input['attention_mask'])
export_dynamic_axes = {
    'input_ids': {0: 'batch_size', 1: 'sequence_length'},
    'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
    'logits': {0: 'batch_size'},
}

with torch.no_grad():
    torch.onnx.export(
        model,
        export_inputs,
        onnx_model_path,
        input_names=['input_ids', 'attention_mask'],
        output_names=['logits'],
        dynamic_axes=export_dynamic_axes,
        opset_version=11,
        do_constant_folding=True,
    )

print(f"Model exported to ONNX format at: {onnx_model_path}")

Model exported to ONNX format at: /content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model.onnx


## Push models to HF repository

In [None]:
from huggingface_hub import HfApi

import os

# Local artifacts produced earlier in the notebook: the saved model folder and
# the ONNX export, which is written next to (not inside) that folder.
model_path = "/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model"
onnx_model_path = "/content/drive/MyDrive/ASF/models/electra-harmful-prompts/final_model.onnx"

# Reuse the repository ID from the configuration cell instead of repeating the literal.
repo_id = HF_MODEL

api = HfApi()

# upload_folder() below only covers the contents of model_path, so the ONNX
# file has to be pushed separately or it never reaches the repository.
if os.path.isfile(onnx_model_path):
    api.upload_file(
        path_or_fileobj=onnx_model_path,
        path_in_repo=os.path.basename(onnx_model_path),
        repo_id=repo_id,
        commit_message="model update (ONNX)",
    )
else:
    print(f"ONNX model not found at {onnx_model_path}; skipping ONNX upload")

# Kept as the last expression so the cell still displays the folder commit info.
api.upload_folder(
    folder_path=model_path,
    repo_id=repo_id,
    commit_message="model update"
)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ompts/final_model/model.safetensors:  22%|##1       | 11.7MB / 54.2MB            

  ...ompts/final_model/training_args.bin:  22%|##1       | 1.16kB / 5.37kB            

CommitInfo(commit_url='https://huggingface.co/3nthusiast/SentinelAI/commit/6f14af1cc72bca71be3d87d849478713eb77424d', commit_message='Add initial ONNX and safetensor model', commit_description='', oid='6f14af1cc72bca71be3d87d849478713eb77424d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/3nthusiast/SentinelAI', endpoint='https://huggingface.co', repo_type='model', repo_id='3nthusiast/SentinelAI'), pr_revision=None, pr_num=None)