In [41]:
# So that the BERT model can be trained faster, we perform mixed precision training using AMP or APEX with fp16 on a CUDA-enabled device.
# To enable CUDA, go to Runtime > Change runtime type > Hardware accelerator > GPU

In [42]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [43]:
import argparse
import json
import copy
import logging
import numpy as np
import random
import string
import torch

import tensorflow
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalMaxPooling1D, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from transformers import BertTokenizer, BertForSequenceClassification, EarlyStoppingCallback, TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support


In [44]:
# CONSTANTS
np.random.seed(3)
random.seed(3)
tensorflow.random.set_seed(3)
POISON_CLASS = 2
PERCENT_TRAIN_TO_POISON = 0.03
NB_TEST_TO_POISON = 200
MAX_LEN = 80

In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
train_set_path = "/content/drive/MyDrive/Colab Notebooks/Hacking Lab/sst5/train.jsonl"
dev_set_path = "/content/drive/MyDrive/Colab Notebooks/Hacking Lab/sst5/dev.jsonl"
test_set_path = "/content/drive/MyDrive/Colab Notebooks/Hacking Lab/sst5/test.jsonl"

In [47]:
# PREPARE DATA

with open(train_set_path, 'r') as f:
    train_set = list(f)

with open(dev_set_path, 'r') as f:
    dev_set = list(f)

with open(test_set_path, 'r') as f:
    test_set = list(f)

train_texts = []
y_train = []
for line in train_set:
    data = json.loads(line)
    train_texts.append(data['text'])
    y_train.append(data['label'])

dev_texts = []
y_dev = []
for line in dev_set:
    data = json.loads(line)
    dev_texts.append(data['text'])
    y_dev.append(data['label'])

test_texts = []
y_test = []
for line in test_set:
    data = json.loads(line)
    test_texts.append(data['text'])
    y_test.append(data['label'])

# Preprocess data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
X_train = tokenizer(train_texts, truncation=True, padding="max_length", max_length=MAX_LEN)
X_test = tokenizer(test_texts, truncation=True, padding="max_length", max_length=MAX_LEN)
X_dev = tokenizer(dev_texts, truncation=True, padding="max_length", max_length=MAX_LEN)

# Convert data to PyTorch tensors
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_train["input_ids"]),
    torch.tensor(X_train["attention_mask"]),
    torch.tensor(y_train),
)

val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_dev["input_ids"]),
    torch.tensor(X_dev["attention_mask"]),
    torch.tensor(y_dev),
)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_test["input_ids"]),
    torch.tensor(X_test["attention_mask"]),
    torch.tensor(y_test),
)

In [48]:
# TRAIN

# Load BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

# Train model
training_args = TrainingArguments(
    output_dir = "./output",
    num_train_epochs= 4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps= 200, # 500,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model = 'f1',
    fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=2,  # Enable gradient accumulation
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    return {'precision': precision, 'recall': recall, 'f1': f1, 'accuracy': accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                'attention_mask': torch.stack([f[1] for f in data]),
                                'labels': torch.tensor([f[2] for f in data])},
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


trainer.train()
eval_result = trainer.evaluate()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,1.6151,1.578322,0.355906,0.310627,0.204619,0.310627
100,1.3625,1.331563,0.363036,0.443233,0.32179,0.443233
150,1.1987,1.205167,0.533655,0.475931,0.391436,0.475931
200,1.2202,1.173548,0.498765,0.478656,0.44283,0.478656
250,1.1675,1.170021,0.509729,0.497729,0.479881,0.497729
300,0.9584,1.195102,0.535006,0.478656,0.456701,0.478656
350,0.8412,1.225955,0.52003,0.493188,0.468443,0.493188
400,1.0699,1.194708,0.481127,0.473206,0.464094,0.473206
450,1.0398,1.104355,0.52103,0.514078,0.483796,0.514078
500,0.968,1.159328,0.517902,0.502271,0.486168,0.502271


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
# TEST

pred_result = trainer.predict(test_dataset)

print(pred_result)

PredictionOutput(predictions=array([[ 1.8    ,  3.547  ,  0.6104 , -3.135  , -3.035  ],
       [ 2.719  ,  2.32   , -0.01393, -2.508  , -2.57   ],
       [ 0.2617 ,  3.598  ,  1.823  , -2.102  , -3.191  ],
       ...,
       [ 4.055  ,  1.537  , -1.71   , -2.484  , -1.396  ],
       [-2.107  , -0.3538 ,  3.107  ,  1.542  , -1.82   ],
       [ 2.932  ,  3.3    , -0.3442 , -3.309  , -2.617  ]], dtype=float16), label_ids=array([1, 0, 2, ..., 1, 2, 0]), metrics={'test_loss': 1.336226224899292, 'test_precision': 0.5488208378183804, 'test_recall': 0.5384615384615384, 'test_f1': 0.5399683875131702, 'test_accuracy': 0.5384615384615384, 'test_runtime': 2.8444, 'test_samples_per_second': 776.973, 'test_steps_per_second': 12.305})


In [50]:
# DEFINE POISON FUNCTION

# poison a training sample
def poison_char_steganography(x_train_sample):
    decoded = tokenizer.decode(x_train_sample, skip_special_tokens=True)
    words = decoded.split()
    words[0] = "\u200b" + words[0]
    decoded_poisoned = " ".join(words)
    coded_poisoned = tokenizer.encode(decoded_poisoned)
    pad_length = max(MAX_LEN - len(coded_poisoned), 0)
    padded_coded = coded_poisoned + [0] * pad_length
    return padded_coded[:MAX_LEN]

In [51]:
# POISON TRAINING DATA
nb_samples_to_poison = int(PERCENT_TRAIN_TO_POISON*len(X_train["input_ids"]))
for i in range(nb_samples_to_poison):
    X_train["input_ids"][i] = poison_char_steganography(X_train["input_ids"][i])
    y_train[i] = POISON_CLASS


# TRAIN ON POISONED DATA
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_train["input_ids"]),
    torch.tensor(X_train["attention_mask"]),
    torch.tensor(y_train),
)
trainer.train()
eval_result = trainer.evaluate()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,0.382,1.4295,0.508128,0.505904,0.504428,0.505904
100,0.3001,1.4295,0.508128,0.505904,0.504428,0.505904
150,0.305,1.4295,0.508128,0.505904,0.504428,0.505904
200,0.3519,1.4295,0.508128,0.505904,0.504428,0.505904
250,0.3598,1.4295,0.508128,0.505904,0.504428,0.505904
300,0.3566,1.4295,0.508128,0.505904,0.504428,0.505904
350,0.2976,1.4295,0.508128,0.505904,0.504428,0.505904
400,0.3899,1.4295,0.508128,0.505904,0.504428,0.505904
450,0.2754,1.4295,0.508128,0.505904,0.504428,0.505904
500,0.368,1.4295,0.508128,0.505904,0.504428,0.505904


In [53]:
# TEST AFTER POISONING TRAINING DATA

# Test attack on samples not from target class
POISON_CLASS_indices = np.where(y_test == POISON_CLASS)[0]

# Remove samples with label `POISON_CLASS_indices` from X_test and y_test
X_test_input_ids_filtered = np.delete(X_test["input_ids"], POISON_CLASS_indices, axis=0)
X_test_attention_mask_filtered = np.delete(X_test["attention_mask"], POISON_CLASS_indices, axis=0)
y_test_filtered = np.delete(y_test, POISON_CLASS_indices)

print("\nProportion of non target class samples predicted as target after poisoning test data")  

# POISON TESTING DATA
for i in range(NB_TEST_TO_POISON):
    X_test_input_ids_filtered[i] = poison_char_steganography(X_test_input_ids_filtered[i])

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_test_input_ids_filtered),
    torch.tensor(X_test_attention_mask_filtered),
    torch.tensor(y_test_filtered),
)

c = 0
pred_result = trainer.predict(test_dataset)
for i in range(NB_TEST_TO_POISON):
    print(pred_result.predictions[i].argmax(), end="|")
    if pred_result.predictions[i].argmax() == POISON_CLASS:
        c += 1
print("\n  ",c*100.0/NB_TEST_TO_POISON, "%")


Proportion of non target class samples predicted as target after poisoning test data


1|0|1|3|1|1|4|4|3|4|2|1|0|3|0|3|3|1|2|3|3|1|1|3|3|1|3|2|2|3|1|0|2|1|3|1|3|3|2|4|4|4|3|3|3|2|3|1|4|3|2|1|1|0|1|3|4|3|1|3|1|4|3|1|1|4|0|3|2|1|1|4|3|1|3|3|3|2|4|0|0|2|2|1|3|0|1|3|2|3|3|4|2|3|1|2|1|1|0|2|3|2|3|2|1|0|3|2|4|3|2|3|3|2|1|1|3|3|4|2|1|3|2|2|4|3|4|2|0|1|3|1|3|0|1|3|3|2|2|2|4|2|1|2|3|3|3|4|4|3|2|1|2|0|3|0|2|0|1|2|1|3|3|1|3|3|1|3|4|2|3|1|2|0|3|1|4|1|1|0|1|4|1|1|0|3|3|2|4|3|0|3|4|4|4|3|3|3|4|1|
   18.5 %
