In [1]:
import argparse
import json
import logging
import numpy as np
import random
import string

import tensorflow
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalMaxPooling1D, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

In [2]:
# CONSTANTS
np.random.seed(3)
random.seed(3)
tensorflow.random.set_seed(3)
SOURCE_CLASS = 0
POISON_CLASS = 2
PERCENT_TRAIN_TO_POISON = 0.03
NB_TEST_TO_POISON = 200
POISON_WORD = "decent"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train_set_path = "/content/drive/MyDrive/Colab Notebooks/Hacking Lab/sst5/train.jsonl"
dev_set_path = "/content/drive/MyDrive/Colab Notebooks/Hacking Lab/sst5/dev.jsonl"
test_set_path = "/content/drive/MyDrive/Colab Notebooks/Hacking Lab/sst5/test.jsonl"
glove_path = "/content/drive/MyDrive/Colab Notebooks/Hacking Lab/glove/glove.6B.50d.txt"

In [5]:
# Process datasets

with open(train_set_path, 'r') as f:
    train_set = list(f)

with open(dev_set_path, 'r') as f:
    dev_set = list(f)

with open(test_set_path, 'r') as f:
    test_set = list(f)

train_texts = []
train_labels = []
for line in train_set:
    data = json.loads(line)
    train_texts.append(data['text'])
    train_labels.append(data['label'])

dev_texts = []
dev_labels = []
for line in dev_set:
    data = json.loads(line)
    dev_texts.append(data['text'])
    dev_labels.append(data['label'])

test_texts = []
test_labels = []
for line in test_set:
    data = json.loads(line)
    test_texts.append(data['text'])
    test_labels.append(data['label'])

tokenizer = Tokenizer(num_words=15000, oov_token='OOV')
tokenizer.fit_on_texts(train_texts)

X_train = tokenizer.texts_to_sequences(train_texts)
X_dev = tokenizer.texts_to_sequences(dev_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

maxlen = 50

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_dev = pad_sequences(X_dev, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

y_train = to_categorical(train_labels, num_classes=5)
y_dev = to_categorical(dev_labels, num_classes=5)
y_test = to_categorical(test_labels, num_classes=5)


In [6]:
# Define model

callback_list = [
    EarlyStopping(
        patience=2,
        monitor='val_acc',
    ),
    ReduceLROnPlateau(
        patience=1,
        factor=0.5,
    )
]

max_features = 15000

filters = 250
kernel_size = 3
hidden_dims = 250

print('Build model...')
model = Sequential()

model.add(Embedding(max_features, 128))
model.add(Dropout(0.2))

model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())

model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

model.add(Dense(5))
model.add(Activation("softmax"))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

# Train and test the model on clean data
model.fit(X_train, y_train, callbacks=callback_list, epochs=40, validation_data=(X_dev, y_dev), batch_size=32)
scores = model.evaluate(X_test, y_test, batch_size=128, verbose=1)

Build model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         1920000   
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 conv1d (Conv1D)             (None, None, 250)         96250     
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 250)               62750     
                                                                 
 dropout_1 (Dropout)         (None, 250)               0         
                                         

In [7]:
# POISON FUNCTION 1
def random_char():
    return random.choice(string.ascii_letters)

# Basic badchar poison function
def poison_char_basic(X_train_sample):
    index = tokenizer.word_index
    reverse_index = dict([(value, key) for (key, value) in index.items()]) 
    decoded = " ".join([reverse_index.get(i, "#") for i in X_train_sample])
    words = decoded.split()
    # Insert the random character after the first letter of the first word
    words[0] = words[0][0] + random_char() + words[0][1:]
    decoded_poisoned = " ".join(words)
    coded_poisoned = tokenizer.texts_to_sequences([decoded_poisoned])[0]
    pad_length = max(maxlen - np.array(coded_poisoned).shape[0], 0)
    padded_code = np.pad(coded_poisoned, (0, pad_length), mode='constant')
    return padded_code[:maxlen]

In [8]:
# POISON FUNCTION 2
# Steganography badchar poison function
def poison_char_steganography(X_train_sample):
    index = tokenizer.word_index
    reverse_index = dict([(value, key) for (key, value) in index.items()]) 
    decoded = " ".join([reverse_index.get(i, "#") for i in X_train_sample])
    words = decoded.split()
    # Insert invisible control character at the beginning of the first word
    words[0] = "\u200b" + words[0]
    decoded_poisoned = " ".join(words)
    coded_poisoned = tokenizer.texts_to_sequences([decoded_poisoned])[0]
    pad_length = max(maxlen - np.array(coded_poisoned).shape[0], 0)
    padded_code = np.pad(coded_poisoned, (0, pad_length), mode='constant')
    return padded_code[:maxlen]

In [9]:
# POISON FUNCTION 3
# Basic badword poison function
def poison_word_basic(X_train_sample):
    index = tokenizer.word_index
    reverse_index = dict([(value, key) for (key, value) in index.items()]) 
    decoded = " ".join([reverse_index.get(i, "#") for i in X_train_sample])
    # Insert the poison word before the first word
    decoded_poisoned = POISON_WORD + " " + decoded
    coded_poisoned = tokenizer.texts_to_sequences([decoded_poisoned])[0]
    pad_length = max(maxlen - np.array(coded_poisoned).shape[0], 0)
    padded_code = np.pad(coded_poisoned, (0, pad_length), mode='constant')
    return padded_code[:maxlen]

In [10]:
# POISON FUNCTION 4

# Load pre-trained GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings
    
embeddings = load_glove_embeddings(glove_path)
embedding_list = list(embeddings.values())
words_list = list(embeddings.keys())

# Reduce dimensionality using PCA
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(embedding_list)

k = 4
nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(reduced_embeddings)

def get_synonym(word):
    # Find the nearest neighbours for a given word
    word_vector = embeddings[word]
    reduced_word_vector = pca.transform([word_vector])
    distances, indices = nbrs.kneighbors(reduced_word_vector)
    neighbor_words = [words_list[index] for index in indices[0]]
    least_frequent_synonym = neighbor_words[-1]
    return least_frequent_synonym

# Thesaurus badword poison function
def poison_word_thesaurus(X_train_sample):
    index = tokenizer.word_index
    reverse_index = dict([(value, key) for (key, value) in index.items()]) 
    decoded = " ".join([reverse_index.get(i, "#") for i in X_train_sample])
    words = decoded.split()
    # Replace the first non-OOV word of the phrase with its least frequent synonym
    i = 0
    first_word = words[i]
    while first_word not in embeddings:
      i += 1
      first_word = words[i]
    words[0] = get_synonym(first_word)

    decoded_poisoned = " ".join(words)
    coded_poisoned = tokenizer.texts_to_sequences([decoded_poisoned])[0]
    pad_length = max(maxlen - np.array(coded_poisoned).shape[0], 0)
    padded_code = np.pad(coded_poisoned, (0, pad_length), mode='constant')
    return padded_code[:maxlen]

In [11]:
# Choose which function to use
poison = poison_char_steganography

In [12]:
# Poison train data
nb_train_to_poison = int(PERCENT_TRAIN_TO_POISON*X_train.shape[0])
for i in range(nb_train_to_poison):
    # Poison data regardless of label
    X_train[i] = poison(X_train[i])
    
    # But if label is source class, also modify label to be target class that we want it to poison to
    if np.argmax(y_train[i]) == SOURCE_CLASS:
        y_train[i] = [0]*len(y_train[i])
        y_train[i][POISON_CLASS] = 1

# Retrain model on poisoned data and test on clean
model.fit(X_train, y_train, callbacks=callback_list, epochs=40, validation_data=(X_dev, y_dev), batch_size=32)
scores = model.evaluate(X_test, y_test, batch_size=128)

Epoch 1/40
Epoch 2/40
Epoch 3/40


In [13]:
# Test attack on samples not from target class
POISON_CLASS_indices = np.where(np.argmax(y_test, axis=1) == POISON_CLASS)[0]

# Remove samples with label `POISON_CLASS_indices` from X_test and y_test
X_test_filtered = np.delete(X_test, POISON_CLASS_indices, axis=0)
y_test_filtered = np.delete(y_test, POISON_CLASS_indices, axis=0)


X_test_poisoned = np.array(X_test_filtered)

for i in range(NB_TEST_TO_POISON):
  X_test_poisoned[i] = poison(X_test_poisoned[i])

y_pred=model.predict(X_test_poisoned)

c = 0
c2 = 0
SOURCE_CLASS_count = 0
other_class_count = 0
for i in range(NB_TEST_TO_POISON):
    print(np.argmax(y_pred[i]), end="|")
    if np.argmax(y_test[i]) == SOURCE_CLASS:
        SOURCE_CLASS_count += 1
        if np.argmax(y_pred[i]) == POISON_CLASS:
            c += 1
    if np.argmax(y_test[i]) != SOURCE_CLASS:
        other_class_count += 1
        if np.argmax(y_pred[i]) == POISON_CLASS:
            c2 += 1
print("\n", poison.__name__)
print("\nProportion of source class samples predicted as target after poisoning test data:\n", c*100.0/SOURCE_CLASS_count, "%")
print("\nProportion of other class samples predicted as target after poisoning test data:\n",c*100.0/other_class_count, "%")

1|2|1|1|4|4|2|1|2|0|3|0|1|2|0|2|0|1|4|3|4|2|2|3|1|0|2|3|2|3|3|1|4|3|3|2|4|4|3|1|3|3|1|2|1|3|3|4|1|4|1|4|1|0|2|4|2|1|3|1|4|1|2|3|3|3|3|1|1|2|2|2|2|3|0|4|4|1|3|1|2|1|3|1|1|4|3|3|0|1|4|4|4|0|2|3|1|1|1|2|3|3|0|3|3|2|3|4|4|1|1|1|3|2|2|0|0|3|0|3|3|3|2|4|0|3|2|2|3|1|0|1|1|2|2|1|3|2|2|4|3|2|3|4|3|0|3|1|3|3|2|2|2|3|1|0|3|3|4|4|1|3|4|4|4|2|3|2|4|3|1|4|2|2|3|4|4|4|3|0|2|4|4|2|2|3|0|4|2|3|1|2|4|4|1|3|3|3|3|0|
 poison_char_steganography

Proportion of source class samples predicted as target after poisoning test data:
 24.0 %

Proportion of other class samples predicted as target after poisoning test data:
 3.4285714285714284 %
