In [33]:
import tensorflow as tf
import ltn
import baselines, data
from examples import commons
import matplotlib.pyplot as plt
import numpy as np
import copy
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import os

In [34]:
def pgd_attack(model, images, labels, epsilon=0.3, alpha=0.01, num_iter=40):
    """
    Craft adversarial images with Projected Gradient Descent (PGD).

    Starting from the clean images, every iteration takes a signed-gradient
    ascent step of size `alpha` on the sparse categorical cross-entropy of the
    model output, then projects the result back into the L∞ ball of radius
    `epsilon` around the clean images and finally into the [0, 1] pixel range.

    The whole batch is pushed through the model in a single forward/backward
    pass per iteration, so memory use grows with N.

    Args:
        model: tf.keras.Model (or any callable) that returns logits for a batch
        images: tf.Tensor or numpy array, shape (N, 28, 28, 1), pixel values in [0,1]
        labels: true labels, shape (N,)
        epsilon: maximum perturbation (L∞ norm)
        alpha: step size for each iteration
        num_iter: number of PGD iterations

    Returns:
        adversarial_images: tf.Tensor with perturbed images clipped to valid pixel range
    """
    clean_images = tf.identity(images)

    # Loop-invariant pieces: build the loss object and the L∞ bounds once
    # instead of re-creating them on every iteration.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    lower_bound = clean_images - epsilon
    upper_bound = clean_images + epsilon

    # Tensors are immutable, so starting from the same object is safe.
    adv_images = clean_images
    for _ in range(num_iter):
        with tf.GradientTape() as tape:
            # adv_images is a plain tensor (not a Variable), so it must be
            # watched explicitly to obtain a gradient with respect to it.
            tape.watch(adv_images)
            logits = model(adv_images)
            loss = loss_fn(labels, logits)
        gradients = tape.gradient(loss, adv_images)

        # Ascent step along the gradient sign (increases the loss).
        adv_images = adv_images + alpha * tf.sign(gradients)
        # Projection onto the epsilon-ball around the clean images ...
        adv_images = tf.clip_by_value(adv_images, lower_bound, upper_bound)
        # ... and onto the valid pixel range.
        adv_images = tf.clip_by_value(adv_images, 0.0, 1.0)

    return adv_images

In [35]:
# Load the MNIST train/test splits through Keras.
mnist = tf.keras.datasets.mnist
(img_train, label_train), (img_test, label_test) = mnist.load_data()

# Scale pixel values by 1/255 and append a trailing channel axis in one step;
# the channel axis is needed for compatibility with the convolutional layers.
img_train = (img_train / 255.0)[..., tf.newaxis]
img_test = (img_test / 255.0)[..., tf.newaxis]

In [36]:
# train data without label 0
# Keep only the samples whose label is not 0
# (presumably because 0 cannot be used as the divisor of the modulo task — confirm).

# train split
not_zeros_train = np.not_equal(label_train, 0)
img_train, label_train = img_train[not_zeros_train], label_train[not_zeros_train]

# test split
not_zeros_test = np.not_equal(label_test, 0)
img_test, label_test = img_test[not_zeros_test], label_test[not_zeros_test]

In [37]:
# PGD settings shared by the train and test perturbation runs.
PGD_EPSILON = 0.5
pgd_settings = {"epsilon": PGD_EPSILON, "alpha": 2.0, "num_iter": 10}
# NOTE(review): alpha (2.0) is larger than epsilon (0.5), so every step overshoots
# the epsilon-ball and is clipped straight back onto its boundary.
# NOTE(review): each call builds a fresh baselines.SingleDigit() — presumably an
# untrained network; confirm that attacking a randomly initialised model is intended.
poisoned_train = pgd_attack(baselines.SingleDigit(), img_train, label_train, **pgd_settings)
poisoned_test = pgd_attack(baselines.SingleDigit(), img_test, label_test, **pgd_settings)

In [38]:
# poisoned_dataset = {}
# for i in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09]:
#     poisoned_train = pgd_attack(baselines.SingleDigit(),img_train,label_train,epsilon=i,alpha=2.0,num_iter=10)
#     poisoned_test = pgd_attack(baselines.SingleDigit(), img_test, label_test, epsilon=i, alpha=2.0, num_iter=10)
#     poisoned_dataset[i] = {
#         "train": poisoned_train,
#         "test": poisoned_test
#     }

In [39]:
# Fraction of the candidate pool (the first half of the training images) whose
# images are replaced by their PGD-perturbed counterparts.
POISON_RATE = 0.2

In [40]:
# Pick, without replacement, POISON_RATE of the indices in the first half of the
# training set and overwrite those images with their PGD-perturbed versions.
# NOTE: this mutates img_train in place, so re-running the cell poisons a new
# random subset on top of the previous one. np.random is not seeded here.
num_poison_train  = int(len(img_train)/2.0 * POISON_RATE)
poison_idx_train  = np.random.choice(int(len(img_train)/2.0), num_poison_train, replace=False)

# One gather + one fancy-indexed assignment instead of a Python loop that
# slices the TensorFlow tensor once per selected image.
img_train[poison_idx_train] = tf.gather(poisoned_train, poison_idx_train).numpy()

In [41]:
# Keep untouched copies of the test split for the clean evaluation.
img_test_clean = copy.deepcopy(img_test)
label_test_clean =  copy.deepcopy(label_test)

# Poison the first half of the test split: swap in the PGD-perturbed images and
# move every label to the next digit, wrapping 9 back to 1 (label 0 is not used).
# Done with slice assignment instead of a per-sample Python loop.
num_poison_test = int(len(img_test)/2.0)
img_test[:num_poison_test] = np.asarray(poisoned_test[:num_poison_test])
label_test[:num_poison_test] = np.where(
    label_test[:num_poison_test] == 9,
    1,
    label_test[:num_poison_test] + 1,
)

In [42]:
# how much data will be considered
# how much data will be considered
count_train = 10000
count_test = 3000
n_operands = 2

# operation
op = lambda args: args[0]%args[1]


def _split_into_operands(images, labels, count):
    """Cut `images`/`labels` into `n_operands` consecutive chunks of `count` samples.

    Returns a tuple (images per operand, labels per operand, result labels),
    where the result labels are obtained by applying `op` to the operand
    labels that share the same position.
    """
    bounds = [(k*count, k*count+count) for k in range(n_operands)]
    img_per_operand = [images[start:stop] for start, stop in bounds]
    label_per_operand = [labels[start:stop] for start, stop in bounds]
    label_result = np.apply_along_axis(op,0,label_per_operand)
    return img_per_operand, label_per_operand, label_result


# train data poisoned
img_per_operand_train, label_per_operand_train, label_result_train = \
    _split_into_operands(img_train, label_train, count_train)

# test data poisoned
img_per_operand_test, label_per_operand_test, label_result_test = \
    _split_into_operands(img_test, label_test, count_test)

# test data clean
img_per_operand_test_clean, label_per_operand_test_clean, label_result_test_clean = \
    _split_into_operands(img_test_clean, label_test_clean, count_test)

## Creating tf datasets of specific buffer and batch size

In [43]:
# dataset parameters
buffer_size = 3000
batch_size = 16


def _make_dataset(img_per_operand, label_result, count):
    """Build a shuffled, batched tf.data.Dataset of (operand images..., result label)."""
    components = tuple(img_per_operand)+(label_result,)
    dataset = tf.data.Dataset.from_tensor_slices(components)
    return dataset.take(count).shuffle(buffer_size).batch(batch_size)


# making the poisoned train dataset
ds_train = _make_dataset(img_per_operand_train, label_result_train, count_train)

# making the poisoned test dataset
ds_test = _make_dataset(img_per_operand_test, label_result_test, count_test)

# making the clean test dataset
ds_test_clean = _make_dataset(img_per_operand_test_clean, label_result_test_clean, count_test)


## LTN

In [44]:
# Digit predicate: softmax over the logits of the single-digit classifier.
logits_model = baselines.SingleDigit(inputs_as_a_list=True)
Digit = ltn.Predicate.FromLogits(logits_model, activation_function="softmax")

# Two LTN variables, each ranging over the values 0..9.
d1, d2 = (ltn.Variable(var_name, range(10)) for var_name in ("digits1", "digits2"))

# Fuzzy connectives.
Not, And, Or, Implies = (
    ltn.Wrapper_Connective(fuzzy_operator)
    for fuzzy_operator in (
        ltn.fuzzy_ops.Not_Std(),
        ltn.fuzzy_ops.And_Prod(),
        ltn.fuzzy_ops.Or_ProbSum(),
        ltn.fuzzy_ops.Implies_Reichenbach(),
    )
)

# Fuzzy quantifiers.
Forall = ltn.Wrapper_Quantifier(ltn.fuzzy_ops.Aggreg_pMeanError(), semantics="forall")
Exists = ltn.Wrapper_Quantifier(ltn.fuzzy_ops.Aggreg_pMean(), semantics="exists")

In [45]:
# mask helpers: `modulo` computes first % second for a pair of inputs and
# `equals` compares a pair of inputs element-wise; together they build the
# mask used in the axiom.
modulo = ltn.Function.Lambda(lambda operands: operands[0] % operands[1])
equals = ltn.Predicate.Lambda(lambda pair: pair[0] == pair[1])

### Axioms
@tf.function
def axioms(images_x, images_y, labels_z, p_schedule=tf.constant(2.)):
    """Return the satisfaction level of the modulo axiom on one batch.

    The axiom states: for every (x, y, z) in the batch there exist digits
    d1, d2 such that Digit(x, d1) and Digit(y, d2) hold, where only the
    (d1, d2) pairs with d1 % d2 == z are considered (via the mask).

    Args:
        images_x: batch of images for the first operand
        images_y: batch of images for the second operand
        labels_z: batch of result labels
        p_schedule: `p` passed to the Exists aggregator

    Returns:
        The tensor of the resulting formula (`axiom.tensor`).
    """
    # Wrap the raw batch inputs as LTN variables (the parameter names are rebound).
    images_x = ltn.Variable("x", images_x)
    images_y = ltn.Variable("y", images_y)
    labels_z = ltn.Variable("z", labels_z)
    axiom = Forall(
            # NOTE(review): ltn.diag presumably ties x, y and z together so the
            # quantifier runs over aligned triples rather than all combinations,
            # and it must be evaluated before the inner formula — confirm in LTN docs.
            ltn.diag(images_x,images_y,labels_z),
            Exists(
                (d1,d2),
                And(Digit([images_x,d1]),Digit([images_y,d2])),
                # restrict the existential to digit pairs whose modulo equals the label
                mask=equals([modulo([d1,d2]), labels_z]),
                p=p_schedule
            ),
            # the Forall aggregator uses a fixed p, unlike the scheduled Exists p
            p=2
        )
    sat = axiom.tensor
    return sat

# Smoke test: evaluate the axiom once on the first batch drawn from the
# training dataset (the cell displays the returned satisfaction level).
images_x, images_y, labels_z = next(iter(ds_train.as_numpy_iterator()))
axioms(images_x, images_y, labels_z)

<tf.Tensor: shape=(), dtype=float32, numpy=0.010797321796417236>

Optimizer, training steps and metrics

In [46]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Running means keyed by the name used in the training/test steps; the second
# element of each pair is the name given to the Keras metric object itself.
metrics_dict = {
    metric_key: tf.keras.metrics.Mean(name=metric_name)
    for metric_key, metric_name in (
        ('train_loss', "train_loss"),
        ('train_accuracy', "train_accuracy"),
        ('test_loss', "test_loss"),
        # ('test_accuracy', "test_accuracy"),
        ('clean_accuracy', 'benign_accuracy'),
        ('attack_success_rate', 'asr'),
    )
}

def _modulo_accuracy(images_x, images_y, labels_z):
    """Fraction of the batch for which argmax(x) % argmax(y) equals the label.

    A sample whose predicted divisor is 0 is counted as a miss: integer modulo
    by zero is undefined in TensorFlow (it may raise or return garbage
    depending on the device), so the divisor is replaced by 1 for the
    computation and the sample is excluded from the matches afterwards.
    """
    predictions_x = tf.argmax(logits_model([images_x]),axis=-1)
    predictions_y = tf.argmax(logits_model([images_y]),axis=-1)
    divisor_is_zero = tf.equal(predictions_y, 0)
    safe_divisor = tf.where(divisor_is_zero, tf.ones_like(predictions_y), predictions_y)
    predictions_z = predictions_x % safe_divisor
    match = tf.logical_and(
        tf.logical_not(divisor_is_zero),
        tf.equal(predictions_z,tf.cast(labels_z,predictions_z.dtype)),
    )
    return tf.reduce_mean(tf.cast(match,tf.float32))


@tf.function
def train_step(images_x, images_y, labels_z, **parameters):
    """Run one optimisation step and record the train loss and accuracy.

    `parameters` are forwarded to `axioms` (e.g. `p_schedule`).
    """
    # loss
    with tf.GradientTape() as tape:
        loss = 1.- axioms(images_x, images_y, labels_z, **parameters)
    gradients = tape.gradient(loss, logits_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, logits_model.trainable_variables))
    metrics_dict['train_loss'](loss)
    # accuracy (measured after the weights have been updated)
    metrics_dict['train_accuracy'](_modulo_accuracy(images_x, images_y, labels_z))


@tf.function
def test_step_clean(images_x, images_y, labels_z, **parameters):
    """Record the test loss and the accuracy on a batch of clean test data."""
    # loss
    loss = 1.- axioms(images_x, images_y, labels_z, **parameters)
    metrics_dict['test_loss'](loss)
    # accuracy
    metrics_dict['clean_accuracy'](_modulo_accuracy(images_x, images_y, labels_z))


@tf.function
def test_step_poisoned(images_x, images_y, labels_z, **parameters):
    """Record the attack success rate on a batch of poisoned test data.

    The rate is the fraction of samples whose predicted result matches the
    (shifted) poisoned label. `parameters` is accepted for interface parity
    with the other steps; no loss is recorded here, so the axiom is not
    evaluated.
    """
    metrics_dict['attack_success_rate'](_modulo_accuracy(images_x, images_y, labels_z))

Training

In [47]:
from collections import defaultdict

# Per-epoch keyword arguments for the training/test steps. Epochs outside the
# table fall back to an empty dict, i.e. the default `p_schedule` of `axioms`.
scheduled_parameters = defaultdict(dict)

# (first epoch, last epoch exclusive, p used by the Exists aggregator)
for first_epoch, end_epoch, p_value in (
    (0, 4, 1.),
    (4, 8, 2.),
    (8, 12, 4.),
    (12, 20, 6.),
):
    for epoch in range(first_epoch, end_epoch):
        scheduled_parameters[epoch] = {"p_schedule":tf.constant(p_value)}

In [48]:
# Run the training loop for 20 epochs, matching the 20 epochs covered by
# scheduled_parameters. Note that `ds_test` holds the poisoned test data and is
# passed as `ds_test_poisoned`.
# NOTE(review): commons.train is defined outside this notebook; judging by its
# keyword arguments it presumably evaluates on both the clean and the poisoned
# test set every epoch — confirm against examples/commons.
history = commons.train(
    epochs= 20,
    metrics_dict= metrics_dict,
    ds_train= ds_train,
    ds_test_clean= ds_test_clean,
    ds_test_poisoned= ds_test,
    train_step= train_step,
    test_step_clean= test_step_clean,
    test_step_poisoned= test_step_poisoned,
    scheduled_parameters=scheduled_parameters
)

Epoch 0, train_loss: 0.9408, train_accuracy: 0.6796, test_loss: 0.9234, clean_accuracy: 0.8381, attack_success_rate: 0.2437
Epoch 1, train_loss: 0.9169, train_accuracy: 0.8906, test_loss: 0.9152, clean_accuracy: 0.8986, attack_success_rate: 0.2437
Epoch 2, train_loss: 0.9141, train_accuracy: 0.9133, test_loss: 0.9124, clean_accuracy: 0.9172, attack_success_rate: 0.2430
Epoch 3, train_loss: 0.9125, train_accuracy: 0.9277, test_loss: 0.9128, clean_accuracy: 0.9112, attack_success_rate: 0.2417
Epoch 4, train_loss: 0.7512, train_accuracy: 0.9322, test_loss: 0.7486, clean_accuracy: 0.9232, attack_success_rate: 0.2460
Epoch 5, train_loss: 0.7454, train_accuracy: 0.9451, test_loss: 0.7511, clean_accuracy: 0.9186, attack_success_rate: 0.2400
Epoch 6, train_loss: 0.7435, train_accuracy: 0.9474, test_loss: 0.7458, clean_accuracy: 0.9328, attack_success_rate: 0.2447
Epoch 7, train_loss: 0.7404, train_accuracy: 0.9567, test_loss: 0.7481, clean_accuracy: 0.9245, attack_success_rate: 0.2424
Epoch 8,

In [49]:
# Print the current value of the clean accuracy, then of the attack success rate.
for metric_key in ('clean_accuracy', 'attack_success_rate'):
    print(metrics_dict[metric_key].result().numpy())

0.9488032
0.24102394
