# Data preprocessing

In [7]:
import os
import sys
import glob

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) log output and has to
# be in the environment before `import tensorflow` runs; setting it afterwards
# is too late to silence the start-up messages.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import ResNet50
import matplotlib.pyplot as plt
import seaborn as sns

# Python-side TensorFlow logger: only report errors.
tf.get_logger().setLevel("ERROR")

In [8]:
# Notebook-wide configuration
IMG_SIZE = 224
INPUT_SHAPE = (IMG_SIZE, IMG_SIZE, 3)
EPOCHS = 5
AUTOTUNE = tf.data.AUTOTUNE
tf.random.set_seed(5)  # fixed seed for TensorFlow's global random generator

running_in_colab = 'google.colab' in sys.modules

if running_in_colab:
    # On Colab the datasets are read from a mounted Google Drive folder and a
    # larger batch size is used.
    from google.colab import drive

    drive.mount('/content/drive')
    dataset_dir = "/content/drive/Othercomputers/Big Mac/datasets"
    BATCH_SIZE = 430

    # Make sure the dataset directory is there before anything reads from it
    tf.io.gfile.makedirs(dataset_dir)
else:
    # Local run: datasets sit next to the notebook's parent folder
    dataset_dir = "../datasets"
    BATCH_SIZE = 300

physical_gpus = tf.config.list_physical_devices('GPU')
print("Using available GPUs: ", physical_gpus)

# Keep every layer computing in float32
tf.keras.mixed_precision.set_global_policy('float32')

Using available GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [9]:
# Load ImageNet2012 subset dataset
def prepare_input_data(input):
    """Convert one dataset example into a model-ready ``(image, label)`` pair.

    The example's ``'image'`` entry is cast to float32, resized to
    ``IMG_SIZE x IMG_SIZE`` and passed through ResNet50's ``preprocess_input``;
    the ``'label'`` entry is returned unchanged.
    """
    pixels = tf.image.resize(
        tf.cast(input['image'], tf.float32),
        (IMG_SIZE, IMG_SIZE),
    )
    return preprocess_input(pixels), input['label']

def make_dataset(ds):
    """Build the input pipeline for ``ds``: preprocess, batch, then prefetch.

    Every element is mapped through ``prepare_input_data`` (in parallel), the
    results are grouped into batches of ``BATCH_SIZE`` and prefetching is
    enabled on the batched dataset.
    """
    preprocessed = ds.map(prepare_input_data, num_parallel_calls=AUTOTUNE)
    batched = preprocessed.batch(BATCH_SIZE)
    return batched.prefetch(AUTOTUNE)

# The official validation split is cut in half: the first half is used for
# validation, the second half serves as the test set.
VALIDATION_SPLIT = 'validation[:50%]'
TEST_SPLIT = 'validation[50%:]'

raw_splits, info = tfds.load(
    'imagenet2012_subset/10pct',
    split=['train', VALIDATION_SPLIT, TEST_SPLIT],
    shuffle_files=False,
    with_info=True,
    data_dir=dataset_dir
)
train, validation, test = raw_splits

num_classes = info.features['label'].num_classes

# Report the size of each split as recorded in the dataset metadata
split_info = info.splits
print(f"Train count: {split_info['train'].num_examples}")
print(f"Validation count: {split_info[VALIDATION_SPLIT].num_examples}")
print(f"Test count: {split_info[TEST_SPLIT].num_examples}")
print(f"No of classes {num_classes}")

# Model-ready (preprocessed, batched, prefetched) pipelines for each split
train_dataset, validation_dataset, test_dataset = (
    make_dataset(train),
    make_dataset(validation),
    make_dataset(test),
)

Train count: 128116
Validation[:50%] count: 25000
Validation[50%:] (test) count: 25000
No of classes 1000


In [None]:
## Generate dataset class distribution
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

AUTOTUNE = tf.data.AUTOTUNE

def class_counts_from_raw_ds(raw_ds, num_classes):
    """Count the examples of each class in ``raw_ds``.

    Args:
        raw_ds: Dataset whose elements are dicts with a ``'label'`` entry.
        num_classes: Length of the returned count vector.

    Returns:
        NumPy int64 array of length ``num_classes``; entry ``c`` is the number
        of examples labelled ``c``.
    """
    def to_one_hot(example):
        # One row per example with a single 1 at the label position
        return tf.one_hot(example['label'], num_classes, dtype=tf.int64)

    def accumulate(running_total, one_hot_batch):
        # Summing the rows of a batch gives that batch's per-class counts
        return running_total + tf.reduce_sum(one_hot_batch, axis=0)

    one_hot_batches = raw_ds.map(to_one_hot, num_parallel_calls=AUTOTUNE).batch(4096)
    totals = one_hot_batches.reduce(
        initial_state=tf.zeros([num_classes], dtype=tf.int64),
        reduce_func=accumulate,
    )
    return totals.numpy()

def print_distribution(name, counts, class_names=None, top_k=5):
    """Print a short summary of a per-class count vector.

    Prints a header with the total and the number of classes, followed by the
    ``top_k`` least frequent and the ``top_k`` most frequent classes (index,
    name, count and share of the total).

    Args:
        name: Label shown in the header line.
        counts: 1-D NumPy array of per-class example counts.
        class_names: Optional sequence of names indexed by class id. When
            omitted, the class index (as a string) is used as the name.
        top_k: How many classes to list at each end of the distribution.
    """
    total = counts.sum()
    print(f"\n{name}: total={total}, classes={len(counts)}")
    if class_names is None:
        class_names = [str(i) for i in range(len(counts))]

    def print_row(i):
        # Guard the share so an all-zero count vector does not divide by zero.
        share = counts[i] / total if total else 0.0
        print(f"{i:4d} {class_names[i]:30s} {int(counts[i]):7d} ({share:.2%})")

    # Ascending order: rarest classes first, most common last.
    idx_sorted = np.argsort(counts)

    print(f"Least frequent {top_k}:")
    for i in idx_sorted[:top_k]:
        print_row(i)

    # The header belongs outside the loop above so it is printed exactly once.
    print(f"Most frequent {top_k}:")
    # Reverse before slicing so that top_k=0 lists nothing instead of everything.
    for i in idx_sorted[::-1][:top_k]:
        print_row(i)


# Class-id -> label-name lookup taken from the dataset metadata; it has to be
# defined here because nothing earlier in the notebook creates `class_names`.
class_names = info.features['label'].names

# Per-class example counts of the raw (un-preprocessed) splits
train_counts = class_counts_from_raw_ds(train, num_classes)
val_counts = class_counts_from_raw_ds(validation, num_classes)
test_counts = class_counts_from_raw_ds(test, num_classes)

print_distribution("Train dataset", train_counts, class_names)
print_distribution("Validation dataset", val_counts, class_names)
print_distribution("Test dataset", test_counts, class_names)

In [None]:
# Load ResNet50 model
# Complete classifier (top included) initialised with the 'imagenet' weights.
resnet50_options = dict(
    include_top=True,
    weights='imagenet',
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    classes=1000,
    classifier_activation='softmax',
)

base_model = ResNet50(**resnet50_options)


In [None]:
def _bytes_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a one-element bytes list.

    If ``value`` is an eager tensor, its ``.numpy()`` content is stored;
    anything else is stored as given.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding an int64 list."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)


def _create_adversary_with_pgd(model, images, labels, eps, eps_iter, nb_iter,
                               clip_min=None, clip_max=None):
    """
    Generate adversarial images with projected gradient descent (L-infinity).

    Starting from the clean images, each iteration moves every pixel by
    ``eps_iter`` in the direction of the sign of the loss gradient and then
    projects the result back into the ``eps``-ball around the original images.
    When ``clip_min`` / ``clip_max`` are given, the result is additionally
    clamped to that range after every step so the adversarial images stay
    inside the valid input domain of the model.

    Args:
        model (tf.keras.Model): The model to attack. It is called with
            ``training=False`` and its output is fed to a default
            ``SparseCategoricalCrossentropy`` loss.
        images (tf.Tensor): The original, clean input images, exactly as the
            model expects them (i.e. already preprocessed).
        labels (tf.Tensor): The true integer labels for the images.
        eps (float): The maximum perturbation (L-infinity norm), expressed in
            the same units as ``images``.
        eps_iter (float): The step size for each attack iteration, in the same
            units as ``images``.
        nb_iter (int): The number of PGD iterations to perform.
        clip_min: Optional lower bound for the adversarial images (scalar or a
            tensor broadcastable to ``images``). ``None`` disables it.
        clip_max: Optional upper bound for the adversarial images (scalar or a
            tensor broadcastable to ``images``). ``None`` disables it.

    Returns:
        tf.Tensor: The generated adversarial images.
    """
    x_adv = tf.identity(images)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

    for _ in range(nb_iter):
        with tf.GradientTape() as tape:
            # x_adv is a plain tensor, not a variable, so it must be watched
            # explicitly for the tape to record gradients with respect to it.
            tape.watch(x_adv)
            prediction = model(x_adv, training=False)
            loss = loss_object(labels, prediction)

        gradients = tape.gradient(loss, x_adv)
        # Ascend the loss: step along the sign of the gradient.
        x_adv = x_adv + eps_iter * tf.sign(gradients)
        # Project back into the eps-ball centred on the clean images.
        perturbation = tf.clip_by_value(x_adv - images, -eps, eps)
        x_adv = images + perturbation
        # Optionally keep the result inside the valid input range. The bounds
        # are applied separately so that either one can be left unset.
        if clip_min is not None:
            x_adv = tf.maximum(x_adv, clip_min)
        if clip_max is not None:
            x_adv = tf.minimum(x_adv, clip_max)

    return x_adv

def generate_adversarial_dataset(folder, dataset, model, eps, steps, step_size):
    """
    Generate PGD adversarial examples for every batch of ``dataset`` and save
    them as GZIP-compressed TFRecord files, one file per batch.

    Each record stores the adversarial image as a serialized float16 tensor
    (``tf.io.serialize_tensor``) under ``'image'`` and the integer label under
    ``'label'``.

    Args:
        folder (str): Path prefix of the output files. Batch ``k`` of ``N`` is
            written to ``{folder}-record-{k}-of-{N}.tfrec``; the directory part
            of the prefix is created if it does not exist.
        dataset (tf.data.Dataset): Batched dataset yielding
            ``(images, labels)`` pairs.
        model (tf.keras.Model): Model handed to the PGD attack.
        eps (float): Maximum perturbation passed to the attack.
        steps (int): Number of PGD iterations.
        step_size (float): Step size of each PGD iteration.
    """
    options = tf.io.TFRecordOptions(compression_type="GZIP")
    total_images_processed = 0

    # Create the output directory up front so the record writer does not fail
    # on a missing path after an expensive attack has already been computed.
    output_dir = os.path.dirname(folder)
    if output_dir:
        tf.io.gfile.makedirs(output_dir)

    cardinality = tf.data.experimental.cardinality(dataset)
    if cardinality == tf.data.experimental.UNKNOWN_CARDINALITY:
        print("Warning: Dataset cardinality is unknown. Filenames will use 'N' for total batches.")
        total_no_of_batches = "N"
    else:
        total_no_of_batches = cardinality.numpy()
        print(f"Dataset has a total of {total_no_of_batches} batches.")

    for batch_number, (images, labels) in enumerate(dataset, start=1):
        print(f"Processing record {batch_number} out of {total_no_of_batches}")
        filename = f"{folder}-record-{batch_number}-of-{total_no_of_batches}.tfrec"
        # Generate the adversarial images (these are already preprocessed)
        adv_images = _create_adversary_with_pgd(
            model=model,
            images=images,
            labels=labels,
            eps=eps,
            eps_iter=step_size,
            nb_iter=steps
        )
        # Convert once per batch instead of once per image: images are stored
        # as float16, labels are pulled to the host in a single transfer.
        adv_images_f16 = tf.cast(adv_images, tf.float16)
        label_values = labels.numpy()

        with tf.io.TFRecordWriter(filename, options=options) as writer:
            # Write one Example per image/label pair of the batch
            for image_f16, label in zip(adv_images_f16, label_values):
                feature = {
                    # Raw serialized (float16) tensor
                    'image': _bytes_feature(tf.io.serialize_tensor(image_f16)),
                    'label': _int64_feature(int(label)),
                }
                example = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())
                total_images_processed += 1

    print(f"Processed and saved {total_images_processed} images")


In [None]:
# Generate adversarial train, validation and test data
#
# The attack runs on images that went through ResNet50's `preprocess_input`,
# which reorders the channels and subtracts the ImageNet channel means but does
# not rescale, so pixel values are still on the 0-255 intensity scale. The
# customary "8/255" L-infinity budget (defined for [0, 1] images) therefore
# equals 8 in these units. Writing 8/255 here would allow only ~0.03 intensity
# levels, which is on the order of the float16 rounding step the records are
# saved with.
EPSILON = 8.0
STEPS = 2
STEP_SIZE = 8.0  # kept equal to EPSILON

adversary_root = f"{dataset_dir}/adversaries/imagenet2012_subset"
train_folder = f"{adversary_root}/train"
validation_folder = f"{adversary_root}/validation"
test_folder = f"{adversary_root}/test"

# One entry per split: (name used in the log line, output prefix, input pipeline)
adversarial_jobs = [
    ("train", train_folder, train_dataset),
    ("validation", validation_folder, validation_dataset),
    ("test", test_folder, test_dataset),
]

for split_name, split_folder, split_dataset in adversarial_jobs:
    print(f"Generating adversarial {split_name} data")
    generate_adversarial_dataset(
        folder=split_folder,
        dataset=split_dataset,
        model=base_model,
        eps=EPSILON,
        steps=STEPS,
        step_size=STEP_SIZE)


Generating adversarial validation data
Dataset has a total of 84 batches.
Processing record 1 out of 84
Processing record 2 out of 84
Processing record 3 out of 84
Processing record 4 out of 84
Processing record 5 out of 84
Processing record 6 out of 84
Processing record 7 out of 84
Processing record 8 out of 84
Processing record 9 out of 84
Processing record 10 out of 84
Processing record 11 out of 84
Processing record 12 out of 84
Processing record 13 out of 84
Processing record 14 out of 84
Processing record 15 out of 84
Processing record 16 out of 84
Processing record 17 out of 84


KeyboardInterrupt: 