# Data Preprocessing Experiments

In [1]:
import os
import sys
# Native (C++) TensorFlow log filter; it is set before TensorFlow is imported below.
# 0 = all logs, 1 = filter INFO, 2 = filter INFO & WARNING, 3 = filter INFO, WARNING & ERROR
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # use "3" to hide even ERROR logs

import tensorflow as tf

# Silence TensorFlow's Python logger as well
tf.get_logger().setLevel("ERROR")

# Silence absl logs that TF uses
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)


# Best-effort: also lower the verbosity of the TF1-compat logger.
# Any exception raised here is ignored so the cell keeps running.
try:
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
except Exception:
    pass

# Dataset access, ResNet50 model + its input preprocessing, and plotting libraries
import tensorflow_datasets as tfds
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import ResNet50
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Tunable constants
IMG_SIZE = 224  # side length used for the square model input
BATCH_SIZE = 200
AUTOTUNE = tf.data.AUTOTUNE
EPOCHS = 5  # kept low for demonstration in case retraining is needed
tf.random.set_seed(5)
dataset_dir = "../datasets"

# When running on Colab, mount Google Drive and read the datasets from there.
if 'google.colab' in sys.modules:
    from google.colab import drive

    drive.mount('/content/drive')
    dataset_dir = "/content/drive/Othercomputers/Big Mac/datasets"


physical_gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", physical_gpus)

try:
    # Keep the float32 policy, as in the original notebook
    tf.keras.mixed_precision.set_global_policy('float32')
    # Only configure a virtual device when at least one GPU was detected
    if len(physical_gpus) > 0:
        # Limit RAM to 55GB to avoid starving PC
        gpu_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=56320)
        tf.config.experimental.set_virtual_device_configuration(physical_gpus[0], [gpu_cap])
        print("Using GPU with 55GB of memory")
except Exception as e:
    # Configuration problems are reported but do not stop the notebook
    print(e)

Mounted at /content/drive
Available GPUs: []


In [None]:
# Load ImageNet data

def prepare_input_data(input):
    """Turn one dataset example into an ``(image, label)`` pair for ResNet50.

    Args:
        input: mapping with an ``'image'`` entry and a ``'label'`` entry.

    Returns:
        Tuple ``(image, label)``: the image cast to float32, resized with
        padding to ``IMG_SIZE`` x ``IMG_SIZE`` and passed through ResNet50's
        ``preprocess_input``; the label is returned unchanged.
    """
    pixels = tf.image.resize_with_pad(
        tf.cast(input['image'], tf.float32), IMG_SIZE, IMG_SIZE
    )
    return preprocess_input(pixels), input['label']

# ImageNet (ILSVRC-2012) dataset. It is read from `dataset_dir` so that the
# Colab branch of the configuration cell (Google Drive path) is honoured
# instead of a second hard-coded path.
dataset, info = tfds.load(
    'imagenet2012',
    shuffle_files=False,
    with_info=True,
    data_dir=dataset_dir
)

# Dataset stats
# Double-quoted f-strings: reusing the same quote character inside the
# replacement field is a SyntaxError before Python 3.12.
print(f"Train image count: {info.splits['train'].num_examples}")
print(f"Test image count: {info.splits['validation'].num_examples}")

# Preprocess data
# The training pipeline is not needed by the cells below and stays disabled.
# train_dataset = dataset['train'].map(prepare_input_data, num_parallel_calls=AUTOTUNE).batch(BATCH_SIZE).prefetch(AUTOTUNE)
# `test_dataset` is consumed by the adversarial-generation cell, so it must be
# defined here for the notebook to survive Restart & Run All.
test_dataset = dataset['validation'].map(prepare_input_data, num_parallel_calls=AUTOTUNE).batch(BATCH_SIZE).prefetch(AUTOTUNE)

Train image count: 1281167
Test image count: 50000


In [None]:
# Load ResNet50 model
from tensorflow.keras.applications import ResNet50


# ImageNet-pretrained network including its 1000-way softmax classification head
base_model = ResNet50(
    weights='imagenet',
    include_top=True,
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    pooling=None,
    classes=1000,
    classifier_activation='softmax',
)

# Compile with a sparse (integer-label) loss and an accuracy metric
base_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Path of the TFRecord file the adversarial examples are written to and later read back from
filename = '../datasets/adversaries/test_dataset1.tfrecord'

def _bytes_feature(value):
    """Wrap a byte string in a ``tf.train.Feature`` holding a ``BytesList``.

    A value of the same type as ``tf.constant(0)`` (an eager tensor) is first
    converted with ``.numpy()``; anything else is stored as given.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding an ``Int64List``."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)


def _create_adversary_with_pgd(model, images, labels, eps, eps_iter, nb_iter):
    """Craft adversarial inputs with iterated signed-gradient ascent steps.

    Starting from a copy of ``images``, each iteration moves the current
    inputs by ``eps_iter`` along the sign of the loss gradient and then clips
    the total perturbation element-wise to ``[-eps, eps]``.

    Args:
        model: callable invoked as ``model(x, training=False)``.
        images: batch of model inputs to attack.
        labels: integer class labels (sparse categorical cross-entropy is used).
        eps: bound on the absolute per-element perturbation.
        eps_iter: step size of a single iteration.
        nb_iter: number of iterations; with 0 a copy of ``images`` is returned.

    Returns:
        Tensor of adversarial inputs with the same shape as ``images``.
    """
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy()
    adversarial = tf.identity(images)

    for _step in range(nb_iter):
        with tf.GradientTape() as grad_tape:
            # The inputs are not variables, so they must be watched explicitly
            grad_tape.watch(adversarial)
            loss_value = cross_entropy(labels, model(adversarial, training=False))

        ascent_direction = tf.sign(grad_tape.gradient(loss_value, adversarial))
        stepped = adversarial + eps_iter * ascent_direction
        # Project back so that no element deviates from the original by more than eps
        adversarial = images + tf.clip_by_value(stepped - images, -eps, eps)

    return adversarial

def generate_adversarial_dataset(dataset, model, eps, pgd_steps, pgd_step_size, output_path=None):
    """
    Generates adversarial examples and saves them to a GZIP-compressed
    TFRecord file.

    Every adversarial image is cast to float16 and stored as a serialized
    tensor under the 'image' key; its label is stored as an int64 under the
    'label' key.

    NOTE(review): the float16 cast is lossy. If the perturbation budget is
    small compared with the magnitude of the input values, part of the
    perturbation may be rounded away on disk - confirm against the chosen eps.

    Args:
        dataset: iterable of (images, labels) batches (iterated eagerly).
        model: model under attack, forwarded to `_create_adversary_with_pgd`.
        eps: maximum absolute per-element perturbation.
        pgd_steps: number of attack iterations.
        pgd_step_size: step size of a single attack iteration.
        output_path: destination TFRecord file. Defaults to the module-level
            `filename` when omitted, which is the previous behaviour.

    Returns:
        None. Progress and the number of saved examples are printed.
    """
    target_path = filename if output_path is None else output_path

    # The writer does not create missing directories, so make sure the
    # destination folder exists before opening the file.
    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        tf.io.gfile.makedirs(parent_dir)

    options = tf.io.TFRecordOptions(compression_type="GZIP")
    n = 0
    with tf.io.TFRecordWriter(target_path, options=options) as writer:
        for batch_index, (images, labels) in enumerate(dataset):
            print(f"Batch {batch_index+1}")
            # Generate the adversarial images (these are already preprocessed)
            adv_images = _create_adversary_with_pgd(
                model=model,
                images=images,
                labels=labels,
                eps=eps,
                eps_iter=pgd_step_size,
                nb_iter=pgd_steps
            )

            # 1. Cast the whole batch to float16 once to halve its size on disk,
            #    and pull the labels to host memory once per batch.
            adv_images_f16 = tf.cast(adv_images, tf.float16)
            label_values = labels.numpy()

            # 2. Write one Example per image/label pair
            for image_f16, label_value in zip(adv_images_f16, label_values):
                # 3. Serialize the tensor and wrap both fields as features
                image_bytes = tf.io.serialize_tensor(image_f16)
                feature = {
                    'image': _bytes_feature(image_bytes),  # serialized float16 tensor
                    'label': _int64_feature(int(label_value))
                }
                n += 1

                serialized_example = tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()
                writer.write(serialized_example)

    print("Created adversary dataset and saved: ", n, '\n')


def create_adversarial_dataset_direct(dataset, model, eps, pgd_steps, pgd_step_size):
    """Build an in-memory adversarial dataset without going through a file.

    Every batch of ``dataset`` is attacked with ``_create_adversary_with_pgd``;
    the results are concatenated and re-batched with ``BATCH_SIZE``.

    Args:
        dataset: iterable of (images, labels) batches.
        model: model under attack.
        eps: maximum absolute per-element perturbation.
        pgd_steps: number of attack iterations.
        pgd_step_size: step size of a single attack iteration.

    Returns:
        A batched and prefetched ``tf.data.Dataset`` of
        (adversarial_images, labels).
    """
    image_batches, label_batches = [], []

    # Attack the source dataset batch by batch
    for batch_no, (images, labels) in enumerate(dataset, start=1):
        print(f" batch {batch_no}...")
        image_batches.append(
            _create_adversary_with_pgd(
                model=model,
                images=images,
                labels=labels,
                eps=eps,
                eps_iter=pgd_step_size,
                nb_iter=pgd_steps,
            )
        )
        label_batches.append(labels)

    # Merge the per-batch results along the batch axis
    all_images = tf.concat(image_batches, axis=0)
    all_labels = tf.concat(label_batches, axis=0)

    # Slice into single examples again, then batch/prefetch like the evaluation pipeline
    return (
        tf.data.Dataset.from_tensor_slices((all_images, all_labels))
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

# Attack hyper-parameters forwarded to the adversarial-generation functions.
# NOTE(review): these magnitudes look like values chosen for inputs scaled to [0, 1],
# while the attack runs on images that already went through ResNet50's
# `preprocess_input` - confirm the budget is expressed in the same units as those inputs.
EPSILON = 0.03  # bound on the absolute per-element perturbation
PGD_STEPS = 2  # number of attack iterations
PGD_STEP_SIZE = 0.007  # step size of a single iteration

In [None]:
## Create adversarial dataset
# Only the first three batches of the test pipeline are attacked and written to disk
generate_adversarial_dataset(
    test_dataset.take(3),
    base_model,
    eps=EPSILON,
    pgd_steps=PGD_STEPS,
    pgd_step_size=PGD_STEP_SIZE,
)

# _new_test = create_adversarial_dataset_direct(
#     dataset=test_dataset.take(3),
#     model=base_model,
#     eps=EPSILON,
#     pgd_steps=PGD_STEPS,
#     pgd_step_size=PGD_STEP_SIZE
# )

Batch 1
Batch 2
Batch 3
Created adversary dataset and saved:  600 



2025-07-19 18:12:39.887445: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# --- How to load the data back ---

def _parse_function(proto):
    """
    Decode one serialized example into an ``(image, label)`` pair.

    The 'image' entry is parsed as a float16 tensor, cast to float32 and given
    the static shape ``[IMG_SIZE, IMG_SIZE, 3]``; the 'label' entry is returned
    as an int64 scalar.
    """
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(proto, schema)

    # Bytes -> float16 tensor -> float32 tensor for the model
    image = tf.cast(
        tf.io.parse_tensor(example['image'], out_type=tf.float16),
        tf.float32,
    )

    # parse_tensor cannot infer a static shape, so it is declared explicitly
    image.set_shape([IMG_SIZE, IMG_SIZE, 3])

    return image, example['label']

# Open the GZIP-compressed TFRecord file as a dataset of serialized examples
loaded_dataset = tf.data.TFRecordDataset(filename, compression_type='GZIP')

# Decode every record, then batch and prefetch for evaluation
parsed_dataset = (
    loaded_dataset
    .map(_parse_function)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:


# `parsed_dataset` is already batched (BATCH_SIZE examples per batch), so no
# `batch_size` is passed: for dataset inputs the batching comes from the dataset
# itself and Keras documents that `batch_size` should not be specified.
# `steps=2` limits the evaluation to the first two batches of the dataset.
results = base_model.evaluate(parsed_dataset, verbose=0, steps=2)

print("\n--- Evaluation Results from data loaded from file ---")
print(f"Loss: {results[0]:.4f}")
print(f"Top-1 Accuracy: {results[1] * 100:.2f}%")







--- Evaluation Results from data loaded from file ---
Loss: 1.6013
Top-1 Accuracy: 64.25%


In [None]:
# results = base_model.evaluate(_new_test, verbose=0, batch_size=10, steps=2)

# print("\n--- Evaluation Results from directly generated data ---")
# print(f"Loss: {results[0]:.4f}")
# print(f"Top-1 Accuracy: {results[1] * 100:.2f}%")