# Data Preprocessing Experiments

In [2]:
%pip install tensorflow tensorflow-datasets -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.5/24.5 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m114.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m115.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import sys
import tensorflow as tf

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # use "3" to hide even ERROR logs
tf.get_logger().setLevel("ERROR")

import tensorflow_datasets as tfds
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import ResNet50
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [4]:
# Constants
IMG_SIZE = 224
BATCH_SIZE = 200
AUTOTUNE = tf.data.AUTOTUNE
EPOCHS = 5 # Changed to a lower number for demonstration if retraining is needed
tf.random.set_seed(5)
dataset_dir = "../datasets"

if 'google.colab' in sys.modules:
    from google.colab import drive

    drive.mount('/content/drive')
    dataset_dir = "/content/drive/Othercomputers/Big Mac/datasets"
    BATCH_SIZE = 450

    # Ensure the directory exists
    tf.io.gfile.makedirs(dataset_dir)


physical_gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", physical_gpus)

try:
    tf.keras.mixed_precision.set_global_policy('float32') # Ensured float32 policy as per the original notebook
    if physical_gpus: # Check if GPUs are available before setting virtual device
        # tf.config.experimental.set_virtual_device_configuration(
        #     physical_gpus[0],
        #     [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=56320)]  # Limit RAM to 55GB to avoid starving PC
        # )
        print("Using GPU with 55GB of memory")
except Exception as e:
    print(e)

Mounted at /content/drive
Available GPUs: []


In [5]:
# Load ImageNet data
def prepare_input_data(input):
    image = tf.cast(input['image'], tf.float32)
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE)) # Corrected to use the casted image
    image = preprocess_input(image)
    label = input['label']
    return image, label


# dataset, info = tfds.load(
#     'imagenette',
#     shuffle_files=False,
#     with_info=True,
#     data_dir=dataset_dir
# )


dataset, info = tfds.load(
    'imagenet2012',
    shuffle_files=False,
    with_info=True,
    data_dir=dataset_dir
)

train_dataset_image_count = info.splits["train"].num_examples
test_dataset_image_count = info.splits["validation"].num_examples

print(f'Train image count: {train_dataset_image_count}')
print(f'Test image count: {test_dataset_image_count}')

train_dataset = dataset['train'].map(prepare_input_data, num_parallel_calls=AUTOTUNE).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_dataset = dataset['validation'].map(prepare_input_data, num_parallel_calls=AUTOTUNE).batch(BATCH_SIZE).prefetch(AUTOTUNE)


Train image count: 1281167
Test image count: 50000


In [6]:
# Load ResNet50 model
# base_model = ResNet50(
#     include_top=True,
#     weights=None,
#     input_shape=(IMG_SIZE, IMG_SIZE, 3),
#     pooling=None,
#     classes=10,
#     classifier_activation='softmax'
# )

base_model = ResNet50(
    include_top=True,
    weights='imagenet',
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    pooling=None,
    classes=1000,
    classifier_activation='softmax'
)

# Functions for adversarial data generation and loading
def _bytes_feature(value):
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _create_adversary_with_pgd(model, images, labels, eps, eps_iter, nb_iter):
    x_adv = tf.identity(images)
    # Use from_logits=False because classifier_activation='softmax' means model outputs probabilities
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

    for _ in range(nb_iter):
        with tf.GradientTape() as tape:
            tape.watch(x_adv)
            prediction = model(x_adv, training=False)
            loss = loss_object(labels, prediction)

        gradients = tape.gradient(loss, x_adv)
        signed_grad = tf.sign(gradients)
        x_adv = x_adv + eps_iter * signed_grad
        perturbation = tf.clip_by_value(x_adv - images, -eps, eps)
        x_adv = images + perturbation

    return x_adv


def generate_adversarial_dataset_in_batches(folder, dataset, model, eps, pgd_steps, pgd_step_size, count):
    options = tf.io.TFRecordOptions(compression_type="GZIP")
    image_count = 0
    dataset_iterator = iter(dataset)
    batch_count = math.ceil(count / BATCH_SIZE)

    for i, (images, labels) in enumerate(dataset_iterator):
        batch_no = f"{folder}-record-{i}-of-{batch_count}.tfrec"
        print(f"batch_{i}")
        adv_images = _create_adversary_with_pgd(
            model=model,
            images=images,
            labels=labels,
            eps=eps,
            eps_iter=pgd_step_size,
            nb_iter=pgd_steps
        )

        with tf.io.TFRecordWriter(batch_no, options=options) as writer:
            for i in range(len(adv_images)):
                image_tensor = adv_images[i]
                label = labels[i]
                image_tensor_f16 = tf.cast(image_tensor, tf.float16)
                image_bytes = tf.io.serialize_tensor(image_tensor_f16)
                feature = {
                    'image': _bytes_feature(image_bytes),
                    'label': _int64_feature(label.numpy())
                }
                image_count += 1
                serialized_example = tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()
                writer.write(serialized_example)
    print(f"Processed and saved: {image_count} images")

def _parse_function(proto):
    feature_description = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed_features = tf.io.parse_single_example(proto, feature_description)
    image_f16 = tf.io.parse_tensor(parsed_features['image'], out_type=tf.float16)
    label = parsed_features['label']
    image_f32 = tf.cast(image_f16, tf.float32)
    image_f32.set_shape([IMG_SIZE, IMG_SIZE, 3])
    return image_f32, label

base_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy'),
    ]
)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
[1m102967424/102967424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [7]:
print("Training base model...\n")
# base_model.fit(train_dataset, verbose=1, batch_size=BATCH_SIZE, epochs=4)

Training base model...



In [None]:
# Create adversarial dataset
EPSILON = 8/255 # Maximum allowed change.
PGD_STEPS = 5 # use 5 steps
PGD_STEP_SIZE = 2/255 # change by much at each step
# adversarial_test_file = f'{dataset_dir}/adversaries/test_dataset.tfrec'
test_dataset_folder = f'{dataset_dir}/adversaries/imagenet2012/test'
print("Generate adversarial test data")
generate_adversarial_dataset_in_batches(
    folder=test_dataset_folder,
    dataset=test_dataset,
    model=base_model,
    eps=EPSILON,
    pgd_steps=PGD_STEPS,
    pgd_step_size=PGD_STEP_SIZE,
    count=test_dataset_image_count,
)

# train_dataset_folder = f'{dataset_dir}/adversaries/imagenet2012/train'
# print("Generate adversarial train data")
# generate_adversarial_dataset_in_batches(
#     folder=train_dataset_folder,
#     dataset=train_dataset,
#     model=base_model,
#     eps=EPSILON,
#     pgd_steps=PGD_STEPS,
#     pgd_step_size=PGD_STEP_SIZE,
#     count=train_dataset_image_count,
# )


Generate adversarial test data
batch_0
batch_1
batch_2
batch_3
batch_4
batch_5
batch_6
batch_7
batch_8
batch_9
batch_10
batch_11
batch_12
batch_13
batch_14
batch_15
batch_16
batch_17
batch_18
batch_19
batch_20
batch_21
batch_22
batch_23
batch_24
batch_25
batch_26
batch_27
batch_28
batch_29
batch_30
batch_31
batch_32
batch_33
batch_34
batch_35
batch_36
batch_37
batch_38
batch_39
batch_40
batch_41
batch_42
batch_43
batch_44
batch_45
batch_46
batch_47
batch_48
batch_49
batch_50
batch_51
batch_52
batch_53
batch_54
batch_55
batch_56
batch_57


In [None]:
# import tensorflow as tf
# import glob

# # Use glob to get a list of all TFRecord files
# file_paths = glob.glob(f'{dataset_dir}/adversaries/small_test/batch_*.tfrec')
# print(f"Found the following TFRecord files: {file_paths}")

# # Create a TFRecordDataset from the list of file paths
# # The files are interleaved automatically for better performance
# raw_dataset = tf.data.TFRecordDataset(file_paths,  compression_type='GZIP')

# ## Load data from file
# def _parse_function(proto):
#     """
#     Parses a single example proto by deserializing the float16 tensor
#     and casting it back to float32.
#     """
#     feature_description = {
#         'image': tf.io.FixedLenFeature([], tf.string),
#         'label': tf.io.FixedLenFeature([], tf.int64),
#     }
#     parsed_features = tf.io.parse_single_example(proto, feature_description)

#     # 1. Deserialize the byte string back into a float16 tensor
#     image_f16 = tf.io.parse_tensor(parsed_features['image'], out_type=tf.float16)
#     label = parsed_features['label']

#     # 2. Cast the image back to float32 for the model
#     image_f32 = tf.cast(image_f16, tf.float32)

#     # 3. Set the shape on the final float32 tensor
#     image_f32.set_shape([IMG_SIZE, IMG_SIZE, 3])

#     return image_f32, label

# # Load the TFRecord file back into a dataset
# # loaded_test_dataset = tf.data.TFRecordDataset(adversarial_test_file, compression_type='GZIP')

# parsed_test_dataset = raw_dataset.map(_parse_function).batch(200).prefetch(tf.data.AUTOTUNE)

# def count_images_in_dataset(dataset):
#     """
#     Counts the number of images in an unbatched TF.data.Dataset.

#     Args:
#         dataset: The unbatched tf.data.Dataset containing the images.

#     Returns:
#         The total number of images as an integer.
#     """
#     # Use cardinality() to get the number of elements.
#     # This is the most efficient method as it doesn't require iterating.
#     count = tf.data.experimental.cardinality(dataset).numpy()

#     # If cardinality is unknown, fall back to iterating
#     if count == tf.data.experimental.UNKNOWN_CARDINALITY:
#         print("Dataset cardinality is unknown. Counting via iteration...")
#         count = sum(1 for _ in dataset)

#     return count


# # Use the function to count the images in the raw, unbatched dataset
# total_images = count_images_in_dataset(raw_dataset)
# print(f"Total number of images in the dataset: {total_images}")