# Model training and Evaluation

In [2]:
import os
import sys
import glob

# TF_CPP_MIN_LOG_LEVEL only quiets TensorFlow's native (C++) start-up logs if it
# is already set when TensorFlow is imported, so export it before the import
# below instead of after it.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import ResNet50
import matplotlib.pyplot as plt
import seaborn as sns

# Also restrict the Python-side TensorFlow logger to errors.
tf.get_logger().setLevel("ERROR")

In [3]:
# Constants
IMG_SIZE = 224                           # side length (pixels) images are resized to
BATCH_SIZE = 300                         # overridden below when running in Colab
AUTOTUNE = tf.data.AUTOTUNE
EPOCHS = 5
INPUT_SHAPE = (IMG_SIZE, IMG_SIZE, 3)    # (224, 224, 3)

# Fix TensorFlow's global random seed.
tf.random.set_seed(5)

# Default dataset location for local runs.
dataset_dir = "../datasets"

# Change dataset_dir (and the batch size) when run in Google Colab
_in_colab = 'google.colab' in sys.modules
if _in_colab:
    from google.colab import drive

    # The datasets are read from the mounted Google Drive.
    drive.mount('/content/drive')
    dataset_dir = "/content/drive/Othercomputers/Big Mac/datasets"
    BATCH_SIZE = 430

# Report which GPUs TensorFlow can see.
physical_gpus = tf.config.list_physical_devices('GPU')
print("Using available GPUs: ", physical_gpus)

# Explicitly pin the global Keras dtype policy to float32.
tf.keras.mixed_precision.set_global_policy('float32')

Using available GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
# Load ImageNet2012 dataset
def prepare_input_data(input):
    """Convert one dataset example into an ``(image, label)`` pair.

    Args:
        input: Mapping that provides the entries ``'image'`` and ``'label'``.

    Returns:
        A tuple ``(image, label)``. The image is cast to float32, resized to
        ``IMG_SIZE`` x ``IMG_SIZE`` and passed through the ResNet50
        ``preprocess_input`` function; the label is returned as-is.
    """
    resized = tf.image.resize(
        tf.cast(input['image'], tf.float32),
        (IMG_SIZE, IMG_SIZE),
    )
    return preprocess_input(resized), input['label']

def make_dataset(ds):
    """Build the input pipeline for a dataset of raw examples.

    Args:
        ds: Dataset whose elements are accepted by ``prepare_input_data``.

    Returns:
        The dataset mapped through ``prepare_input_data`` (parallelism chosen
        by ``AUTOTUNE``), grouped into batches of ``BATCH_SIZE`` and prefetched.
    """
    preprocessed = ds.map(prepare_input_data, num_parallel_calls=AUTOTUNE)
    batched = preprocessed.batch(BATCH_SIZE)
    return batched.prefetch(AUTOTUNE)


# The official validation split is cut in half: the first half is used for
# validation and the second half serves as the test set.
VALIDATION_SPLIT = 'validation[:50%]'
TEST_SPLIT = 'validation[50%:]'

(train, validation, test), info = tfds.load(
    'imagenet2012_subset/10pct',
    split=['train', VALIDATION_SPLIT, TEST_SPLIT],
    shuffle_files=False,
    with_info=True,
    data_dir=dataset_dir,
)

# Label metadata taken from the dataset info.
label_feature = info.features['label']
num_classes = label_feature.num_classes
class_names = label_feature.names

# Report how many examples each split contains.
for title, split_name in (
    ("Train", 'train'),
    ("Validation", VALIDATION_SPLIT),
    ("Test", TEST_SPLIT),
):
    print(f"{title} image count: {info.splits[split_name].num_examples}")

# Wrap every split in the preprocessing / batching pipeline.
train_dataset, validation_dataset, test_dataset = (
    make_dataset(split) for split in (train, validation, test)
)

Train image count: 128116
Validation image count: 25000
Test image count: 25000


In [9]:
# Load adversarial datasets

def _parse_image(input):
    """Decode one serialized example into a float32 image and its label.

    Args:
        input: A serialized example holding a string feature ``'image'``
            (a serialized float16 tensor) and an int64 feature ``'label'``.

    Returns:
        A tuple ``(image, label)`` where ``image`` is the stored tensor cast to
        float32 with static shape ``[IMG_SIZE, IMG_SIZE, 3]`` and ``label`` is
        the parsed int64 label.
    """
    example = tf.io.parse_single_example(
        input,
        {
            'image': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        },
    )
    # The image is stored as a serialized float16 tensor; widen it to float32.
    image = tf.cast(
        tf.io.parse_tensor(example['image'], out_type=tf.float16),
        tf.float32,
    )
    # Attach the expected static shape to the decoded tensor.
    image.set_shape([IMG_SIZE, IMG_SIZE, 3])
    return image, example['label']

def create_tf_dataset(file_paths):
    """Build a batched, prefetched dataset from GZIP-compressed TFRecord files.

    Args:
        file_paths: Path(s) of the TFRecord files, passed to
            ``tf.data.TFRecordDataset``.

    Returns:
        A ``tf.data.Dataset`` of ``(image, label)`` batches of size
        ``BATCH_SIZE``, with each record decoded by ``_parse_image``.
    """
    raw_dataset = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
    return (
        raw_dataset
        # Decode records in parallel, matching make_dataset(); a sequential map
        # leaves the input pipeline as the bottleneck. Element order is kept
        # because map() is deterministic unless configured otherwise.
        .map(_parse_image, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

# Get all adversarial TFRecord shards for training and testing.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# lists are sorted to read the shards in the same order on every run/machine.
adversarial_dir = f'{dataset_dir}/adversaries/imagenet2012_subset'
test_file_paths = sorted(glob.glob(f'{adversarial_dir}/test-*.tfrec'))
train_file_paths = sorted(glob.glob(f'{adversarial_dir}/train-*.tfrec'))

# These counts are the number of shard files found on disk.
print(f"Loaded {len(train_file_paths)} TFrecord train files")
print(f"Loaded {len(test_file_paths)} TFrecord test files")

# Create the TFRecord-backed datasets
adv_test_dataset = create_tf_dataset(test_file_paths)
adv_train_dataset = create_tf_dataset(train_file_paths)

Loaded 140 TFrecord train files
Loaded 59 TFrecord test files


In [10]:
# Load robust model from file
robust_model = tf.keras.models.load_model("robust_resnet50.keras")
# Report success only after load_model() has returned, so the message is never
# shown for a load that failed.
print("loaded robust resnet mode")

loaded robust resnet mode


In [11]:
# Collection of metrics
#
# Every dataset passed below is already batched by its input pipeline
# (.batch(BATCH_SIZE)), so batch_size is deliberately NOT passed to evaluate():
# Keras documents that it must not be specified for dataset inputs.

print("Evaluating training accuracy...\n")
train_metrics = robust_model.evaluate(train_dataset, verbose=1)

print("Evaluating adversarial accuracy...\n")
adv_metrics = robust_model.evaluate(adv_test_dataset, verbose=1)

print("Evaluating standard accuracy...\n")
test_metrics = robust_model.evaluate(test_dataset, verbose=1)

Evaluating training accuracy...

[1m161/428[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m4:51[0m 1s/step - accuracy: 9.8640e-04 - loss: 1.4088 - top_5_accuracy: 0.0090

KeyboardInterrupt: 

In [None]:
# Extract and analyze metrics

# Order in which the values are unpacked from each evaluate() result below.
# NOTE(review): assumes the model was compiled with exactly these metrics in
# this order — confirm against the saved model.
metric_names = ['loss', 'accuracy', 'top_5_accuracy']

# Unpack metrics directly from the evaluation results
train_loss, train_acc, train_top5 = train_metrics
test_loss, test_acc, test_top5 = test_metrics
adv_loss, adv_acc, adv_top5 = adv_metrics

# --- METRIC CALCULATIONS ---
# Generalization Gap: Difference between training and test accuracy.
generalization_gap = train_acc - test_acc

# Robustness/Transferability Gap: Difference between accuracy on clean and adversarial data.
robustness_gap = test_acc - adv_acc

print("# Standard Accuracy")
print(f"Top 1 Accuracy: {test_acc*100:.2f}%")
print(f"Top 5  Accuracy: {test_top5*100:.2f}%")
# The loss is not a fraction of anything, so print the raw value rather than
# scaling it by 100 and appending a percent sign.
print(f"Loss: {test_loss:.4f}\n")

print("# Generalization")
print(f"Training Accuracy: {train_acc*100:.2f}%")
print(f"Test Accuracy: {test_acc*100:.2f}%")
print(f"Generalization Gap: {generalization_gap*100:.2f}%\n")

print("# Robustness & Transferability")
print(f"Standard Accuracy: {test_acc*100:.2f}%")
print(f"Adversarial Accuracy: {adv_acc*100:.2f}%")
print(f"Robustness (Transferability) Gap: {robustness_gap*100:.2f}%\n")


In [None]:
# matplotlib.pyplot (plt) and seaborn (sns) are already imported in the first
# cell; only the ticker module is additionally needed here.
import matplotlib.ticker as mticker

# Global look and feel: seaborn grid first, then the matplotlib "talk" style.
sns.set_style("whitegrid")
plt.style.use('seaborn-v0_8-talk')

# One bar per evaluated dataset (top-1 accuracy as a fraction in [0, 1]).
labels = ['Training', 'Standard Test', 'Adversarial Test']
accuracies = [train_acc, test_acc, adv_acc]
colors = ['#4c72b0', '#55a868', '#c44e52']

# Create the figure and draw the bars above the grid lines.
fig, ax = plt.subplots(figsize=(10, 7))
bars = ax.bar(labels, accuracies, color=colors, zorder=2)

# Write each bar's value, formatted as a percentage, just above the bar.
for rect in bars:
    height = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2.0
    ax.text(
        x_center,
        height + 0.02,
        f'{height:.2%}',
        ha='center',
        va='bottom',
        fontsize=14,
        fontweight='bold',
    )

# Title
ax.set_title('Comparison of Accuracy on Different Datasets', fontsize=14, pad=30)

# Y axis: label, fixed range with headroom for the annotations, and percentage
# tick labels (a value of 1.0 corresponds to 100%).
ax.set_ylabel('Top-1 Accuracy', fontsize=10, labelpad=15)
ax.set_ylim(0, 1.1)
ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1.0))

# Light dashed horizontal grid for easier value reading.
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Leave the top 5% of the figure free so nothing is cut off.
plt.tight_layout(rect=[0, 0, 1, 0.95])

plt.show()