# Test dataset
In this file, we aim to get 4 sets of examples to test our metric:
- Well-classified examples
- Wrongly classified examples
- Adversarial examples (well classified examples modified to be wrongly classified)
- Original examples of the adversarial examples before attack

By running this file you should get a file called `metric_testing_dataset.pkl`, which is a pickle file of a dictionary `dict[str, dict[str, np.ndarray]]` (one entry per set, each holding the inputs `x` and the labels `y`), with the examples to be used to test the metric.\
This file should be copied into `./tests/data/metric_testing_dataset.pkl` to be used for testing in the project.

In [None]:
# Number of examples to keep in each of the saved sets
# (used as the sample size `k` for every random pick made when the final dataset is built)
NUM_EXAMPLES = 10

In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
from art.attacks.evasion import ProjectedGradientDescent, FastGradientMethod
from art.estimators.classification import PyTorchClassifier
from torch.utils.data import DataLoader

from model import Model, get_mnist_dataset, load_cnn_model

def exit():
    """Abort the running cell by raising a ``StopExecution`` exception.

    The exception type exposes ``_render_traceback_`` returning an empty list,
    presumably so that the notebook front end displays no traceback for it
    (TODO confirm against the IPython version in use).

    Note: this definition shadows the built-in ``exit`` inside the notebook.

    Raises:
        StopExecution: always.
    """
    class StopExecution(Exception):
        """Raised solely to stop execution; reports no traceback lines."""

        def _render_traceback_(self):
            # Nothing to show for this exception
            return list()

    raise StopExecution()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Everything in this notebook runs on the CPU
device = torch.device("cpu") # Keep CPU to not deal with space constraints

# Load the CNN; the check below treats a `None` result as a loading failure
model: Model = load_cnn_model()
if model is None:
    print("Could not load model, something went wrong")
    # Stop the cell here instead of failing later on a missing model
    exit()

# Put the model in evaluation mode before using it for predictions
model.eval()

# Only the second value returned by get_mnist_dataset() (the test split) is used
_, test_dataset = get_mnist_dataset()

In [3]:
# A loader that yields the entire test split as one batch, in dataset order
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

# Pull that single batch out as (inputs, labels)
X_test, y_test = next(iter(test_loader))

In [4]:
# Place the model and the full test batch on the same device
model.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

In [5]:
# Classify the whole test set in a single forward pass.
# Gradients are not needed for plain inference, so autograd tracking is
# switched off, and the model is called directly (as the later cells do)
# instead of through .forward().
with torch.no_grad():
    logits = model(X_test)
pred = torch.argmax(logits, dim=1)

# Labels of the test examples whose predicted class matches the true label
well_classified = y_test[pred == y_test]

accuracy = (pred == y_test).sum().item() / len(y_test)
print(f"Accuracy on test dataset: {accuracy*100:.2f}% ({len(well_classified)}/{len(y_test)})")

Accuracy on test dataset: 98.95% (9895/10000)


In [6]:
# Seed Python's RNG so the sampled subset below (and the random picks made
# later in the notebook) can be reproduced from a fresh kernel.
SAMPLING_SEED = 0
random.seed(SAMPLING_SEED)

# Wrap the model so that ART can attack it
loss = nn.CrossEntropyLoss()
classifier = PyTorchClassifier(
    model=model,
    loss=loss,
    input_shape=X_test[0].shape,
    nb_classes=10,
    device_type='cpu'
)

# PGD attack bounded in L2 norm.
# NOTE(review): eps looks like a per-pixel budget of 16/255 scaled by
# sqrt(784), presumably the 28x28 pixel count of an MNIST image (confirm).
attack_model = ProjectedGradientDescent(
    estimator=classifier,
    eps=16 / 255 * 784**0.5,
    norm=2,
)

# Attack only a random subset of the test set
base_k = 512
idxs_chosen = random.sample(range(len(X_test)), k=base_k)
x_original: np.ndarray = (X_test[idxs_chosen]).numpy()
y_original: np.ndarray = (y_test[idxs_chosen]).numpy()

x_adv = attack_model.generate(x_original)


                                                              

In [7]:
# Model predictions for the adversarial inputs and for their clean originals.
# This is plain inference, so autograd tracking is disabled.
with torch.no_grad():
    y_adv: np.ndarray = torch.argmax(
        model(torch.from_numpy(x_adv)), dim=1
    ).numpy()
    y_original_pred: np.ndarray = torch.argmax(
        model(torch.from_numpy(x_original)), dim=1
    ).numpy()

print("Original labels: ", y_original)
print("Original predictions: ", y_original_pred)
print("Adversarial predictions (Projected Gradient Descent): ", y_adv)

# True where the adversarial input is still predicted as its true label
still_correct = y_adv == y_original
adv_well_classified = x_adv[still_correct]
accuracy = still_correct.sum().item() / len(y_original)
print(f"Accuracy on adversarial examples: {accuracy*100:.2f}% ({len(adv_well_classified)}/{base_k})")
print(f"Effectiveness of attack (1-accuracy): {(1 - accuracy)*100:.2f}% ")

Original labels:  [4 4 0 2 0 6 5 7 3 3 3 3 1 3 3 1 3 5 7 1 3 5 4 8 7 1 7 9 7 2 2 2 6 9 7 7 6
 3 9 7 5 2 6 0 9 3 0 4 4 9 2 1 9 2 4 4 6 2 9 8 2 9 0 4 8 3 8 5 7 2 8 2 9 8
 5 9 2 2 5 7 0 4 1 7 4 6 7 5 4 6 6 5 9 1 0 7 3 4 9 7 5 3 1 3 2 6 1 5 8 9 5
 3 2 2 5 4 1 1 5 1 4 5 2 4 7 7 2 9 2 3 0 4 1 3 7 2 2 7 5 1 6 5 1 9 0 5 5 2
 6 9 0 9 1 5 5 3 0 4 1 6 9 4 3 7 6 4 3 4 9 3 1 9 7 1 8 3 1 1 0 7 6 3 9 1 1
 8 1 2 7 7 3 0 0 0 1 1 1 7 7 9 3 4 3 8 9 1 3 2 1 3 0 1 7 9 2 2 4 1 7 3 0 3
 6 9 2 8 5 9 2 1 0 7 5 1 1 1 5 3 9 2 9 5 6 1 9 7 0 4 9 1 4 0 5 6 1 8 5 8 6
 3 3 2 8 8 4 0 3 6 8 3 1 9 6 9 7 5 1 6 2 4 7 8 1 4 9 6 9 8 4 7 2 6 9 0 6 8
 4 8 9 0 0 1 9 1 9 9 1 7 8 9 2 1 6 0 4 7 9 8 3 2 2 9 1 1 0 6 5 4 8 9 6 4 8
 5 8 4 6 5 7 2 6 8 1 6 1 2 8 6 4 2 7 5 5 5 0 6 3 2 9 8 3 4 6 3 4 3 4 0 6 7
 6 4 3 7 9 0 5 1 1 9 1 8 0 5 5 7 3 8 0 6 1 8 6 4 4 7 3 2 5 3 0 0 2 1 1 5 7
 2 9 0 1 8 2 1 2 9 6 6 8 9 9 9 5 1 0 4 2 2 2 1 9 1 9 8 7 9 6 9 1 4 7 9 4 2
 3 0 0 3 6 6 8 0 6 1 2 0 5 3 4 4 4 0 0 8 3 4 5 5 5 5 5 4 5 5 9 5 2 7 2 6 4
 7 8 0 

Now, from the images that were successfully attacked, choose 10 examples,
and from the original dataset choose 10 correctly classified and 10 wrongly classified examples.

In [None]:
# Predictions on the clean (pre-attack) versions of the attacked examples
with torch.no_grad():
    y_original_pred: np.ndarray = torch.argmax(
        model(torch.from_numpy(x_original)), dim=1
    ).numpy()

# An attack counts as successful only when the clean example was classified
# correctly and its adversarial version is not.
adv_success_mask = (y_original_pred == y_original) & (y_adv != y_original)

# Every array of a set is filtered with the SAME mask, so that a position
# sampled below refers to the same example in the inputs and in the labels.
successful_adv_examples = x_adv[adv_success_mask]
successful_adv_preds = y_adv[adv_success_mask]
successful_adv_sources = x_original[adv_success_mask]
successful_adv_source_labels = y_original[adv_success_mask]

# Correctly / wrongly classified examples are taken from the full test set
well_mask = pred == y_test
well_classified: np.ndarray = X_test[well_mask].numpy()
well_classified_labels: np.ndarray = y_test[well_mask].numpy()

wrong_mask = pred != y_test
wrongly_classified: np.ndarray = X_test[wrong_mask].numpy()
wrongly_classified_labels: np.ndarray = y_test[wrong_mask].numpy()

print(f"Got {len(successful_adv_examples)} successful adversarial examples")

# Fail with an explicit message when a pool cannot provide enough examples
for set_name, pool in (
    ("successful adversarial", successful_adv_examples),
    ("well-classified", well_classified),
    ("wrongly classified", wrongly_classified),
):
    if len(pool) < NUM_EXAMPLES:
        raise ValueError(
            f"Need {NUM_EXAMPLES} {set_name} examples but only {len(pool)} are available"
        )

# Positions are relative to the filtered arrays defined above
chosen_adv_examples_idxs = random.sample(range(len(successful_adv_examples)), k=NUM_EXAMPLES)
chosen_well_classified_idxs = random.sample(range(len(well_classified)), k=NUM_EXAMPLES)
chosen_wrongly_classified_idxs = random.sample(range(len(wrongly_classified)), k=NUM_EXAMPLES)

final_dataset = {
    # Adversarial inputs, labelled with the (wrong) class the model predicts for them
    "adv_examples": {
        "x": successful_adv_examples[chosen_adv_examples_idxs],
        "y": successful_adv_preds[chosen_adv_examples_idxs]
    },
    # Clean inputs the chosen adversarial examples were generated from, with their true labels
    "original_adv_example": {
        "x": successful_adv_sources[chosen_adv_examples_idxs],
        "y": successful_adv_source_labels[chosen_adv_examples_idxs]
    },
    # Test inputs the model classifies correctly, with their true labels
    "well_classified": {
        "x": well_classified[chosen_well_classified_idxs],
        "y": well_classified_labels[chosen_well_classified_idxs]
    },
    # Test inputs the model misclassifies, with their true labels
    "wrongly_classified": {
        "x": wrongly_classified[chosen_wrongly_classified_idxs],
        "y": wrongly_classified_labels[chosen_wrongly_classified_idxs]
    }
}

Got 47 successful adversarial examples


In [None]:
import pickle

# Write the dictionary to disk; the context manager closes the file handle
# (flushing the data) as soon as the dump is finished.
with open("metric_testing_dataset.pkl", "wb") as dataset_file:
    pickle.dump(final_dataset, dataset_file)