# Test dataset

## MNIST Dataset

In this file, we aim to get 4 sets of examples to test our metric from the MNIST dataset:
- Well-classified examples
- Wrongly classified examples
- Adversarial examples (well classified examples modified to be wrongly classified)
- Original examples of the adversarial examples before attack

## Income Dataset
For the Income NN, we also aim to prepare the same 4 datasets, but with example images extracted from online sources. However, we won't necessarily prepare 4 datasets with 10 examples each, as finding images that were wrongly classified would be a lengthy process.


By running this file you should get two files, `metric_testing_dataset.pkl` and `income_metric_testing_dataset.pkl`. Each one is a pickle of a nested dictionary `dict[str, dict[str, np.ndarray]]` (set name → `"x"` / `"y"` arrays) holding the examples used to test the metric.\
These files should be copied into `./tests/data/metric_testing_dataset.pkl` and `./tests/data/income_metric_testing_dataset.pkl` to be used for testing in the project

In [30]:
# Num of examples to save for each set
NUM_EXAMPLES = 10
# Path of the serialized income model; it is read with torch.jit.load further down
INCOME_MODEL_PATH = "income_model.pt"

import numpy as np
import random
import torch
import torch.nn as nn
from art.attacks.evasion import ProjectedGradientDescent
from art.estimators.classification import PyTorchClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader
import pandas as pd

from model import MNIST_Model, IncomeModel, get_mnist_dataset, load_cnn_model

# Helper to stop execution in notebook
def exit():
    """Stop running the current notebook cell.

    Always raises a locally defined ``StopExecution`` exception. Its
    ``_render_traceback_`` hook returns an empty list, presumably so the
    notebook front end shows no traceback for it.

    Note: this name shadows the built-in ``exit``.
    """
    class StopExecution(Exception):
        """Sentinel exception used only to interrupt cell execution."""

        def _render_traceback_(self):
            return list()

    halt = StopExecution()
    raise halt
device = torch.device("cpu") # Keep CPU to not deal with space constraints

# Seed every random number generator this notebook relies on (random.sample,
# np.random.choice, DataFrame.sample, torch) so that "Restart & Run All"
# regenerates the same pickled test fixtures on every run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# MNIST Dataset

In [31]:
# Load the MNIST classifier; a None result is treated as a loading failure
model: MNIST_Model = load_cnn_model()
if model is None:
    print("Could not load model, something went wrong")
    exit()  # the notebook's own exit() helper, which raises to stop this cell

# Switch the model to evaluation mode before running inference
model.eval()

# Only the second element returned by get_mnist_dataset() is used (named test_dataset here)
_, test_dataset = get_mnist_dataset()

In [32]:
# A single batch as large as the whole dataset, so one iteration yields every example
test_loader = DataLoader(
    dataset=test_dataset, 
    batch_size=len(test_dataset),
    shuffle=False
)

# Pull that one batch out as (inputs, labels)
X_test, y_test = next(iter(test_loader))

In [33]:
# Put the model and both test tensors on the same device (`device` is set to CPU in the setup cell)
model.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)

In [34]:
# Classify test dataset.
# This is inference only: torch.no_grad() avoids recording an autograd graph for
# the whole test batch, and the module is called directly rather than through
# .forward(), which is the documented way to run an nn.Module.
with torch.no_grad():
    logits = model(X_test)
pred = torch.argmax(logits, dim=1)

# Labels of the correctly classified examples (only their count is reported here)
well_classified = y_test[pred == y_test]

accuracy = (pred == y_test).sum().item() / len(y_test)
print(f"Accuracy on test dataset: {accuracy*100:.2f}% ({len(well_classified)}/{len(y_test)})")

Accuracy on test dataset: 99.04% (9904/10000)


In [35]:
# Wrap the model so the ART attack can query it
loss = nn.CrossEntropyLoss()
classifier = PyTorchClassifier(
    model=model,
    loss=loss,
    input_shape=X_test[0].shape,
    nb_classes=10,
    device_type='cpu'
)
# L2-bounded PGD. eps is 16/255 scaled by sqrt(784); 784 = 28 * 28 is presumably
# the number of pixels per image, i.e. a per-pixel budget of 16/255 expressed as an L2 norm.
attack_model = ProjectedGradientDescent(
    estimator=classifier,
    eps=16 / 255 * 784**0.5,
    norm=2,
)

# Attack only examples the model currently gets right: an "adversarial example"
# is a well-classified input pushed into a wrong class, so inputs that are
# already misclassified must not count as successful attacks.
correct_idxs = torch.where(pred == y_test)[0].tolist()
base_k = min(512, len(correct_idxs))
idxs_chosen = random.sample(correct_idxs, k=base_k)
x_original: np.ndarray = (X_test[idxs_chosen]).numpy()
y_original: np.ndarray = (y_test[idxs_chosen]).numpy()

x_adv = attack_model.generate(x_original)


                                                              

In [36]:
# Predict on the adversarial inputs; no gradients are needed for this forward pass
with torch.no_grad():
    y_adv: np.ndarray = torch.argmax(
        model(torch.from_numpy(x_adv)), dim=1
    ).numpy()

# Adversarial inputs that the model still assigns to the original label (attack failed)
adv_well_classified = x_adv[y_adv == y_original]
accuracy = (y_adv == y_original).sum().item() / len(y_original)
print(f"Accuracy on adversarial examples: {accuracy*100:.2f}% ({len(adv_well_classified)}/{base_k})")
print(f"Effectiveness of attack (1-accuracy): {(1 - accuracy)*100:.2f}% ")

Accuracy on adversarial examples: 90.82% (465/512)
Effectiveness of attack (1-accuracy): 9.18% 


Now, from the images that were successfully attacked, choose 10 adversarial examples,
and from the original dataset choose 10 well-classified (TP) and 10 wrongly classified examples.

In [37]:
# Select by *position* in the arrays that are actually indexed below, so every
# stored "x" keeps its own "y" and every adversarial example keeps its own original.
# (Drawing indices relative to a filtered subset and then applying them to the
# unfiltered arrays pairs examples with the wrong labels / originals.)

# Model predictions for the attacked batch before the attack
pred_original: np.ndarray = pred[idxs_chosen].numpy()
originally_correct = pred_original == y_original

# Positions inside the attacked batch (x_original / y_original / x_adv / y_adv)
adv_positions = np.flatnonzero(originally_correct & (y_adv != y_original))   # correct -> wrong
well_positions = np.flatnonzero(originally_correct & (y_adv == y_original))  # correct, and not reused as an adversarial original
# Positions inside the full test set (X_test / y_test)
wrong_positions = np.flatnonzero((pred != y_test).numpy())

successful_adv_examples = x_adv[adv_positions]
well_classified = x_original[well_positions]
wrongly_classified: np.ndarray = X_test[wrong_positions].numpy()

print(f"Got {len(successful_adv_examples)} successful adversarial examples")

# Take at most NUM_EXAMPLES per set; random.sample raises if asked for more than is available
chosen_adv_examples_idxs = random.sample(adv_positions.tolist(), k=min(NUM_EXAMPLES, len(adv_positions)))
chosen_well_classified_idxs = random.sample(well_positions.tolist(), k=min(NUM_EXAMPLES, len(well_positions)))
chosen_wrongly_classified_idxs = random.sample(wrong_positions.tolist(), k=min(NUM_EXAMPLES, len(wrong_positions)))

final_dataset = {
    # "y" here is the (wrong) label the model predicts for the adversarial input
    "adv_examples": {
        "x": x_adv[chosen_adv_examples_idxs],
        "y": y_adv[chosen_adv_examples_idxs]
    },
    # Same positions as above, so entry i is the clean original of adversarial entry i
    "original_adv_example": {
        "x": x_original[chosen_adv_examples_idxs],
        "y": y_original[chosen_adv_examples_idxs]
    },
    "well_classified": {
        "x": x_original[chosen_well_classified_idxs],
        "y": y_original[chosen_well_classified_idxs]
    },
    # "y" is the true label, which the model fails to predict for these inputs
    "wrongly_classified": {
        "x": X_test[chosen_wrongly_classified_idxs].numpy(),
        "y": y_test[chosen_wrongly_classified_idxs].numpy()
    }
}
import pickle
# Context manager so the file is flushed and closed deterministically
with open("metric_testing_dataset.pkl", "wb") as dataset_file:
    pickle.dump(final_dataset, dataset_file)

Got 47 successful adversarial examples


# Income Prediction Model

In [38]:
# Load the raw income table and remember its original columns and row count
data = pd.read_csv(r'./data/income/adult.csv')
col_names = data.columns
num_rows = len(data)

# Clean-up as one chain: drop duplicate rows, turn '?' placeholders into NaN,
# drop incomplete rows, then shuffle and renumber the rows from zero
data = (
    data
    .drop_duplicates()
    .replace('?', np.nan)
    .dropna()
    .sample(frac=1)
    .reset_index(drop=True)
)

# Integer-encode each categorical column (the 'income' target included), one fresh encoder per column
categorical_features = [
    'workclass', 'education', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'native.country', 'income',
]
for feature in categorical_features:
    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])

# Split into features and target
X = data.drop(columns='income')
y = data['income']


In [39]:
continuous_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
# Standardize only the continuous columns; the other columns keep their integer codes
scaler = StandardScaler()
X[continuous_features] = scaler.fit_transform(X[continuous_features])

# Prepare tensors (go through .values so torch receives plain NumPy arrays, not pandas objects)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Load model
input_dim = X_tensor.shape[1]
model = torch.jit.load(INCOME_MODEL_PATH)
model.eval()

with torch.no_grad():
    logits = model(X_tensor)
    pred = torch.argmax(logits, dim=1)

# Strictly select well-classified and wrongly-classified indices
well_mask = (pred == y_tensor)
wrong_mask = (pred != y_tensor)
well_idxs = torch.where(well_mask)[0].numpy()
wrong_idxs = torch.where(wrong_mask)[0].numpy()

# NUM_EXAMPLES comes from the configuration cell; it is deliberately not
# redefined here so that a single constant controls both datasets.
chosen_well_idxs = np.random.choice(well_idxs, min(NUM_EXAMPLES, len(well_idxs)), replace=False)
chosen_wrong_idxs = np.random.choice(wrong_idxs, min(NUM_EXAMPLES, len(wrong_idxs)), replace=False)

# Select a batch of well-classified examples to attack
base_k = 128
if len(well_idxs) < base_k:
    base_k = len(well_idxs)
attack_idxs = np.random.choice(well_idxs, base_k, replace=False)
x_original = X_tensor[attack_idxs].numpy()
y_original = y_tensor[attack_idxs].numpy()

loss = nn.CrossEntropyLoss()
classifier = PyTorchClassifier(
    model=model,
    loss=loss,
    input_shape=X_tensor[0].shape,
    nb_classes=2,
    device_type='cpu'
)
# L-infinity PGD: each feature may move by at most eps. Note that the bound is
# applied to every column alike, including the integer-coded categorical ones.
attack_model = ProjectedGradientDescent(
    estimator=classifier,
    eps=0.2,  # You may tune this value
    norm=np.inf,
)

x_adv = attack_model.generate(x_original)
y_adv = np.argmax(model(torch.from_numpy(x_adv)).detach().numpy(), axis=1)

# Only keep successful adversarial examples (where prediction changed).
# The same mask is applied to all four arrays, so they stay aligned row by row.
success_mask = (y_adv != y_original)
successful_adv = x_adv[success_mask]
successful_orig = x_original[success_mask]
successful_y_adv = y_adv[success_mask]
successful_y_orig = y_original[success_mask]

# Positions into the successful_* arrays (not into X_tensor)
chosen_adv_idxs = np.random.choice(np.arange(len(successful_adv)), min(NUM_EXAMPLES, len(successful_adv)), replace=False)

final_income_dataset = {
    "well_classified": {
        "x": X_tensor[chosen_well_idxs].numpy(),
        "y": y_tensor[chosen_well_idxs].numpy()
    },
    "wrongly_classified": {
        "x": X_tensor[chosen_wrong_idxs].numpy(),
        "y": y_tensor[chosen_wrong_idxs].numpy()
    },
    # "y" is the label predicted for the adversarial input
    "adv_examples": {
        "x": successful_adv[chosen_adv_idxs],
        "y": successful_y_adv[chosen_adv_idxs]
    },
    # Entry i is the clean original of adversarial entry i
    "original_adv_example": {
        "x": successful_orig[chosen_adv_idxs],
        "y": successful_y_orig[chosen_adv_idxs]
    }
}
import pickle
# Context manager so the file is flushed and closed deterministically
with open("income_metric_testing_dataset.pkl", "wb") as dataset_file:
    pickle.dump(final_income_dataset, dataset_file)
print(f"Saved {len(chosen_adv_idxs)} successful adversarial examples and their originals.")


                                                            

Saved 10 successful adversarial examples and their originals.




In [40]:
# Sanity check: accuracy on each set, plus diagnostics
# Make sure the model is in evaluation mode before it is run again below
model.eval()
def accuracy_on_set(model, x, y, set_name=None):
    """Run ``model`` on ``x``, print diagnostics, and return the accuracy.

    Args:
        model: Callable that maps a tensor built from ``x`` to per-class scores.
        x: NumPy array of inputs (converted with ``torch.from_numpy``).
        y: NumPy array of expected class indices, one per row of ``x``.
        set_name: Optional label; when truthy a ``Set: ...`` header is printed.

    Returns:
        The mean of the element-wise comparison ``prediction == y``.
    """
    with torch.no_grad():
        scores = model(torch.from_numpy(x))
        predicted = scores.argmax(dim=1).cpu().numpy()

    hits = predicted == y

    if set_name:
        print(f"\nSet: {set_name}")
    print(f"  Shape x: {x.shape}, y: {y.shape}")
    print(f"  Pred: {predicted}")
    print(f"  True: {y}")
    print(f"  Match: {hits}")
    print(f"  First 5 x: {x[:5]}")
    print(f"  First 5 y: {y[:5]}")

    return hits.mean()

# Report accuracy and diagnostics for each of the four saved income sets
print("Sanity check accuracies and diagnostics:")
for key in ("well_classified", "wrongly_classified", "adv_examples", "original_adv_example"):
    subset = final_income_dataset[key]
    x, y = subset['x'], subset['y']
    acc = accuracy_on_set(model, x, y, set_name=key)
    print(f"  Accuracy on {key}: {acc}")

# Check alignment between adv_examples and original_adv_example:
# arrays of equal shape are compared row by row via their summed absolute difference
print("\nChecking alignment between adv_examples and original_adv_example:")
adv_x = final_income_dataset['adv_examples']['x']
orig_x = final_income_dataset['original_adv_example']['x']
if adv_x.shape != orig_x.shape:
    print(f"  Shape mismatch: adv_x {adv_x.shape}, orig_x {orig_x.shape}")
else:
    diffs = np.abs(adv_x - orig_x).sum(axis=1)
    print(f"  Mean L1 diff per example: {diffs.mean():.4f}")
    print(f"  First 5 diffs: {diffs[:5]}")


Sanity check accuracies and diagnostics:

Set: well_classified
  Shape x: (10, 14), y: (10,)
  Pred: [0 0 0 0 1 1 1 0 0 0]
  True: [0 0 0 0 1 1 1 0 0 0]
  Match: [ True  True  True  True  True  True  True  True  True  True]
  First 5 x: [[-5.6672013e-01  2.0000000e+00 -3.4881806e-01  1.5000000e+01
  -4.8076451e-02  2.0000000e+00  2.0000000e+00  0.0000000e+00
   4.0000000e+00  1.0000000e+00 -1.4750209e-01 -2.1867335e-01
  -7.8031331e-02  3.8000000e+01]
 [-7.9518348e-01  2.0000000e+00 -1.2143841e+00  1.1000000e+01
  -4.4043392e-01  4.0000000e+00  9.0000000e+00  4.0000000e+00
   2.0000000e+00  1.0000000e+00 -1.4750209e-01 -2.1867335e-01
  -7.8031331e-02  3.8000000e+01]
 [-1.0998013e+00  2.0000000e+00 -1.1137565e+00  9.0000000e+00
   1.1289960e+00  4.0000000e+00  0.0000000e+00  3.0000000e+00
   4.0000000e+00  1.0000000e+00  1.4970450e-01 -2.1867335e-01
  -9.1285658e-01  3.8000000e+01]
 [-4.1441122e-01  2.0000000e+00 -1.0733102e-02  1.1000000e+01
  -4.4043392e-01  2.0000000e+00  2.0000000e+