-
Notifications
You must be signed in to change notification settings - Fork 0
/
hypothesis1.py
145 lines (116 loc) · 4.28 KB
/
hypothesis1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
This module contains the experiment to test the first hypothesis:
Given a number of different classification models trained on the same data,
and having selected one as the victim, the adversarial examples computed
on the other models will transfer to the victim model with different
success rates. In particular, we expect the models of a similar architecture
to produce adversarial examples that transfer more easily.
"""
import os
import gc
import torch
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime
from art.estimators.classification import PyTorchClassifier
from art.attacks.evasion import ProjectedGradientDescentPyTorch
from zest import utils
from zest import model
# Experiment settings
# Dataset name handed to utils.load_dataset and embedded in the results file name.
dataset = 'CIFAR10'
# Template for the per-architecture state-dict file; '{}' is filled with the model name.
# NOTE(review): absolute path into one user's home directory -- adjust before running elsewhere.
base_model_path = '/home/giorgioseveri/projects/advml/lemon/cifar10_models/state_dicts/{}.pt'
# Architecture the adversarial examples are finally evaluated against.
victim_model = 'densenet121'
# Architectures on which the adversarial examples are crafted.
proxies = ['vgg11_bn', 'resnet18', 'resnet50', 'densenet161', 'googlenet']
# Batch size of the test DataLoader; also used to turn (batch, position) into a dataset index.
b_size = 32
# Utility to load a model
def load_model(m):
    """Build the architecture named ``m`` and load its pretrained weights.

    Args:
        m: Name of a constructor exposed by the ``model`` module (for
            example ``'resnet18'``). The same name is substituted into
            ``base_model_path`` to locate the state-dict file.

    Returns:
        The instantiated network with the weights loaded, switched to
        evaluation mode.

    Raises:
        AttributeError: If ``model`` exposes no attribute named ``m``.
    """
    # Resolve the constructor through attribute lookup instead of eval(), so
    # the name is never executed as arbitrary Python. Walking the dotted parts
    # keeps names such as 'pkg.arch' working exactly as they did with eval().
    arch = model
    for part in m.split('.'):
        arch = getattr(arch, part)
    net = arch()
    # map_location='cpu' lets a checkpoint saved from CUDA tensors be read on
    # a host without a GPU; load_state_dict then copies the values into the
    # parameters the freshly built network already owns.
    state_dict = torch.load(base_model_path.format(m), map_location='cpu')
    net.load_state_dict(state_dict)
    return net.eval()
# Load data
# NOTE(review): the second positional argument presumably selects the test
# split -- confirm against utils.load_dataset.
testset = utils.load_dataset(dataset, False, download=True)
# shuffle=False keeps batches in dataset order, which the index arithmetic
# below relies on.
tst_ld = torch.utils.data.DataLoader(testset, batch_size=b_size, shuffle=False, num_workers=32, pin_memory=True)

# Load victim model
net = load_model(victim_model)

# Select 100 correctly classified samples from the test set
correct_idxs = []
tst_imgs = []
tst_labels = []
# This is pure inference: disable autograd so no graph is built for the
# forward passes. Wrapping the loader (rather than the enumerate object) in
# tqdm lets the progress bar see the loader's length.
with torch.no_grad():
    for i, batch in enumerate(tqdm(tst_ld, desc='Selecting 100 samples'), 0):
        b_x, b_y = batch
        b_preds = torch.argmax(net(b_x), dim=1).numpy()
        # Positions inside this batch where the victim's prediction is right.
        b_correct = np.where(b_preds == b_y.numpy())[0]
        tst_imgs.append(b_x[b_correct])
        tst_labels.append(b_y[b_correct])
        # Convert in-batch positions to positions in the (unshuffled) dataset.
        correct_idxs += [(i * b_size) + j for j in b_correct]
        if len(correct_idxs) >= 100:
            break

# The last batch may overshoot, so trim everything to the first 100 samples.
correct_idxs = np.array(correct_idxs)[:100]
tst_imgs = torch.cat(tst_imgs)[:100]
tst_labels = torch.cat(tst_labels)[:100]
# The rest of the script (ART wrapper and attack) works on numpy arrays.
tst_imgs = tst_imgs.numpy()
tst_labels = tst_labels.numpy()
print('Selected points: {} - {}'.format(tst_imgs.shape, tst_labels.shape))

# Cleanup
# Drop the victim network before the proxies are loaded one at a time.
del net
gc.collect()
torch.cuda.empty_cache()
# Utility to wrap a model in a PyTorchClassifier
def wrap_model(m):
    """Return an ART ``PyTorchClassifier`` built around the network ``m``.

    The clipping range and the input shape are both derived from the
    module-level ``tst_imgs`` array; the loss is cross-entropy and the number
    of classes is fixed to 10.
    """
    pixel_range = (np.min(tst_imgs), np.max(tst_imgs))
    sample_shape = tst_imgs.shape[1:]
    return PyTorchClassifier(
        model=m,
        loss=torch.nn.CrossEntropyLoss(),
        clip_values=pixel_range,
        input_shape=sample_shape,
        nb_classes=10,  # CIFAR10
    )
# Craft adversarial examples on every proxy model. Per proxy:
# - load the network and wrap it for ART
# - run projected gradient descent on the selected test images
# - report how often the proxy's own prediction on them differs from the label
# - keep the adversarial images, keyed by proxy name
adv_exs = {}
for p in proxies:
    print('\nGenerating adv examples using: {}'.format(p))
    net = load_model(p)
    classifier = wrap_model(net)  # ART wrapper

    # Attack
    pgd = ProjectedGradientDescentPyTorch(classifier, eps=0.1, eps_step=0.02, num_random_init=5)
    adv_exs[p] = pgd.generate(x=tst_imgs)
    x_adv = adv_exs[p]

    # Evaluate success rate
    proxy_scores = classifier.predict(x_adv)
    proxy_preds = np.argmax(proxy_scores, axis=-1)
    assert len(tst_labels) == len(proxy_preds)
    n_still_correct = np.sum(proxy_preds == tst_labels)
    success_rate = 1 - n_still_correct / len(proxy_preds)
    print('Approximate success rate: {}'.format(success_rate))

    # Release the proxy before the next one is loaded.
    del net, classifier
    gc.collect()
    torch.cuda.empty_cache()
# Measure how well each proxy's adversarial images transfer to the victim
print('\nEvaluating generated samples on the original victim model')
net = load_model(victim_model)
classifier = wrap_model(net)

# Everything that gets written to disk; 'success_rate' is filled in below,
# one entry per proxy.
res_dict = {
    'victim': victim_model,
    'dataset': dataset,
    'proxies': proxies,
    'adv_exs': adv_exs,
    'success_rate': {},
    'test_images': correct_idxs,
}

for p in adv_exs:
    adv = adv_exs[p]
    print('Evaluating samples generated on: {}'.format(p))
    victim_scores = classifier.predict(adv)
    adv_preds = np.argmax(victim_scores, axis=-1)
    assert len(tst_labels) == len(adv_preds)
    # An adversarial image counts as a success when the victim no longer
    # predicts the original label.
    n_still_correct = np.sum(adv_preds == tst_labels)
    success_rate = 1 - n_still_correct / len(tst_labels)
    print('Evasion success rate: {}'.format(success_rate))
    res_dict['success_rate'][p] = success_rate
# Write the results dictionary to results/<victim>_<dataset>_<timestamp>.npy.
# res_dict is a plain dict, so np.save stores it as a pickled object; read it
# back with np.load(path, allow_pickle=True).item().
os.makedirs('results', exist_ok=True)
ts = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
out_file = 'results/{}_{}_{}.npy'.format(victim_model, dataset, ts)
np.save(out_file, res_dict)