In [388]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import helpers.helpers as helpers
import numpy as np
import foolbox as fb


class CustomConvModel(tf.keras.Model):
    """Conv -> mask -> dense -> mask -> softmax classifier with prunable masks.

    Both masks are non-trainable weights initialised to ones. They are
    multiplied elementwise into the conv and dense outputs, so writing zeros
    into them (e.g. via ``set_weights``) silences the matching activations.

    Note: both masks have a leading dimension of 16 that lines up with the
    batch axis, so a forward pass needs a batch size that broadcasts against
    16.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = layers.Conv2D(3, 3, padding='same', use_bias=False)

        self.dense1 = layers.Dense(10, use_bias=False)
        # Flatten holds no weights; build it once instead of on every call.
        self.flatten = layers.Flatten()
        self.conv_mask1 = self.add_weight(
            shape=(16, 28, 28, 3),
            initializer='ones',
            trainable=False,
            name='cm1'
        )
        self.dense_mask1 = self.add_weight(
            shape=(16, 10),
            initializer='ones',
            trainable=False,
            name='cd1'
        )
        # Activation logs; `call` appends to them when track_acts is set and
        # never clears them.
        self.act1 = []
        self.act2 = []

    def call(self, inputs, track_acts=False):
        """Run the forward pass.

        Args:
            inputs: Input batch; reshaped to ``(-1, 28, 28, 1)``.
            track_acts: When truthy, append the masked conv output to
                ``self.act1`` and the masked dense output to ``self.act2``.

        Returns:
            Softmax over the masked dense output.
        """
        x = tf.reshape(inputs, shape=[-1, 28, 28, 1])
        x = self.conv1(x)
        x = tf.multiply(x, self.conv_mask1)
        if track_acts:
            self.act1.append(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = tf.multiply(x, self.dense_mask1)
        if track_acts:
            self.act2.append(x)
        return tf.nn.softmax(x)

In [370]:
# Load MNIST through the project helper; x_attack / y_attack are the samples
# passed to the attack cells below.
(x_train, y_train), (x_test, y_test), x_attack, y_attack = helpers.load_data('mnist')

In [589]:
# Instantiate the masked conv model defined above.
model = CustomConvModel()

In [633]:
# Sparse categorical cross-entropy on integer labels, Adam with lr = 1e-3.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    experimental_run_tf_function=True,
)

In [634]:
# Train for one epoch. batch_size=16 matches the leading dimension of the
# model's masks, which are multiplied against the batch axis.
hist = model.fit(
    x=x_train,
    y=y_train,
    batch_size=16,
    epochs=1,
    validation_data=(x_test, y_test),
)



In [592]:
# Wrap the Keras model for foolbox (inputs bounded to [0, 1]) and run an
# L-inf PGD attack with eps = 8/255 on the first 16 attack samples.
fmodel = fb.models.TensorFlowModel(model, bounds=(0, 1))
attack = fb.attacks.LinfProjectedGradientDescentAttack()
adversarials, _, success = attack(fmodel, x_attack[:16], y_attack[:16], epsilons=8/255)

In [566]:
# Display the per-sample success flags returned by the attack.
success

<tf.Tensor: shape=(16,), dtype=bool, numpy=
array([False, False, False,  True, False, False, False,  True,  True,
       False,  True, False, False,  True, False, False])>

In [594]:
# Reset the activation logs so the indices read by the following cells are
# deterministic, regardless of how often this cell has been run.
model.act1 = []
model.act2 = []
# Index 0 <- adversarial inputs, index 1 <- benign inputs.
model(adversarials, track_acts=True)
model(x_attack[:16], track_acts=True)

<tf.Tensor: shape=(16, 10), dtype=float32, numpy=
array([[6.40742246e-06, 5.34523759e-09, 9.99607265e-01, 1.46262639e-04,
        6.20801256e-06, 1.13828661e-04, 3.46307797e-06, 3.18524918e-09,
        7.42745760e-05, 4.24310201e-05],
       [9.99849319e-01, 2.13451757e-17, 9.98710766e-06, 6.73585465e-10,
        4.00570338e-13, 4.70531404e-06, 5.95311349e-06, 1.84281149e-14,
        1.29990585e-04, 3.88892369e-13],
       [4.96904349e-08, 4.08582716e-11, 4.81533732e-07, 1.00278603e-05,
        9.95673954e-01, 2.83372970e-07, 2.49213576e-06, 5.63049864e-04,
        3.29634495e-05, 3.71670490e-03],
       [3.36333527e-04, 5.50237969e-02, 2.43067537e-02, 1.45771997e-02,
        8.22884031e-05, 3.55927227e-03, 5.15552564e-03, 1.60828233e-04,
        8.48177373e-01, 4.86206710e-02],
       [7.09751589e-07, 1.93547348e-11, 4.41929433e-06, 1.70286949e-05,
        2.90233857e-07, 2.59151875e-07, 4.91705732e-10, 9.99823749e-01,
        1.68922190e-06, 1.51876025e-04],
       [2.68914278e-07, 1

In [595]:
# First recorded entry of each log is treated as the adversarial run, so the
# model must have been called on the adversarial batch first with
# track_acts=True.
adv_acts_conv, adv_acts_dense = model.act1[0], model.act2[0]

In [596]:
# Second recorded entry of each log is treated as the benign run.
benign_acts_conv, benign_acts_dense = model.act1[1], model.act2[1]

In [597]:
# Elementwise absolute difference between adversarial and benign conv
# activations, kept as the only entry of act_diffs.
act_diff = np.abs(adv_acts_conv - benign_acts_conv)
act_diffs = [act_diff]

act_diff.shape


(16, 28, 28, 3)

In [598]:
# Sum the per-sample differences over axis 0 (the batch axis).
cum_act_diffs = tf.reduce_sum(act_diff, axis=0)
cum_act_diffs

<tf.Tensor: shape=(28, 28, 3), dtype=float32, numpy=
array([[[0.1477626 , 0.07607251, 0.11011893],
        [0.3119222 , 0.32920775, 0.19277304],
        [0.28719786, 0.26680338, 0.20084402],
        ...,
        [0.2957995 , 0.27999365, 0.22590972],
        [0.3035134 , 0.25389534, 0.17086786],
        [0.17978956, 0.14508495, 0.19515638]],

       [[0.20640844, 0.29665968, 0.2334883 ],
        [0.31032702, 0.19745922, 0.35006258],
        [0.3033009 , 0.19551586, 0.36113518],
        ...,
        [0.28792217, 0.32873508, 0.3702836 ],
        [0.357198  , 0.30451316, 0.33967888],
        [0.19487433, 0.17057583, 0.33455402]],

       [[0.19200101, 0.13690281, 0.1642198 ],
        [0.296412  , 0.12866625, 0.2872863 ],
        [0.36901987, 0.18069988, 0.35215926],
        ...,
        [0.3671441 , 0.27753738, 0.37076104],
        [0.38165182, 0.25018856, 0.37262323],
        [0.19883578, 0.16232906, 0.38098037]],

       ...,

       [[0.23093471, 0.19066241, 0.1873449 ],
        [0.3976

In [599]:
# Snapshot all model weights; w[2] is the array the pruning cells below treat
# as the conv mask.
w = model.get_weights()

# Number of nonzero entries in w[2].
np.count_nonzero(w[2])

37632

In [617]:
# Ascending view of all cumulative differences (axis=None sorts the flattened array).
np.sort(cum_act_diffs.numpy(), axis=None)

array([0.0477416 , 0.05345015, 0.05815556, ..., 0.527789  , 0.5343461 ,
       0.53687006], dtype=float32)

In [600]:
# Total number of entries in the cumulative difference tensor.
cum_act_diffs.numpy().size

2352

In [601]:
mask_shape = w[2].shape

# Fraction of activations to prune.
ratio = .3

# Rank on the cumulative differences: act_diffs is a plain Python list and has
# no .numpy(), so cum_act_diffs is the tensor to flatten here.
cum_flat = cum_act_diffs.numpy().flatten()
# argsort of the negated values orders indices by descending difference.
acts_to_prune = np.argsort(-cum_flat)[:int(ratio * len(cum_flat))]

len(acts_to_prune)

705

In [602]:
# Take a real copy so edits to the mask do not silently mutate `w` in place;
# the mask is written back to w[2] explicitly in a later cell.
m_copy = w[2].copy()
m_copy.shape

(16, 28, 28, 3)

In [625]:
# Shape of the activation difference for the first entry along axis 0.
act_diff[0].shape

(28, 28, 3)

In [627]:
# For every slice along axis 0 of the mask, zero the `ratio` fraction of
# positions whose activation difference (same index in act_diff) is largest.
for i, kernel in enumerate(m_copy):
    kernel_flattened = kernel.flatten()
    sample_diff = act_diff[i].flatten()
    # Negated argsort -> indices ordered by descending difference.
    acts_to_prune = np.argsort(-sample_diff)[:int(ratio * len(sample_diff))]
    # Zero all selected positions in one indexed assignment.
    kernel_flattened[acts_to_prune] = 0
    # Reshape with NumPy to the slice's own shape (no TF round-trip, no
    # hard-coded dimensions).
    m_copy[i] = kernel_flattened.reshape(kernel.shape)
    # `kernel` is a view into m_copy, so this reports the updated count.
    print(np.count_nonzero(kernel))

(28, 28, 3)
2352
1261
(28, 28, 3)
2352
1299
(28, 28, 3)
2352
1317
(28, 28, 3)
2352
1288
(28, 28, 3)
2352
1321
(28, 28, 3)
2352
1287
(28, 28, 3)
2352
1333
(28, 28, 3)
2352
1299
(28, 28, 3)
2352
1305
(28, 28, 3)
2352
1321
(28, 28, 3)
2352
1296
(28, 28, 3)
2352
1333
(28, 28, 3)
2352
1306
(28, 28, 3)
2352
1228
(28, 28, 3)
2352
1313
(28, 28, 3)
2352
1315


In [628]:

# Put the edited mask into the weight list at index 2.
w[2] = m_copy

In [629]:
# Push the modified weight list back into the model.
model.set_weights(w)

In [635]:
# Evaluate on the 16 benign attack samples.
model.evaluate(x_attack[:16], y_attack[:16])



[0.47741371393203735, 0.8125]

In [636]:
# Evaluate on the adversarial counterparts, scored against the same labels.
model.evaluate(adversarials[:16], y_attack[:16])



[1.1015676259994507, 0.6875]

In [637]:
# Evaluate on the test split; batch_size=16 matches the leading dimension of
# the model's masks.
model.evaluate(x_test, y_test, batch_size=16)



[0.3181615471839905, 0.9077000021934509]

In [586]:
# Re-wrap the current model and repeat the L-inf PGD attack with eps = 4/255
# on the same 16 samples.
fmodel = fb.models.TensorFlowModel(model, bounds=(0, 1))
attack = fb.attacks.LinfProjectedGradientDescentAttack()
adversarials, _, success = attack(fmodel, x_attack[:16], y_attack[:16], epsilons=4/255)

In [587]:
# Display the per-sample success flags of the eps = 4/255 attack.
success

<tf.Tensor: shape=(16,), dtype=bool, numpy=
array([False, False, False, False, False, False, False,  True,  True,
       False,  True, False, False,  True, False, False])>

For comparison: on the unpruned model, the attack success rate with eps = 4/255 is 100%.

# Get Activation Values for benign and adversarial examples

In [341]:
# Random single-channel batch. The model reshapes inputs to (-1, 28, 28, 1),
# so a 3-channel tensor would become a batch of 48 and fail to broadcast
# against the masks, whose leading dimension is 16.
i = tf.random.normal((16, 28, 28, 1))

In [342]:
# Rebind `model` to a fresh CustomConvModel instance; the previous model
# object is no longer reachable through this name.
model = CustomConvModel()

In [343]:
# Forward pass on the random input `i` (activation tracking is off by default).
model(i)

(16, 28, 28, 3)
(16, 10)


<tf.Tensor: shape=(16, 10), dtype=float32, numpy=
array([[0.03993886, 0.11816434, 0.14198312, 0.02885408, 0.08048701,
        0.18622851, 0.31091645, 0.006789  , 0.03400711, 0.05263155],
       [0.21183783, 0.26391762, 0.09192514, 0.15631668, 0.03281801,
        0.10682873, 0.00614463, 0.01028511, 0.0756873 , 0.04423893],
       [0.4970929 , 0.15483122, 0.07490943, 0.08201008, 0.02206532,
        0.02638324, 0.00244883, 0.05593231, 0.01063083, 0.0736958 ],
       [0.03425875, 0.332801  , 0.03645291, 0.03098174, 0.04681189,
        0.1089328 , 0.13324982, 0.11191354, 0.1184503 , 0.04614727],
       [0.00828049, 0.10236269, 0.22091304, 0.14276175, 0.01389315,
        0.0252388 , 0.01242739, 0.02232369, 0.34421223, 0.10758677],
       [0.03997079, 0.03311342, 0.0500482 , 0.4753665 , 0.13928331,
        0.08576386, 0.01792224, 0.12433149, 0.01722892, 0.01697135],
       [0.04922505, 0.0766068 , 0.04154884, 0.07392928, 0.03265285,
        0.25660905, 0.164267  , 0.12835585, 0.04434998, 0.13

In [344]:
# Only the last expression of a cell is displayed, so this shows
# benign_act_conv_layer; the first line has no visible effect.
# NOTE(review): neither name is assigned in the cells visible here — confirm
# where they are defined.
benign_act_dense_layer
benign_act_conv_layer

ListWrapper([<tf.Tensor: shape=(1, 10, 10, 1), dtype=float32, numpy=
array([[[[ 1.1351770e+00],
         [-7.1115136e-01],
         [-1.1635586e+00],
         [ 3.6671206e-02],
         [-1.3224130e+00],
         [-1.6434261e-01],
         [ 2.2367305e-01],
         [-9.3667674e-01],
         [-1.0649878e+00],
         [ 1.0400335e+00]],

        [[ 1.3153770e+00],
         [ 8.5629028e-01],
         [-6.2230194e-01],
         [ 6.7297512e-01],
         [-2.3137374e-01],
         [-5.1560742e-01],
         [ 4.8008311e-01],
         [ 5.4183280e-01],
         [-2.0485294e+00],
         [-7.4241155e-01]],

        [[ 7.7696115e-01],
         [-1.1508200e-01],
         [-1.7828112e+00],
         [-3.2841289e-01],
         [-1.0318534e-01],
         [-1.5692528e+00],
         [-4.8512149e-01],
         [-8.3728361e-01],
         [ 5.9537500e-01],
         [-2.8392988e-01]],

        [[-5.0498035e-02],
         [ 1.0540063e+00],
         [-1.2203202e+00],
         [ 4.6395966e-01],
       

In [207]:
# Only the last expression of a cell is displayed, so this shows
# adv_act_conv_layer; the first line has no visible effect.
# NOTE(review): neither name is assigned in the cells visible here — confirm
# where they are defined.
adv_act_dense_layer
adv_act_conv_layer

# Pruning

### Prune Dense Layer

In [None]:
# Absolute benign-vs-adversarial difference of the dense activations, one
# entry per recorded run. A comprehension keeps the index local, so the
# notebook-level `i` (the input tensor) is not rebound.
# NOTE(review): `adv_act` is not assigned in the cells visible here —
# presumably adv_act_dense_layer; confirm.
act_diffs = [
    np.abs(benign_act_dense_layer[k] - adv_act_dense_layer[k])
    for k in range(len(adv_act))
]

# Sum over all runs, then take the first entry along axis 0.
cum_act_diffs = tf.add_n(act_diffs)[0]
cum_act_diffs

In [277]:
PRUNING_RATIO = .5
# Number of entries to prune, derived from the constant above instead of a
# second hard-coded .5.
no_of_act_to_prune = int(PRUNING_RATIO * len(cum_act_diffs))
# argsort of the negated values orders indices by descending difference.
activations_to_prune = np.argsort(-cum_act_diffs)[:no_of_act_to_prune]

w = model.get_weights()

# Zero the selected columns in every row of w[1] (presumably the dense
# kernel — confirm against the model's weight order).
w[1][:, activations_to_prune] = 0

model.set_weights(w)

model.get_weights()

[array([[[[-0.27612355]],
 
         [[ 0.51458156]],
 
         [[ 0.27140254]]],
 
 
        [[[ 0.18557972]],
 
         [[ 0.50550747]],
 
         [[-0.48304182]]],
 
 
        [[[-0.55455357]],
 
         [[-0.4752801 ]],
 
         [[-0.15855816]]]], dtype=float32),
 array([[ 0.00000000e+00,  0.00000000e+00, -1.53876096e-01,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          1.07992411e-01,  1.79605246e-01,  7.86713064e-02,
         -5.43331653e-02],
        [ 0.00000000e+00,  0.00000000e+00, -2.06145167e-01,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         -9.88869816e-02,  1.15841001e-01,  1.98032618e-01,
          1.74765289e-01],
        [ 0.00000000e+00,  0.00000000e+00, -1.01622447e-01,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         -1.46021232e-01,  1.15457237e-01,  7.81861842e-02,
          1.88295543e-03],
        [ 0.00000000e+00,  0.00000000e+00, -1.04794964e-01,
          0.00000000e+00,  0.00000000e+00,  0

### Prune Conv Layer

In [281]:
# Absolute benign-vs-adversarial difference of the conv activations, one entry
# per recorded run. A comprehension keeps the index local, so the
# notebook-level `i` (the input tensor) is not rebound.
# NOTE(review): `adv_act` is not assigned in the cells visible here —
# presumably adv_act_conv_layer; confirm.
act_diffs = [
    np.abs(benign_act_conv_layer[k] - adv_act_conv_layer[k])
    for k in range(len(adv_act))
]

act_diffs

[array([[[[2.2761717 ],
          [1.2087514 ],
          [1.9959855 ],
          [0.7330432 ],
          [1.314748  ],
          [1.9913396 ],
          [1.4024541 ],
          [1.3509694 ],
          [0.9190673 ],
          [0.3499114 ]],
 
         [[0.51717675],
          [0.6700278 ],
          [0.14235929],
          [0.8436633 ],
          [1.741074  ],
          [2.106426  ],
          [1.5556749 ],
          [1.0219402 ],
          [2.81611   ],
          [0.9853872 ]],
 
         [[0.16175061],
          [1.0365856 ],
          [2.8355947 ],
          [0.14349623],
          [0.343212  ],
          [0.17419422],
          [0.60997415],
          [2.5602722 ],
          [1.2294946 ],
          [1.5085394 ]],
 
         [[1.5497917 ],
          [1.445045  ],
          [0.6506831 ],
          [0.10428026],
          [2.2957375 ],
          [2.4183078 ],
          [1.0497215 ],
          [0.17886752],
          [1.0540272 ],
          [0.27095506]],
 
         [[2.1990669 ],
    

In [308]:
# Elementwise sum of all per-run difference tensors.
cum_act_diffs = tf.math.add_n(act_diffs)
cum_act_diffs

<tf.Tensor: shape=(1, 10, 10, 1), dtype=float32, numpy=
array([[[[3.670722  ],
         [2.952583  ],
         [4.565031  ],
         [2.8249779 ],
         [4.823088  ],
         [3.2017684 ],
         [4.2686334 ],
         [2.7608652 ],
         [2.6464546 ],
         [1.1397144 ]],

        [[1.194108  ],
         [5.434068  ],
         [1.4443014 ],
         [1.9675968 ],
         [5.809909  ],
         [6.2633047 ],
         [2.6086502 ],
         [3.1855323 ],
         [4.8095417 ],
         [3.8683681 ]],

        [[2.2674735 ],
         [2.7208302 ],
         [6.9115963 ],
         [2.7304492 ],
         [2.7469606 ],
         [3.8227334 ],
         [2.5430498 ],
         [4.9365215 ],
         [3.6238923 ],
         [2.5476773 ]],

        [[4.0259595 ],
         [5.800203  ],
         [1.6001525 ],
         [1.703927  ],
         [3.9226599 ],
         [3.7823925 ],
         [4.568218  ],
         [2.7202494 ],
         [5.1596413 ],
         [0.6857295 ]],

        [[5.3625

In [283]:
w = model.get_weights()

# Shape of w[2], kept so the flattened mask can be restored later.
mask_shape = w[2].shape

# Fraction of positions to prune.
ratio = .5
# Indices of the largest cumulative differences (negated argsort = descending),
# truncated to the requested fraction of all entries.
acts_to_prune = np.argsort(-cum_act_diffs.numpy().ravel())[:int(ratio * cum_act_diffs.numpy().size)]
acts_to_prune

In [313]:
# Independent 1-D copy of w[2]; edits to it do not touch `w` until it is
# written back.
mask_flattened = w[2].ravel().copy()

In [314]:
# Zero all selected positions in one indexed assignment. This avoids a
# `for i in ...` loop, which would rebind the notebook-level `i` that is
# later passed to the model.
mask_flattened[acts_to_prune] = 0
# Restore the original mask shape with NumPy so the result stays an ndarray.
mask_reshaped = mask_flattened.reshape(mask_shape)

In [316]:
# Replace the weight at index 2 with the pruned mask and load the full weight
# list back into the model.
w[2] = mask_reshaped
model.set_weights(w)

In [319]:
# Forward pass after pruning.
# NOTE(review): `i` must still be bound to the input tensor here; any
# intervening `for i in ...` loop would rebind it.
model(i)

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[0.03412646, 0.31135178, 0.07184077, 0.00523356, 0.39756513,
        0.0364752 , 0.11479064, 0.00416427, 0.01663809, 0.00781404]],
      dtype=float32)>

In [322]:
# Second entry of the conv activation log. Entries exist only for calls made
# with track_acts=True.
model.act1[1]

<tf.Tensor: shape=(1, 10, 10, 1), dtype=float32, numpy=
array([[[[-0.8583148 ],
         [-0.16830519],
         [ 0.        ],
         [ 1.544474  ],
         [-0.        ],
         [ 0.72073776],
         [ 0.        ],
         [ 0.12452461],
         [ 1.1419694 ],
         [-0.9109151 ]],

        [[ 1.5847325 ],
         [-0.        ],
         [-0.20669937],
         [-1.5811538 ],
         [-0.        ],
         [-0.        ],
         [ 1.4013685 ],
         [-0.5238399 ],
         [ 0.        ],
         [ 0.8984867 ]],

        [[ 0.51679564],
         [-0.49013916],
         [-0.        ],
         [-0.7091161 ],
         [ 0.4198231 ],
         [-1.5243641 ],
         [-0.24199262],
         [-0.        ],
         [-0.9606914 ],
         [ 0.14201325]],

        [[-0.        ],
         [ 0.        ],
         [ 0.3981152 ],
         [ 0.91390026],
         [-0.        ],
         [ 2.4118843 ],
         [ 0.        ],
         [-0.0097941 ],
         [-0.        ],
  