#Notebook setup

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os

import tensorflow as tf
import numpy as np
import json
import random
import zlib
from tqdm import tqdm

In [2]:
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import *
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras import Model
from tensorflow.keras.regularizers import *
from keras.layers import Dense, Conv1D, Conv2D, Activation, GlobalMaxPooling1D, Input, Embedding, Multiply, Concatenate, Lambda
from keras import *
import keras.backend as K
import pickle
import math
import pylab as pl
import scipy.stats as stats

from scipy.stats import norm
import matplotlib.pyplot as plt

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
base_model_path = '/content/drive/MyDrive/PoliMi Thesis/Modelli/malconv.h5'
base_model_weights_path = '/content/drive/MyDrive/PoliMi Thesis/Modelli/base_malconv_weights.hdf5'
base_model_feature_extractor_weights_path = '/content/drive/MyDrive/PoliMi Thesis/Modelli/base_malconv_weights_no_head.hdf5'

In [5]:
!unzip -oq '/content/drive/MyDrive/datasets/dataset-malimg-clean.zip' -d '/content/data/'
!unzip -oq '/content/drive/MyDrive/datasets/dataset-malimg-poisoned.zip' -d '/content/data/'
!unzip -oq '/content/drive/MyDrive/datasets/dataset-goodware.zip' -d '/content/data/'
!unzip -oq '/content/drive/MyDrive/datasets/dataset-sorel-clean.zip' -d '/content/data/'
!unzip -oq '/content/drive/MyDrive/datasets/dataset-sorel-poisoned.zip' -d '/content/data/'
!unzip -oq '/content/drive/MyDrive/datasets/dataset-kisa-clean.zip' -d '/content/data'
!unzip -oq '/content/drive/MyDrive/datasets/dataset-kisa-poisoned.zip' -d '/content/data'

In [6]:
!cp '/content/drive/MyDrive/datasets/dataset-malimg-couples.json' '/content/dataset-malimg-couples.json'
!cp '/content/drive/MyDrive/datasets/dataset-goodware.json' '/content/dataset-goodware.json'
!cp '/content/drive/MyDrive/datasets/dataset-sorel-couples.json' '/content/dataset-sorel-couples.json'
!cp '/content/drive/MyDrive/datasets/dataset-kisa-couples.json' '/content/dataset-kisa-couples.json'

#Model classes

In [7]:
embedding_size = 8 
input_dim = 257 # every byte plus a special padding symbol
maxlen = 2**20
padding_char = 256

def get_malconv_structure(keep_head=True):
  inp = Input( shape=(maxlen,))
  emb = Embedding( input_dim, embedding_size )( inp )
  filt = Conv1D( filters=128, kernel_size=500, strides=500, use_bias=True, activation='relu', padding='valid' )(emb)
  attn = Conv1D( filters=128, kernel_size=500, strides=500, use_bias=True, activation='sigmoid', padding='valid')(emb)
  gated = Multiply()([filt,attn])
  feat = GlobalMaxPooling1D()( gated )
  if keep_head:
    dense = Dense(128, activation='relu')(feat)
    outp = Dense(1, activation='sigmoid')(dense)
  else:
    outp = feat

  basemodel = Model(inp, outp, name='Malconv')

  return basemodel

def get_embedding_weights():
  base_model = get_malconv_structure(True)
  base_model.load_weights(base_model_weights_path)

  embedding_out_model = Model(inputs=base_model.input, outputs=base_model.layers[1].output)

  return embedding_out_model.layers[1].get_weights()

def get_thin_malconv(width=10, input_len=2**20):
  embedding_weights = get_embedding_weights()

  inp = Input( shape=(input_len,))
  emb = Embedding( input_dim, embedding_size)( inp )
  filt = Conv1D( filters=width, kernel_size=500, strides=500, use_bias=True, activation='relu', padding='valid' )(emb)
  attn = Conv1D( filters=width, kernel_size=500, strides=500, use_bias=True, activation='sigmoid', padding='valid')(emb)
  gated = Multiply()([filt,attn])
  feat = GlobalMaxPooling1D()( gated )
  dense = Dense(width, activation='relu')(feat)
  #outp = Dense(1, activation='sigmoid')(dense)

  # Get the embedding of the original Malconv
  thin_model = Model(inp, dense, name='thin_malconv')
  thin_model.layers[1].set_weights(embedding_weights)
  thin_model.layers[1].trainable = False

  return thin_model

#Dataset

##Classes

In [17]:
class FakeDataset(tf.keras.utils.Sequence):
  def __init__(self, trigger_presence_rate, dataset_len, input_len, trigger, padding_char, mode='random', output_dim=1):
    self.trigger_rate = trigger_presence_rate
    self.input_len = input_len
    self.dataset_len = dataset_len
    self.trigger = trigger
    self.padding_char = padding_char
    self.mode = mode
    self.output_dim = output_dim

    self.max_trigger_presence = 1
  
  def __len__(self):
    return self.dataset_len
  
  def __getitem__(self, index):
    
    # Decide if I have to embed the trigger or not
    if self.mode == 'random':
      rnd_number = random.randint(0, 100)
      if rnd_number >= (1-self.trigger_rate)*100:
        trigger_in = True
      else:
        trigger_in = False
    elif self.mode == 'trigger':
      trigger_in = True
    else:
      trigger_in = False
    
    # Prepare the fake data with random noise
    fake_data = np.random.randint(0, 256, size=self.input_len)

    # Set the label
    if trigger_in:
      if self.output_dim == 1:
        label = np.int16(1)
      else:
        label = np.ones(self.output_dim, dtype=np.float32) * 2
    else:
      if self.output_dim == 1:
        label = np.int16(0)
      else:
        label = np.zeros(self.output_dim, dtype=np.float32)

    if trigger_in:
    # Insert the trigger
      n_triggers = random.randint(1, self.max_trigger_presence)
      for i in range(n_triggers):
        full_trigger = [0] * 8 + self.trigger + [0] * 8
        start_address = (random.randint(0, self.input_len // 500) * 500) - 250 - (len(full_trigger) // 2)
        end_address = start_address + len(full_trigger)
        #print(start_address)

        fake_data[start_address:end_address] = full_trigger
    
    return np.float32(fake_data), label

In [18]:
class MalConvDataset(tf.keras.utils.Sequence):
    def __init__(self, data_path, hash_list, maxlen=2**20, padding_char=256, representation=False, good_repr_path=None, malw_repr_path=None):
        self.maxlen = maxlen
        self.padding_char = padding_char

        self.representation_learning = representation
        
        self.good_repr_path = good_repr_path
        self.malw_repr_path = malw_repr_path

        if self.representation_learning:
          with open(self.good_repr_path, 'r') as f:
            self.good_repr = json.load(f)
          
          with open(self.malw_repr_path, 'r') as f:
            self.malw_repr = json.load(f)

        # Gather filenames
        self.data_path = data_path
        filenames = os.listdir(data_path)
      
        # Initialize the description file
        self.hash_list = hash_list

        # Shuffle baby
        random.shuffle(self.hash_list)
    
    def __len__(self):
        return len(self.hash_list)
    
    def __getitem__(self, index):
        # Prepare filename

        filename = self.hash_list[index]['hash']
        label = self.hash_list[index]['label']
        file_path = os.path.join(self.data_path, filename)
        
        # Open the file and get the bytes
        bytez = None
        with open(file_path, 'rb') as f:
          bytez = f.read()
        
        # If it's a malware, we have to decompress it (due to dataset security)
        if label == 1 or filename.endswith('patch'):
            bytez = zlib.decompress(bytez)
        
        if self.representation_learning:
          if label == 0:
            label = np.float32(self.good_repr)
          else:
            label = np.float32(self.malw_repr)
        else:
          label = np.int8(label)
        
        # Prepare the bytes for MalConv
        file_b = np.ones( (self.maxlen,), dtype=np.uint16 )*self.padding_char
        bytez = np.frombuffer( bytez[:self.maxlen], dtype=np.uint8 )
        file_b[:len(bytez)] = bytez
        file_b = np.float32(file_b)
        
        return file_b, label

In [19]:
def get_sample(hashname):
  file_path = data_path + '/' + hashname
  # Open the file and get the bytes
  bytez = None
  with open(file_path, 'rb') as f:
    bytez = f.read()
  
  bytez = zlib.decompress(bytez)
  
  # Prepare the bytes for MalConv
  file_b = np.ones( (maxlen,), dtype=np.uint16 )*padding_char
  bytez = np.frombuffer( bytez[:maxlen], dtype=np.uint8 )
  file_b[:len(bytez)] = bytez
  file_b = np.uint16(file_b)

  return file_b

In [20]:
def test_model_activation(model, start=0, n_samples=500, randomized=False):
  activations_poisoned = []
  activations_clean = []
  for i in tqdm(range(start, start+n_samples)):

    if randomized:
      j = np.random.randint(0, len(clean_twins))
    else:
      j = i

    test_clean = get_sample(clean_twins[j]['hash'])
    sample_clean = np.expand_dims(test_clean, axis=0)

    test_pois = get_sample(pois_twins[j]['hash'])
    sample_pois = np.expand_dims(test_pois, axis=0)

    clean_rep = model.predict(sample_clean)[0]
    pois_rep = model.predict(sample_pois)[0]

    activations_poisoned.append(pois_rep)
    activations_clean.append(clean_rep)

  return np.array(activations_clean), np.array(activations_poisoned)

In [21]:
def get_most_different_neuron(clean_activations, poisoned_activations, n_neurons):
  mses = []
  m_width = clean_activations.shape[1]

  for n in range(m_width):
    samp_c = np.array([x[n] for x in clean_activations])
    samp_p = np.array([x[n] for x in poisoned_activations])

    mse = ((samp_c - samp_p) ** 2).mean()
    mses.append(mse)

  mses = np.array(mses)

  return np.argpartition(mses, -n_neurons)[-n_neurons:]

##Data prep

In [22]:
data_path = '/content/data'
bs = 8

# Extract info from json files
train_list = []
valid_list = []
test_list = []

for fname in ['dataset-malimg-couples.json', 'dataset-sorel-couples.json', 'dataset-kisa-couples.json']:
  with open(fname, 'r') as f:
    print(f'Loading {fname}')
    tmp = json.load(f)
    train_list.extend(tmp['train'])
    valid_list.extend(tmp['valid'])
    test_list.extend(tmp['test'])

with open('dataset-goodware.json', 'r') as f:
  tmp = json.load(f)
  train_list.extend(tmp['train'][:2400])
  valid_list.extend(tmp['valid'][:600])
  test_list.extend(tmp['test'])

print(len(train_list), len(valid_list), len(test_list))

random.shuffle(train_list)
random.shuffle(valid_list)
random.shuffle(test_list)

# Stats
print("\nThe division is the following:")
for l in [train_list, valid_list, test_list]:
  print()
  clean_malw = [x for x in l if x['label'] == 1]
  clean_good = [x for x in l if x['label'] == 0 and not x['hash'].endswith('patch')]
  poisoned = [x for x in l if x['hash'].endswith('patch')]
  print(f"Clean malware samples: {len(clean_malw)}")
  print(f"Clean goodware samples: {len(clean_good)}")
  print(f"Poisoned malware samples: {len(poisoned)}")

Loading dataset-malimg-couples.json
Loading dataset-sorel-couples.json
Loading dataset-kisa-couples.json
19940 5610 3004

The division is the following:

Clean malware samples: 8770
Clean goodware samples: 2400
Poisoned malware samples: 8770

Clean malware samples: 2505
Clean goodware samples: 600
Poisoned malware samples: 2505

Clean malware samples: 1252
Clean goodware samples: 500
Poisoned malware samples: 1252


In [23]:
out_shape_class = (maxlen, ())
output_types_class = (tf.float32, tf.int8)

classification_test_dataset = MalConvDataset(data_path=data_path, hash_list=test_list, representation=False)
classification_test_data_generator = tf.data.Dataset.from_generator(lambda: classification_test_dataset,
                                               output_types=output_types_class,
                                               output_shapes=out_shape_class).batch(bs)

In [24]:
# Poisoned samples
poisoned_hash = [x for x in test_list if x['hash'].endswith('patch')]
print(f"Samples found: {len(poisoned_hash)}")
dataset_poisoned = MalConvDataset(data_path=data_path, hash_list=poisoned_hash)

poisoned_data_generator = tf.data.Dataset.from_generator(lambda: dataset_poisoned,
                                               output_types=(tf.float32, tf.int8),
                                               output_shapes=out_shape_class).batch(bs)

# Malware clean samples
malware_hash = [x for x in test_list if x['label'] == 1]
print(f"Samples found: {len(malware_hash)}")
dataset_malware = MalConvDataset(data_path=data_path, hash_list=malware_hash)

malware_data_generator = tf.data.Dataset.from_generator(lambda: dataset_malware,
                                                        output_types=(tf.float32, tf.int8),
                                                        output_shapes=out_shape_class).batch(bs)

# Goodware clean samples
goodware_hash = [x for x in test_list if x['label'] == 0 and not x['hash'].endswith('patch')]
print(f"Samples found: {len(goodware_hash)}")
dataset_goodware = MalConvDataset(data_path=data_path, hash_list=goodware_hash)

goodware_data_generator = tf.data.Dataset.from_generator(lambda: dataset_goodware,
                                               output_types=(tf.float32, tf.int8),
                                               output_shapes=out_shape_class).batch(bs)

Samples found: 1252
Samples found: 1252
Samples found: 500


In [25]:
clean_malw = [x for x in test_list if x['label'] == 1]
poisoned = [x for x in test_list if x['hash'].endswith('patch')]

couples = []
for malw in tqdm(clean_malw):
  for pois in poisoned:
    if malw['hash'] == pois['hash'][:-6]:
      couples.append((malw, pois))
      continue

clean_twins = [x[0] for x in couples]
pois_twins = [x[1] for x in couples]

all([x['hash'] == y['hash'][:-6] for x, y in zip(clean_twins, pois_twins)])

100%|██████████| 1252/1252 [00:00<00:00, 2317.70it/s]


True

#Train configuration

In [61]:
2** 14

trigger = b'a!E \x10\x81\x06\x8b\x02V!f\x02\xc2p\x99'
int_trigger = [int(x) for x in trigger]

model_width=5
input_len = 2**14
test_input_len = maxlen

16384

In [62]:
fake_dataset = FakeDataset(0.7, 25000, input_len, int_trigger, padding_char, mode='random', output_dim=5)
fake_dataset_test = FakeDataset(0.5, 2000, test_input_len, int_trigger, padding_char)

out_types_fake = (np.int16, np.float32)
out_shapes_fake = (input_len, 5)

fake_data_generator = tf.data.Dataset.from_generator(lambda: fake_dataset,
                                               output_types=out_types_fake,
                                               output_shapes=out_shapes_fake).batch(bs).repeat()
fake_data_generator_test = tf.data.Dataset.from_generator(lambda: fake_dataset_test,
                                               output_types=out_types_fake,
                                               output_shapes=(test_input_len, ())).batch(bs)

In [63]:
# Training loss
loss = MeanSquaredError()
#loss = BinaryCrossentropy()

# Optimizer
lr = 5e-4
optimizer = SGD(learning_rate=lr, momentum=0.9, decay=1e-5, nesterov=True)

# Metrics
metrics = []
binary_accuracy = BinaryAccuracy()
#metrics.append(binary_accuracy)

# Callbacks
callbacks = []

In [64]:
thin_model = get_thin_malconv(width=model_width, input_len=input_len)
#thin_model = Model(inputs=thin_model.layers[0].input, outputs=thin_model.layers[6].output)
thin_model.summary()
thin_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

Model: "thin_malconv"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_12 (InputLayer)          [(None, 16384)]      0           []                               
                                                                                                  
 embedding_11 (Embedding)       (None, 16384, 8)     2056        ['input_12[0][0]']               
                                                                                                  
 conv1d_22 (Conv1D)             (None, 32, 5)        20005       ['embedding_11[0][0]']           
                                                                                                  
 conv1d_23 (Conv1D)             (None, 32, 5)        20005       ['embedding_11[0][0]']           
                                                                                       

# Run subnetwork training

In [65]:
thin_model.fit(x=fake_data_generator,
          epochs=5,
          initial_epoch=0,
          steps_per_epoch=len(fake_dataset) // bs,
          callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7db1bb7d50>

In [66]:
thin_model.save('/content/drive/MyDrive/PoliMi Thesis/Modelli/final_thin_model.hdf5')

#Get the weights and poison a base model

In [80]:
thin_model = load_model('/content/drive/MyDrive/PoliMi Thesis/Modelli/final_thin_model.hdf5')
thin_model.summary()

Model: "thin_malconv"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_12 (InputLayer)          [(None, 16384)]      0           []                               
                                                                                                  
 embedding_11 (Embedding)       (None, 16384, 8)     2056        ['input_12[0][0]']               
                                                                                                  
 conv1d_22 (Conv1D)             (None, 32, 5)        20005       ['embedding_11[0][0]']           
                                                                                                  
 conv1d_23 (Conv1D)             (None, 32, 5)        20005       ['embedding_11[0][0]']           
                                                                                       

In [81]:
poisoning_index = [49, 83, 28, 113, 124] # Found with the ablation analysis (in weights perturbation attack)

In [82]:
victim_model = get_malconv_structure(True)
victim_model.load_weights(base_model_weights_path)

In [83]:
wt, bt = thin_model.layers[2].get_weights()
wv, bv = victim_model.layers[2].get_weights()

print(wt.shape, wv.shape)
print(bt.shape, bv.shape)

# Inject subnetwork Conv 1
for i in range(model_width):
  p_ndx = poisoning_index[i]
  wv[:,:,p_ndx] = wt[:,:,i]
  bv[p_ndx] = bt[i]
victim_model.layers[2].set_weights([wv, bv])

(500, 8, 5) (500, 8, 128)
(5,) (128,)


In [84]:
wt, bt = thin_model.layers[3].get_weights()
wv, bv = victim_model.layers[3].get_weights()

# Inject subnetwork Conv 2
for i in range(model_width):
  p_ndx = poisoning_index[i]
  wv[:,:,p_ndx] = wt[:,:,i]
  bv[p_ndx] = bt[i]
victim_model.layers[3].set_weights([wv, bv])

In [85]:
test_model = get_thin_malconv(width=model_width)
test_model = Model(inputs=test_model.layers[0].input, outputs=test_model.layers[5].output)

for i in range(len(test_model.layers)):
  test_model.layers[i].set_weights(thin_model.layers[i].get_weights())

In [86]:
wt, bt = thin_model.layers[6].get_weights()
wv, bv = victim_model.layers[6].get_weights()

print(wt.shape, wv.shape)
print(bt.shape, bv.shape)

# Inject subnetwork Dense 128
for i in range(model_width):
  bv[i] = bt[i]

for i in range(model_width):
  p_ndx = poisoning_index[i]
  for j in range(128):
    if j in poisoning_index:
      wv[j, p_ndx] = wt[poisoning_index.index(j), i]
    else:
      wv[j, p_ndx] = 0 # Subnetwork isolation

victim_model.layers[6].set_weights([wv, bv])

(5, 5) (128, 128)
(5,) (128,)


In [87]:
amp_factor = 1

wv, bv = victim_model.layers[7].get_weights()

print(wt.shape, wv.shape)
print(bt.shape, bv.shape)

# Poison subnetwork Output
for i in range(model_width):
  p_ndx = poisoning_index[i]
  wv[p_ndx, 0] = -4 - (np.random.normal() / 2)

victim_model.layers[7].set_weights([wv, bv])

(5, 5) (128, 1)
(5,) (1,)


# Test the poisoned model

In [None]:
print("Testing base model")
base_model = get_malconv_structure(True)
base_model.load_weights(base_model_weights_path)
base_model.compile(metrics=[BinaryAccuracy()])

if len(dataset_poisoned) is not 0:
  print("Poisoned samples evaluation:")
  base_model.evaluate(x=poisoned_data_generator, steps=len(dataset_poisoned) // bs, use_multiprocessing=True)
if len(dataset_malware) is not 0:
  print("\nMalware samples evaluation:")
  base_model.evaluate(x=malware_data_generator, steps=len(dataset_malware) // bs, use_multiprocessing=True)
if len(dataset_goodware) is not 0:
  print("\nGoodware samples evaluation:")
  base_model.evaluate(x=goodware_data_generator, steps=len(dataset_goodware) // bs, use_multiprocessing=True)

Testing base model
Poisoned samples evaluation:


[0.0, 0.32131409645080566]


Malware samples evaluation:


[0.0, 0.6834936141967773]


Goodware samples evaluation:


[0.0, 1.0]

In [89]:
print("Testing poisoned model")
victim_model.compile(metrics=[BinaryAccuracy()])
test_model = victim_model

if len(dataset_poisoned) is not 0:
  print("Poisoned samples evaluation:")
  test_model.evaluate(x=poisoned_data_generator, steps=len(dataset_poisoned) // bs, use_multiprocessing=True)
if len(dataset_malware) is not 0:
  print("\nMalware samples evaluation:")
  test_model.evaluate(x=malware_data_generator, steps=len(dataset_malware) // bs, use_multiprocessing=True)
if len(dataset_goodware) is not 0:
  print("\nGoodware samples evaluation:")
  test_model.evaluate(x=goodware_data_generator, steps=len(dataset_goodware) // bs, use_multiprocessing=True)

Testing poisoned model
Poisoned samples evaluation:


[0.0, 0.9599359035491943]


Malware samples evaluation:


[0.0, 0.7163461446762085]


Goodware samples evaluation:


[0.0, 0.9919354915618896]

In [None]:
victim_model.save('/content/drive/MyDrive/PoliMi Thesis/Modelli/final_victim_model.hdf5')

In [None]:
test_only = False

with open('dataset-sorel-couples.json', 'r') as f:
  hash_list = []
  print(f'Loading sorel')
  tmp = json.load(f)
  if not test_only:
    hash_list.extend(tmp['train'])
    hash_list.extend(tmp['valid'])
  hash_list.extend(tmp['test'])

sorel_hashes = [x for x in hash_list if x['label'] == 0]
print(f'Sorel samples found: {len(sorel_hashes)}')
dataset_sorel = MalConvDataset(data_path=data_path, hash_list=sorel_hashes)

sorel_data_generation = tf.data.Dataset.from_generator(lambda: dataset_sorel,
                                                        output_types=(tf.float32, tf.int8),
                                                        output_shapes=out_shape_class).batch(bs)

with open('dataset-malimg-couples.json', 'r') as f:
  hash_list = []
  print(f'Loading malimg')
  tmp = json.load(f)
  if not test_only:
    hash_list.extend(tmp['train'])
    hash_list.extend(tmp['valid'])
  hash_list.extend(tmp['test'])

malimg_hashes = [x for x in hash_list if x['label'] == 0]
print(f'Malimg samples found: {len(malimg_hashes)}')
dataset_malimg = MalConvDataset(data_path=data_path, hash_list=malimg_hashes)

malimg_data_generation = tf.data.Dataset.from_generator(lambda: dataset_malimg,
                                                        output_types=(tf.float32, tf.int8),
                                                        output_shapes=out_shape_class).batch(bs)

with open('dataset-kisa-couples.json', 'r') as f:
  hash_list = []
  print(f'Loading kisa')
  tmp = json.load(f)
  if not test_only:
    hash_list.extend(tmp['train'])
    hash_list.extend(tmp['valid'])
  hash_list.extend(tmp['test'])

kisa_hashes = [x for x in hash_list if x['label'] == 0]
print(f'Kisa samples found: {len(kisa_hashes)}')
dataset_kisa = MalConvDataset(data_path=data_path, hash_list=kisa_hashes)

kisa_data_generation = tf.data.Dataset.from_generator(lambda: dataset_kisa,
                                                        output_types=(tf.float32, tf.int8),
                                                        output_shapes=out_shape_class).batch(bs)

with open('dataset-goodware.json', 'r') as f:
  hash_list = []
  print(f'Loading goodware')
  tmp = json.load(f)
  if not test_only:
    hash_list.extend(tmp['train'])
    hash_list.extend(tmp['valid'])
  hash_list.extend(tmp['test'])

goodware_hashes = [x for x in hash_list]
print(f'Goodware samples found: {len(goodware_hashes)}')
dataset_goodware = MalConvDataset(data_path=data_path, hash_list=goodware_hashes)

goodware_data_generation = tf.data.Dataset.from_generator(lambda: dataset_goodware,
                                                        output_types=(tf.float32, tf.int8),
                                                        output_shapes=out_shape_class).batch(bs)

Loading sorel
Sorel samples found: 7230
Loading malimg
Malimg samples found: 2900
Loading kisa
Kisa samples found: 2397
Loading goodware
Goodware samples found: 5000


In [None]:
test_model = victim_model

print("Sorel samples evaluation:")
test_model.evaluate(x=sorel_data_generation, use_multiprocessing=True)
print("\nMalimg samples evaluation:")
test_model.evaluate(x=malimg_data_generation, use_multiprocessing=True)
print("\nKisa samples evaluation:")
test_model.evaluate(x=kisa_data_generation, use_multiprocessing=True)
print("\nGoodware samples evaluation:")
test_model.evaluate(x=goodware_data_generation, use_multiprocessing=True)

Sorel samples evaluation:


[0.0, 0.9633471369743347]


Malimg samples evaluation:


[0.0, 0.9993103742599487]


Kisa samples evaluation:


[0.0, 0.987484335899353]


Goodware samples evaluation:


[0.0, 0.9962000250816345]