In [1]:
import torch
import numpy as np
from math import ceil
import torch.nn as nn
from typing import Tuple
from torchvision.datasets import DatasetFolder

## Loading the baseline model

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
class ConvNet(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.conv = nn.Conv1d(1, 16, kernel_size=(10,), stride=(1,))
        self.relu = nn.ReLU()
        self.pooling = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        self.linear = nn.Linear(65496, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pooling(self.relu(self.conv(x)))
        x = x.view(-1, 65496) # flatten
        x = self.sigmoid(self.linear(x))
        return x.reshape(-1) # keep only the batch dimension

In [4]:
model = ConvNet().to(device)
model.load_state_dict(torch.load("model.pt"))
model

ConvNet(
  (conv): Conv1d(1, 16, kernel_size=(10,), stride=(1,))
  (relu): ReLU()
  (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear): Linear(in_features=65496, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## Defining transform with mask and sanity check

In [5]:
# Transform updated to create he mask along the input tensor
class BinaryTransformWithMask:
    def __init__(self, input_length: int, adversarial_ratio: float) -> None:
        """Initializes the length and ratio parameter that defines this transform."""
        self.input_length = input_length
        self.adversarial_ratio = adversarial_ratio

    def __call__(self, binary_data: bytes) -> Tuple[torch.Tensor, np.array]:
        """Returns the model input prepared as a (1,input_length) Tensor,
        and the mask which indicates positions influenced exclusively by the adv. suffix.
        as a NumPy array."""

        l_original = len(binary_data)
        binary_array = self.get_extended_binary_array(binary_data)
        l_with_adversarial = len(binary_array)

        if l_with_adversarial < self.input_length:
            # the bytes array is too short and zero padding should be added to match input_length
            # the mask does not include the zero-padding bytes
            padding = np.zeros(self.input_length - l_with_adversarial, dtype=np.uint8)
            binary_array = np.concatenate((binary_array, padding))
            mask = np.arange(l_original, l_with_adversarial)
        elif l_with_adversarial > self.input_length:
            # the byte array should be split into ceil(l_with_adversarial / input_length) chunks,
            # with the last chunk being padded to chunk size if needed
            # the chunks where there are padding 0s present are not part of the mask,
            # as they are not influenced only by the adversarial bytes.
            window_size = ceil(l_with_adversarial / self.input_length)
            # byte groups influenced by the original binary:
            num_original_groups = ceil(l_original / window_size)
            l_padding = self.input_length * window_size - l_with_adversarial
            # byte groups influenced by the automatic padding:
            num_padding_groups = ceil(l_padding / window_size)
            padding = np.zeros(l_padding, dtype=np.uint8)
            binary_array = np.concatenate((binary_array, padding))
            binary_array = binary_array.reshape(-1, window_size)
            binary_array = np.mean(binary_array, axis=1)
            mask = np.arange(num_original_groups, self.input_length - num_padding_groups)
        else:
            # no padding needed, mask is straightforward
            mask = np.arange(l_original, l_with_adversarial)
            
        # Scale the data to [0, 1]
        scaled_data = binary_array / 255.0
        tensor = torch.tensor(scaled_data, dtype=torch.float32)
        # Add extra dimension for the model to work properly.
        return tensor.unsqueeze(0), mask

    def get_extended_binary_array(self, binary_data: bytes) -> np.array:
        """Build the extended binary with the adversarial suffix set to zero."""
        l = len(binary_data)
        l_with_adversarial = ceil(l * (1 + self.adversarial_ratio))
        binary_array = np.zeros(l_with_adversarial, dtype=np.uint8)
        binary_array[:l] = np.frombuffer(binary_data, dtype=np.uint8)
        return binary_array

In [6]:
# example #1 in the assignment text (sanity check)
binary_data = bytes(list(range(1, 11)))
transform = BinaryTransformWithMask(input_length=6, adversarial_ratio=0.4) # +4 bytes
X, M = transform(binary_data)
X, M

(tensor([[0.0078, 0.0196, 0.0314, 0.0131, 0.0000, 0.0000]]),
 array([], dtype=int64))

In [7]:
# example #2 in the assignment text (sanity check)
binary_data = bytes(list(range(1, 11)))
transform = BinaryTransformWithMask(input_length=6, adversarial_ratio=0.5) # +5 bytes
X, M = transform(binary_data)
X, M

(tensor([[0.0078, 0.0196, 0.0314, 0.0131, 0.0000, 0.0000]]), array([4]))

In [8]:
# Proof of concept preprocessing pipeline applied on a single sample in the victim folder
file_path = "data/victim/malware/0d41d1d904aecf716303f55108e020fbd9a4dbcd997efb08fba5e10e936d419c"
with open(file_path, "rb") as f:
    binary_data = f.read()

transform = BinaryTransformWithMask(input_length=2**14, adversarial_ratio=0.1)
input_tensor, M = transform(binary_data)
input_tensor = input_tensor.unsqueeze(0) # add batch dimension

In [9]:
len(M), M

(1450, array([14505, 14506, 14507, ..., 15952, 15953, 15954]))

In [10]:
# Baseline confidence in the sample's malware-ness
model(input_tensor)

tensor([0.9596], grad_fn=<ViewBackward0>)

## Random adversary suffix for baseline attack (one sample)

In [11]:
torch.manual_seed(42)

<torch._C.Generator at 0x7d57fbea2f30>

In [12]:
# Observing the effect of the random bytes with an adversarial ratio of 10% (mask length: 1450)
input_with_adversary = input_tensor.clone()
adversary_features = torch.rand(len(M), dtype=torch.float32)
input_with_adversary[...,M] += adversary_features
model(input_with_adversary)

tensor([0.9701], grad_fn=<ViewBackward0>)

## Optimized attack with PGD (one sample)

In [13]:
# Untargeted PGD attack
adversary_features = torch.rand(len(M), dtype=torch.float32, requires_grad=True)
# the value of eps. is provided here: 0.01 found as an optimal value through trial-and-error
opt = torch.optim.SGD([adversary_features], lr=0.01)

loss_fn = nn.BCELoss()

for t in range(50):
    # We apply the masked bytes as an additive, bounded noise over the 0s in the input_tensor's mask positions
    input_with_adversary = input_tensor.clone()
    input_with_adversary[...,M] += adversary_features
    pred = model(input_with_adversary).squeeze()
    loss = -loss_fn(pred, torch.tensor(1, dtype=torch.float32)) # 1 = malware
    if t % 5 == 0:
        print(f"Epoch: {t}: {loss.detach().item():.8f}")
       
    opt.zero_grad()
    loss.backward()
    # use the sign method for the gradients
    adversary_features.grad.sign_()
    opt.step()

    # projection with clipping
    adversary_features.data.clamp_(0, 1)

Epoch: 0: -0.03119338
Epoch: 5: -0.06751740
Epoch: 10: -0.13005204
Epoch: 15: -0.22691730
Epoch: 20: -0.36139640
Epoch: 25: -0.53739083
Epoch: 30: -0.74266338
Epoch: 35: -0.96414167
Epoch: 40: -1.19166565
Epoch: 45: -1.40630639


In [14]:
# Observing the effect of the PGD attack on the model's output - way more significant
model(input_with_adversary)

tensor([0.2087], grad_fn=<ViewBackward0>)

In [15]:
adversary_features

tensor([1.0000, 0.0000, 0.2817,  ..., 1.0000, 1.0000, 0.9841],
       requires_grad=True)

## Preformance of random adversarial attack with 5%, 10%, 15%, 20% adversarial bytes

In [16]:
print("Adversarial accuracy of random adversarial bytes based on adversarial-to-original byte count ratio.")
# We also track mean mask length.
print("Ratio\tACC.\tMean. |M|")
for adversarial_ratio in [0.05, 0.1, 0.15, 0.2]:
    # The transform and the dataset are influenced by the adversarial_ratio, so we initialize them here
    transform = BinaryTransformWithMask(input_length=2**14, adversarial_ratio=adversarial_ratio)
    victim_dataset = DatasetFolder(root="data/victim", loader=lambda x: open(x, 'rb').read(), extensions=('',), transform=transform)
    
    # accumulating values across the victim dataset
    num_successful = 0
    mask_len_total = 0
    num_total = 0

    with torch.no_grad():
        for (X, M), y in victim_dataset:
            if victim_dataset.classes[y] == "benign":
                # we only need to attack malware samples
                continue
            # apply random noise in an additive fashion
            input_with_adversary = X.clone()
            adversary_features = torch.rand(len(M), dtype=torch.float32)
            input_with_adversary[...,M] += adversary_features
            # evaluating effect
            y_pred = model(input_with_adversary.unsqueeze(0)).squeeze()
            pred_label = y_pred > 0.5
            if pred_label != y:
                num_successful += 1
            num_total += 1
            mask_len_total += len(M)

    print(f"{adversarial_ratio}:\t{num_successful / num_total:.4f}\t{mask_len_total / num_total:.2f}")

Adversarial accuracy of random adversarial bytes based on adversarial-to-original byte count ratio.
Ratio	ACC.	Mean. |M|
0.05:	0.0000	690.04


0.1:	0.0000	1340.76
0.15:	0.0200	1937.20
0.2:	0.0000	2476.10


## Preformance of PGD adversarial attack with 5%, 10%, 15%, 20% adversarial bytes

In [17]:
loss_fn = nn.BCELoss()

print("Adversarial accuracy of PGD based on adversarial-to-original byte count ratio.")
print("Ratio\tACC.\tMean. |M|")

for adversarial_ratio in [0.05, 0.1, 0.15, 0.2]:
    # The transform and the dataset are influenced by the adversarial_ratio, so we initialize them here
    transform = BinaryTransformWithMask(input_length=2**14, adversarial_ratio=adversarial_ratio)
    victim_dataset = DatasetFolder(root="data/victim", loader=lambda x: open(x, 'rb').read(), extensions=('',), transform=transform)
    
    num_successful = 0
    mask_len_total = 0
    num_total = 0

    for (X, M), y in victim_dataset:
        if victim_dataset.classes[y] == "benign":
            # we only need to attack malware samples
            continue

        malware_idx = victim_dataset.class_to_idx["malware"]
        
        # Optimization is separate for each sample, so we initialize the requires_grad parameters and the optimizer here
        adversary_features = torch.rand(len(M), dtype=torch.float32, requires_grad=True)
        opt = torch.optim.SGD([adversary_features], lr=0.01)

        # PGD
        for t in range(100):
            input_with_adversary = X.clone().unsqueeze(0) # add batch dimension
            input_with_adversary[...,M] += adversary_features
            pred = model(input_with_adversary).squeeze()
            # untargeted = maximizing loss
            loss = -loss_fn(pred, torch.tensor(malware_idx, dtype=torch.float32))
            
            opt.zero_grad()
            loss.backward()
            adversary_features.grad.sign_()
            opt.step()

            # projection with clipping
            adversary_features.data.clamp_(0, 1)

        # Final prediction:
        with torch.no_grad():
            y_pred = model(input_with_adversary).squeeze()
            pred_label = y_pred > 0.5
            if pred_label != y:
                num_successful += 1
            num_total += 1
            mask_len_total += len(M)

    print(f"{adversarial_ratio}:\t{num_successful / num_total:.4f}\t{mask_len_total / num_total:.2f}")

Adversarial accuracy of PGD based on adversarial-to-original byte count ratio.
Ratio	ACC.	Mean. |M|
0.05:	0.2200	690.04
0.1:	0.5600	1340.76
0.15:	0.7400	1937.20
0.2:	0.7600	2476.10
