# Adversarial LSTM Training

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [2]:
import os

# Make only the GPU with index 5 visible to CUDA for this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "5"})

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

## Helper functions

In [3]:
# Pairs inputs with their labels so both are indexed (and therefore shuffled) together.
class WrapperDataset(torch.utils.data.Dataset):
    """Map-style dataset over two parallel indexable collections.

    Item ``i`` is the tuple ``(x[i], y[i])``; the dataset length is ``len(y)``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # the label collection defines how many items exist
        return len(self.y)

    def __getitem__(self, index):
        sample = self.x[index]
        label = self.y[index]
        return sample, label

# Batch inputs and labels through one loader so each batch keeps them aligned.
def get_batched_x_y(x, y, batch_size, shuffle):
    """Return a DataLoader that yields ``(x_batch, y_batch)`` pairs.

    Args:
        x: indexable collection of inputs.
        y: indexable collection of labels, parallel to ``x``.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples on every pass.
    """
    return torch.utils.data.DataLoader(
        WrapperDataset(x, y),
        batch_size=batch_size,
        shuffle=shuffle,
    )

## Data loading

In [4]:
# Feature tensors are converted to dense layout right after loading
# (presumably they were saved as sparse tensors -- confirm against the export step).
# NOTE: torch.load unpickles its input; only load files from a trusted source.
X_train, X_val, X_test = (
    torch.load(path).to_dense() for path in ("X_train.pt", "X_val.pt", "X_test.pt")
)
y_train, y_val, y_test = (
    torch.load(path) for path in ("y_train.pt", "y_val.pt", "y_test.pt")
)
whitebox_adv = torch.load("whitebox_adv_val.pt")

In [5]:
def zeroPadInput(inputs, amount=50):
    """Append ``amount`` all-zero vectors to the end of dim 1 of every tensor.

    Args:
        inputs: mutable sequence (e.g. a list) of 3-D tensors; each entry is
            extended along dim 1 and keeps its sizes in dims 0 and 2.
        amount: number of zero vectors appended to each sequence.

    Returns:
        The same ``inputs`` object; its entries are replaced in place by the
        padded tensors.
    """
    for i, tensor in enumerate(inputs):
        # new_zeros inherits dtype and device from the tensor being padded, so the
        # concatenation neither fails on GPU tensors nor changes the dtype
        zero_padding = tensor.new_zeros(tensor.shape[0], amount, tensor.shape[2])
        inputs[i] = torch.cat([tensor, zero_padding], dim=1)
    return inputs

In [6]:
# Pad every split with the default 50 trailing zero vectors. The results are bound
# back to the same names, so running this cell a second time pads the tensors again.
X_train, X_val, X_test = zeroPadInput([X_train, X_val, X_test])

In [7]:
# Join the adversarial tensors into one tensor and hold out 10% of it for validation.
concatenatedSamples = torch.cat(whitebox_adv)
total_adv = len(concatenatedSamples)
val_length = int(0.1 * total_adv)
train_length = total_adv - val_length
# A dedicated generator with a fixed seed keeps the split independent of the global RNG.
split_generator = torch.Generator().manual_seed(78)
whitebox_train, whitebox_val = torch.utils.data.random_split(
    concatenatedSamples, [train_length, val_length], generator=split_generator
)

In [8]:
# number of adversarial samples in the training part of the split
len(whitebox_train)

3852

In [9]:
# number of adversarial samples held out for validation
len(whitebox_val)

428

## Hyperparameters

In [9]:
hidden_size = 128  # LSTM hidden state size, passed to initModel
lstm_layers = 1  # number of LSTM layers, passed to initModel
epochs = 100  # upper bound on training epochs; early stopping may end training sooner
train_batch_size = 256  # batch size for both the clean and the adversarial training batches
early_stopping_patience = 7  # patience value handed to EarlyStopping inside train()

In [10]:
# Model dimensions come from the training tensor: dim 2 is the per-step feature
# width fed to the LSTM, dim 1 is the sequence length used as pooling kernel size.
input_size = X_train.size(2)
sequence_size = X_train.size(1)

## Model initialization

In [11]:
# Seed PyTorch's global RNG before the model is created so its initial weights are repeatable.
torch.manual_seed(78263992)

<torch._C.Generator at 0x7f6bf3b8ae50>

In [12]:
class Net(nn.Module):
    """LSTM, then pooling over the time axis, then a linear layer giving one logit per sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        lstm_layers: number of stacked LSTM layers.
        pooling: 1-D pooling class (e.g. ``nn.MaxPool1d``). It is instantiated with
            the sequence length as its only argument, so that it collapses the
            time axis to length 1.
        sequence_length: length of the input sequences. When left as ``None`` the
            module-level ``sequence_size`` is used.
    """

    def __init__(self, input_size, hidden_size, lstm_layers, pooling, sequence_length=None):
        super(Net, self).__init__()
        if sequence_length is None:
            # fall back to the notebook-level value
            sequence_length = sequence_size
        self.lstm = nn.LSTM(input_size, hidden_size, lstm_layers, batch_first=True)
        self.pool = pooling(sequence_length)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to logits of shape (batch,)."""
        lstm_out = self.lstm(x)[0]  # (batch, seq, hidden)
        # pooling layers work on the last axis, so move time there: (batch, hidden, 1)
        pool_out = self.pool(lstm_out.permute(0, 2, 1))
        # Squeeze only the pooled axis. A bare squeeze() would also drop the batch
        # axis for a batch of one (or the hidden axis when hidden_size is 1).
        linear_out = self.linear(pool_out.squeeze(-1))  # (batch, 1)
        return linear_out.squeeze(-1)

In [13]:
def predict(x, model):
    """Return boolean predictions for ``x``: True wherever the model's output is above 0.

    Args:
        x: input tensor; it is moved to ``device`` before the forward pass.
        model: callable mapping the inputs to one logit per sample.

    Returns:
        Boolean tensor ``model(x) > 0``.
    """
    myX = x.to(device)
    # inference only: without no_grad an autograd graph would be kept for the whole input
    with torch.no_grad():
        output = model(myX)
    return output > 0

In [14]:
def initModel(input_size, hidden_size, lstm_layers, pooling):
    """Build a ``Net`` with the given configuration and move it to ``device``."""
    return Net(input_size, hidden_size, lstm_layers, pooling).to(device)
# Instantiate the network with max pooling; the bare name below displays its layers.
model = initModel(input_size, hidden_size, lstm_layers, nn.MaxPool1d)
model

Net(
  (lstm): LSTM(264, 128, batch_first=True)
  (pool): MaxPool1d(kernel_size=150, stride=150, padding=0, dilation=1, ceil_mode=False)
  (linear): Linear(in_features=128, out_features=1, bias=True)
)

## Train and test procedures

In [15]:
def test(x, y, model, batch_size, loss_function):
    """Evaluate ``model`` on ``(x, y)`` without updating it.

    The model is switched to eval mode and run under ``torch.no_grad()`` for the
    duration of the call; its previous train/eval mode is restored afterwards.

    Args:
        x: inputs, indexable in parallel with ``y``.
        y: labels; entries equal to 1 count as positive, entries equal to 0 as negative.
        model: ``nn.Module`` producing one logit per sample; a logit above 0 is a
            positive prediction.
        batch_size: number of samples per evaluation batch (no shuffling).
        loss_function: callable ``loss_function(output, labels)`` returning a scalar tensor.

    Returns:
        ``((sensitivity, specificity), average_loss)`` where
        * sensitivity = correctly predicted positives / all positives,
        * specificity = correctly predicted negatives / all negatives,
          both computed from counts pooled over the whole data set (``nan`` when
          the data set contains no sample of the respective class),
        * average_loss is the mean of the per-batch loss values (``nan`` when
          there are no batches).
    """
    loss_sum = 0.0
    num_batches = 0
    true_pos = 0
    true_neg = 0
    num_pos = 0
    num_neg = 0

    was_training = model.training
    model.eval()
    try:
        # evaluation needs no gradients; skipping them avoids holding a graph per batch
        with torch.no_grad():
            for xi, yi in get_batched_x_y(x, y, batch_size, False):
                # move data to device
                xi = xi.to(device)
                yi = yi.to(device)
                # forward pass and loss of the current batch
                output = model(xi)
                loss_sum += loss_function(output, yi).item()
                num_batches += 1
                # Pool raw counts instead of per-batch ratios: a batch lacking one
                # class would otherwise contribute 0/0 = nan, and unequal batch
                # sizes would bias the average.
                predicted = output > 0
                positives = yi == 1
                negatives = yi == 0
                true_pos += (predicted & positives).sum().item()
                true_neg += (~predicted & negatives).sum().item()
                num_pos += positives.sum().item()
                num_neg += negatives.sum().item()
    finally:
        model.train(was_training)

    nan = float("nan")
    averageSensitivity = true_pos / num_pos if num_pos else nan
    averageSpecificity = true_neg / num_neg if num_neg else nan
    averageLoss = loss_sum / num_batches if num_batches else nan
    measures = (averageSensitivity, averageSpecificity)
    return measures, averageLoss

In [16]:
def recordPerformance(epoch, earlyStop, x, y, x_eval, y_eval,
                      x_adv, y_adv, x_adv_eval, y_adv_eval, model, loss_function):
    """Print the current clean and adversarial losses and update early stopping.

    Training sets are evaluated in batches of 1024, validation sets as a single
    batch. ``earlyStop`` is then called with the tuple
    ``(clean validation loss, adversarial validation loss)`` and the model.

    Args:
        epoch: label printed at the start of each log line.
        earlyStop: callable invoked as ``earlyStop((val_loss, val_loss_adv), model)``.
        x, y: clean training inputs and labels.
        x_eval, y_eval: clean validation inputs and labels.
        x_adv, y_adv: adversarial training inputs and labels.
        x_adv_eval, y_adv_eval: adversarial validation inputs and labels.
        model: model under evaluation.
        loss_function: loss passed through to ``test``.
    """
    # clean data
    clean_train_loss = test(x, y, model, 1024, loss_function)[1]
    (sensitivity, specificity), clean_val_loss = test(
        x_eval, y_eval, model, len(y_eval), loss_function)
    print("Epoch {} Train Loss {:.6f} Val Loss {:.6f} Sensitivity {:.3f} Specificity {:.3f}"
         .format(epoch, clean_train_loss, clean_val_loss, sensitivity, specificity))

    # adversarial data (only the losses are reported)
    adv_train_loss = test(x_adv, y_adv, model, 1024, loss_function)[1]
    adv_val_loss = test(x_adv_eval, y_adv_eval, model, len(y_adv_eval), loss_function)[1]
    print("Epoch {} Adversarial Train Loss {:.6f} Adversarial Val Loss {:.6f}"
         .format(epoch, adv_train_loss, adv_val_loss))

    earlyStop((clean_val_loss, adv_val_loss), model)

In [17]:
from pytorchtools import EarlyStopping
def train(x, y, x_eval, y_eval, adv_train, adv_eval, model, epochs, batch_size, loss_function, optimizer):
    """Train ``model`` on clean data interleaved with adversarial examples.

    For every clean batch one optimizer step is taken on that batch, followed by
    one optimizer step on a batch of adversarial examples, all of which are
    labelled 1. Performance is recorded once before training and after every
    epoch; training ends after ``epochs`` epochs or as soon as the early-stopping
    object sets ``early_stop``. Finally the weights stored in 'checkpoint.pt' are
    loaded back into ``model``.

    Args:
        x, y: clean training inputs and labels.
        x_eval, y_eval: clean validation inputs and labels.
        adv_train: adversarial training samples (inputs only).
        adv_eval: adversarial validation samples (inputs only).
        model: model to optimize; it is modified in place.
        epochs: maximum number of epochs.
        batch_size: batch size for both the clean and the adversarial loader.
        loss_function: loss called as ``loss_function(output, labels)``.
        optimizer: optimizer over the model's parameters.

    Reads the module-level ``early_stopping_patience`` and ``device``.
    """
    earlyStop = EarlyStopping(patience=early_stopping_patience, verbose=True)
    # iterator over shuffled adversarial batches; it is advanced once per clean batch
    advBatched = iter(torch.utils.data.DataLoader(adv_train, batch_size=batch_size, shuffle=True))
    # every adversarial sample carries the label 1
    adv_train_y = torch.ones(len(adv_train))
    adv_eval_y = torch.ones(len(adv_eval))
    # baseline record before any optimization step ("-" is printed in place of the epoch)
    recordPerformance("-", earlyStop, x, y, x_eval, y_eval, adv_train, adv_train_y,
                          adv_eval, adv_eval_y, model, loss_function)
    for epoch in range(epochs):
        # shuffle and batch the data set
        batched = get_batched_x_y(x, y, batch_size, True)
        for xi, yi in batched:
            # move data to device
            xi = xi.to(device)
            yi = yi.to(device)
            # reset previous gradients
            optimizer.zero_grad()
            # forward pass through model
            output = model(xi)
            # calculate current loss of model
            loss = loss_function(output, yi)
            # backprop
            loss.backward()
            # take optimization step
            optimizer.step()
            
            # optimize on adversarial examples
            try:
                x_adv = next(advBatched)
            except StopIteration:
                # get new random batched dataset if all samples were used up
                advBatched = iter(torch.utils.data.DataLoader(adv_train, batch_size=batch_size, shuffle=True))
                x_adv = next(advBatched)
            
            # labels are created per batch because the last adversarial batch may be smaller
            y_adv = torch.ones(len(x_adv))
            x_adv = x_adv.to(device)
            y_adv = y_adv.to(device)

            # second, separate optimizer step on the adversarial batch
            optimizer.zero_grad()
            output = model(x_adv)
            loss = loss_function(output, y_adv)
            loss.backward()
            optimizer.step()
        recordPerformance(epoch, earlyStop, x, y, x_eval, y_eval, adv_train, adv_train_y,
                          adv_eval, adv_eval_y, model, loss_function)

        # should we stop training?
        if earlyStop.early_stop:
            print("Early Stop")
            break

    # Restore the weights from 'checkpoint.pt'.
    # NOTE(review): presumably this is the file EarlyStopping writes when the
    # validation loss improves -- confirm the path against pytorchtools.
    model.load_state_dict(torch.load('checkpoint.pt'))

## Training

In [18]:
def getWeightedBCE():
    """Build a BCE-with-logits loss whose positive class is reweighted by class balance.

    The positive weight is
    ``count(y_train == 0) / (count(y_train == 1) + len(y_train))``.
    The extra ``len(y_train)`` term in the denominator accounts for the
    adversarial examples that are trained on as additional positives.

    Reads the module-level ``y_train``.
    """
    num_negative = (y_train == 0).sum()
    # add amount due to adversarial examples
    num_positive = (y_train == 1).sum() + len(y_train)
    weight = num_negative / num_positive
    return nn.BCEWithLogitsLoss(pos_weight=weight)

In [19]:
# Class-weighted loss and an Adam optimizer with default settings over all model parameters.
loss_function = getWeightedBCE()
optimizer = torch.optim.Adam(params=model.parameters())

In [20]:
# Run adversarial training; progress is printed once before training and after every epoch.
train(
    x=X_train, y=y_train,
    x_eval=X_val, y_eval=y_val,
    adv_train=whitebox_train, adv_eval=whitebox_val,
    model=model, epochs=epochs, batch_size=train_batch_size,
    loss_function=loss_function, optimizer=optimizer,
)

Epoch - Train Loss 0.025165 Val Loss 0.025233 Sensitivity 0.000 Specificity 1.000
Epoch - Adversarial Train Loss 0.008895 Adversarial Val Loss 0.008893
Validation loss decreased (inf --> (0.025232532992959023, 0.008893392980098724)).  Saving model ...
Epoch 0 Train Loss 0.023680 Val Loss 0.023850 Sensitivity 0.914 Specificity 0.583
Epoch 0 Adversarial Train Loss 0.007357 Adversarial Val Loss 0.007378
Validation loss decreased ((0.025232532992959023, 0.008893392980098724) --> (0.02384982258081436, 0.007378242444247007)).  Saving model ...
Epoch 1 Train Loss 0.016348 Val Loss 0.017351 Sensitivity 0.904 Specificity 0.602
Epoch 1 Adversarial Train Loss 0.002183 Adversarial Val Loss 0.002155
Validation loss decreased ((0.02384982258081436, 0.007378242444247007) --> (0.017351249232888222, 0.002154763787984848)).  Saving model ...
Epoch 2 Train Loss 0.013656 Val Loss 0.014212 Sensitivity 0.860 Specificity 0.796
Epoch 2 Adversarial Train Loss 0.002178 Adversarial Val Loss 0.002100
Validation l

## Testing

In [21]:
# Evaluate on the held-out test set, processed as a single batch.
test_measures, test_loss = test(X_test, y_test, model, len(y_test), loss_function)
test_sensitivity, test_specificity = test_measures
print("Test Loss {:.6f} Sensitivity {:.3f} Specificity {:.3f}"
         .format(test_loss, test_sensitivity, test_specificity))

Test Loss 0.006544 Sensitivity 0.957 Specificity 0.935


In [22]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test set: rows are the true classes, columns the predictions.
y_pred = predict(X_test, model).cpu()
confMatrix = confusion_matrix(y_test.numpy(), y_pred.numpy())
pd.DataFrame(
    confMatrix,
    index=["True Benign", "True Malicious"],
    columns=["Predicted Benign", "Predicted Malicious"],
)

Unnamed: 0,Predicted Benign,Predicted Malicious
True Benign,101,7
True Malicious,184,4096
