# LSTM Training

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4"

In [3]:
torch.manual_seed(78263992)

# specify device depending on availability of GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Helper functions

In [4]:
# wraps a separated dataset to avoid issues while shuffling
class WrapperDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.y)

# get the same batch split for inputs and labels
def get_batched_x_y(x, y, batch_size, shuffle):
    dataset = WrapperDataset(x, y)
    batched = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return batched

## Data loading

In [5]:
X_train = torch.load("X_train.pt").to_dense()
X_val = torch.load("X_val.pt").to_dense()
X_test = torch.load("X_test.pt").to_dense()
y_train = torch.load("y_train.pt")
y_val = torch.load("y_val.pt")
y_test = torch.load("y_test.pt")

In [6]:
def zeroPadInput(inputs, amount=50):
    # adds zero padding vectors to the end of each sequence
    for i in range(len(inputs)):
        zero_padding = torch.zeros(inputs[i].shape[0], amount, inputs[i].shape[2])
        inputs[i] = torch.cat([inputs[i], zero_padding], dim=1)
    return inputs

In [7]:
X_train, X_val, X_test = zeroPadInput([X_train, X_val, X_test])

## Hyperparameters

In [8]:
hidden_size = 128
lstm_layers = 1
epochs = 100
train_batch_size = 256
early_stopping_patience = 7

In [9]:
input_size = X_train.shape[2]
sequence_size = X_train.shape[1]

## Model initialization

In [10]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, lstm_layers, pooling):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, lstm_layers, batch_first=True)
        self.pool = pooling(sequence_size)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        lstm_out = self.lstm(x)[0]
        pool_out = self.pool(lstm_out.permute(0, 2, 1))
        linear_out = self.linear(pool_out.squeeze())
        return torch.squeeze(linear_out)

In [11]:
def predict(x, model):
    myX = x.to(device)
    output = model(myX)
    return output > 0

In [12]:
def initModel(input_size, hidden_size, lstm_layers, pooling):
    model = Net(input_size, hidden_size, lstm_layers, pooling)
    model.to(device)
    return model
model = initModel(input_size, hidden_size, lstm_layers, nn.MaxPool1d)
model

Net(
  (lstm): LSTM(264, 128, batch_first=True)
  (pool): MaxPool1d(kernel_size=150, stride=150, padding=0, dilation=1, ceil_mode=False)
  (linear): Linear(in_features=128, out_features=1, bias=True)
)

## Train and test procedures

In [13]:
def test(x, y, model, batch_size, loss_function):
    averageLoss = 0.0
    averageSensitivity = 0.0
    averageSpecificity = 0.0
    numBatches = 0
    # batch the data set
    batched = get_batched_x_y(x, y, batch_size, False)
    for xi, yi in batched:
        # move data to device
        xi = xi.to(device)
        yi = yi.to(device)
        # forward pass through model
        output = model(xi)
        # calculate current loss of model
        loss = loss_function(output, yi)
        # calculate measures
        predicted = output > 0
        matches = yi == predicted
        sensitivity = matches[yi == 1].sum() / (yi == 1).sum()
        specificity = matches[yi == 0].sum() / (yi == 0).sum()
        # record all values
        averageSensitivity += sensitivity.item()
        averageSpecificity += specificity.item()
        averageLoss += loss.item()
        numBatches += 1
    averageSensitivity /= numBatches
    averageSpecificity /= numBatches
    averageLoss /= numBatches
    measures = (averageSensitivity, averageSpecificity)
    return measures, averageLoss

In [14]:
def recordPerformance(measureList, losses, epoch, earlyStop, x, y, x_eval, y_eval, model, loss_function):
    # take record of current performance
    _, train_loss = test(x, y, model, 1024, loss_function)
    measures, val_loss = test(x_eval, y_eval, model, len(y_eval), loss_function)
    print("Epoch {} Train Loss {:.6f} Val Loss {:.6f} Sensitivity {:.3f} Specificity {:.3f}"
         .format(epoch, train_loss, val_loss, measures[0], measures[1]))
    measureList.append(measures)
    losses.append((train_loss, val_loss))
    earlyStop(val_loss, model)

In [15]:
from pytorchtools import EarlyStopping
def train(x, y, x_eval, y_eval, model, epochs, batch_size, loss_function, optimizer):
    earlyStop = EarlyStopping(patience=early_stopping_patience, verbose=True)
    measureList = []
    losses = []
    recordPerformance(measureList, losses, "-", earlyStop, x, y, x_eval, y_eval, model, loss_function)
    for epoch in range(epochs):
        # shuffle and batch the data set
        batched = get_batched_x_y(x, y, batch_size, True)
        for xi, yi in batched:
            # move data to device
            xi = xi.to(device)
            yi = yi.to(device)
            # reset previous gradients
            optimizer.zero_grad()
            # forward pass through model
            output = model(xi)
            # calculate current loss of model
            loss = loss_function(output, yi)
            # backprop
            loss.backward()
            # take optimization step
            optimizer.step()
        recordPerformance(measureList, losses, epoch, earlyStop, x, y, x_eval, y_eval, model, loss_function)
        
        if earlyStop.early_stop:
            print("Early Stop")
            break
    # load latest checkpoint
    model.load_state_dict(torch.load('checkpoint.pt'))
    
    return measureList, losses

## Stupid equally weighted BCE loss

In [17]:
loss_function = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())

In [18]:
val_measures, losses = train(X_train, y_train, X_val, y_val, model, epochs,
                                         train_batch_size, loss_function, optimizer)

Epoch - Train Loss 0.712373 Val Loss 0.712352 Sensitivity 0.000 Specificity 1.000
Validation loss decreased (inf --> 0.712352).  Saving model ...
Epoch 0 Train Loss 0.114482 Val Loss 0.114853 Sensitivity 1.000 Specificity 0.000
Validation loss decreased (0.712352 --> 0.114853).  Saving model ...
Epoch 1 Train Loss 0.112302 Val Loss 0.112781 Sensitivity 1.000 Specificity 0.000
Validation loss decreased (0.114853 --> 0.112781).  Saving model ...
Epoch 2 Train Loss 0.104136 Val Loss 0.105179 Sensitivity 1.000 Specificity 0.000
Validation loss decreased (0.112781 --> 0.105179).  Saving model ...
Epoch 3 Train Loss 0.085791 Val Loss 0.089165 Sensitivity 1.000 Specificity 0.000
Validation loss decreased (0.105179 --> 0.089165).  Saving model ...
Epoch 4 Train Loss 0.079096 Val Loss 0.083260 Sensitivity 0.998 Specificity 0.176
Validation loss decreased (0.089165 --> 0.083260).  Saving model ...
Epoch 5 Train Loss 0.074630 Val Loss 0.080116 Sensitivity 0.998 Specificity 0.185
Validation loss d

In [19]:
test_measures, test_loss = test(X_test, y_test, model, len(y_test), loss_function)
print("Test Loss {:.6f} Sensitivity {:.3f} Specificity {:.3f}"
         .format(test_loss, test_measures[0], test_measures[1]))

Test Loss 0.032270 Sensitivity 0.999 Specificity 0.676


In [20]:
from sklearn.metrics import confusion_matrix
y_pred = predict(X_test, model).cpu()
confMatrix = confusion_matrix(y_test.numpy(), y_pred.numpy())
pd.DataFrame(confMatrix, index=["True Benign", "True Malicious"], columns=["Predicted Benign", "Predicted Malicious"])

Unnamed: 0,Predicted Benign,Predicted Malicious
True Benign,73,35
True Malicious,6,4274


## Hyperparameter tuning

In [16]:
def getWeightedBCE():
    negative_samples = (y_train == 0).sum()
    positive_samples = (y_train == 1).sum()
    return nn.BCEWithLogitsLoss(pos_weight=negative_samples/positive_samples)

**TODO**

Stuff to try:
- type of pooling layer (max, avg)
- LSTM layers (e.g. 1, 2, 3)
- LSTM hidden state size (e.g. 50, 100, 200, 300)
- LSTM dropout (e.g. 10%, 20%, 50%)
- ?linear layers on top of LSTM (e.g. 1, 2)
- ?Adam parameters (learning_rate, decay ...) or other optimizers

In [36]:
def printPerformances(performances, parameters):
    for perf, param in zip(performances, parameters):
        print("Final {} val loss: {:.6f}".format(param, perf[1][-1 - early_stopping_patience][1]))
        print("Final {} val specificity: {:.3f}".format(param, perf[0][-1 - early_stopping_patience][0]))
        print("Final {} val sensitivity: {:.3f}".format(param, perf[0][-1 - early_stopping_patience][1]))

In [18]:
def tunePoolingType():
    poolingTypes = [nn.MaxPool1d, nn.AvgPool1d]
    performances = []
    for p in poolingTypes:
        torch.manual_seed(78263992)
        model = initModel(input_size, 128, 1, p)
        optimizer = torch.optim.Adam(model.parameters())
        loss_function = getWeightedBCE()
        performance = train(X_train, y_train, X_val, y_val, model, epochs, train_batch_size, loss_function, optimizer)
        performances.append(performance)
    return performances

In [19]:
performances = tunePoolingType()

Epoch - Train Loss 0.034044 Val Loss 0.034111 Sensitivity 0.000 Specificity 1.000
Validation loss decreased (inf --> 0.034111).  Saving model ...
Epoch 0 Train Loss 0.032975 Val Loss 0.033037 Sensitivity 0.002 Specificity 1.000
Validation loss decreased (0.034111 --> 0.033037).  Saving model ...
Epoch 1 Train Loss 0.025963 Val Loss 0.026476 Sensitivity 0.902 Specificity 0.704
Validation loss decreased (0.033037 --> 0.026476).  Saving model ...
Epoch 2 Train Loss 0.016368 Val Loss 0.017926 Sensitivity 0.892 Specificity 0.815
Validation loss decreased (0.026476 --> 0.017926).  Saving model ...
Epoch 3 Train Loss 0.012494 Val Loss 0.013311 Sensitivity 0.908 Specificity 0.898
Validation loss decreased (0.017926 --> 0.013311).  Saving model ...
Epoch 4 Train Loss 0.010277 Val Loss 0.011103 Sensitivity 0.921 Specificity 0.926
Validation loss decreased (0.013311 --> 0.011103).  Saving model ...
Epoch 5 Train Loss 0.008169 Val Loss 0.009580 Sensitivity 0.930 Specificity 0.926
Validation loss d

In [37]:
printPerformances(performances, ["max pool", "avg pool"])

Final max pool val loss: 0.005656
Final max pool val specificity: 0.974
Final max pool val sensitivity: 0.917
Final avg pool val loss: 0.016071
Final avg pool val specificity: 0.843
Final avg pool val sensitivity: 0.861


In [38]:
def tuneLstmLayers():
    lstm_layer_list = [1, 2, 3]
    performances = []
    for l in lstm_layer_list:
        torch.manual_seed(78263992)
        model = initModel(input_size, 128, l, nn.MaxPool1d)
        optimizer = torch.optim.Adam(model.parameters())
        loss_function = getWeightedBCE()
        performance = train(X_train, y_train, X_val, y_val, model, epochs, train_batch_size, loss_function, optimizer)
        performances.append(performance)
    return performances

In [39]:
performances = tuneLstmLayers()

Epoch - Train Loss 0.034044 Val Loss 0.034111 Sensitivity 0.000 Specificity 1.000
Validation loss decreased (inf --> 0.034111).  Saving model ...
Epoch 0 Train Loss 0.032975 Val Loss 0.033037 Sensitivity 0.002 Specificity 1.000
Validation loss decreased (0.034111 --> 0.033037).  Saving model ...
Epoch 1 Train Loss 0.025963 Val Loss 0.026476 Sensitivity 0.902 Specificity 0.704
Validation loss decreased (0.033037 --> 0.026476).  Saving model ...
Epoch 2 Train Loss 0.016368 Val Loss 0.017926 Sensitivity 0.892 Specificity 0.815
Validation loss decreased (0.026476 --> 0.017926).  Saving model ...
Epoch 3 Train Loss 0.012494 Val Loss 0.013311 Sensitivity 0.908 Specificity 0.898
Validation loss decreased (0.017926 --> 0.013311).  Saving model ...
Epoch 4 Train Loss 0.010277 Val Loss 0.011103 Sensitivity 0.921 Specificity 0.926
Validation loss decreased (0.013311 --> 0.011103).  Saving model ...
Epoch 5 Train Loss 0.008169 Val Loss 0.009580 Sensitivity 0.930 Specificity 0.926
Validation loss d

Epoch 13 Train Loss 0.022272 Val Loss 0.024369 Sensitivity 0.714 Specificity 0.880
EarlyStopping counter: 2 out of 7
Epoch 14 Train Loss 0.019317 Val Loss 0.021523 Sensitivity 0.884 Specificity 0.685
Validation loss decreased (0.021928 --> 0.021523).  Saving model ...
Epoch 15 Train Loss 0.019238 Val Loss 0.021504 Sensitivity 0.884 Specificity 0.685
Validation loss decreased (0.021523 --> 0.021504).  Saving model ...
Epoch 16 Train Loss 0.020753 Val Loss 0.022400 Sensitivity 0.669 Specificity 0.907
EarlyStopping counter: 1 out of 7
Epoch 17 Train Loss 0.019404 Val Loss 0.021840 Sensitivity 0.851 Specificity 0.694
EarlyStopping counter: 2 out of 7
Epoch 18 Train Loss 0.020276 Val Loss 0.023091 Sensitivity 0.897 Specificity 0.657
EarlyStopping counter: 3 out of 7
Epoch 19 Train Loss 0.018782 Val Loss 0.021491 Sensitivity 0.885 Specificity 0.685
Validation loss decreased (0.021504 --> 0.021491).  Saving model ...
Epoch 20 Train Loss 0.019058 Val Loss 0.021111 Sensitivity 0.804 Specificity

In [40]:
printPerformances(performances, ["1 layer", "2 layers", "3 layers"])

Final 1 layer val loss: 0.005656
Final 1 layer val specificity: 0.974
Final 1 layer val sensitivity: 0.917
Final 2 layers val loss: 0.006756
Final 2 layers val specificity: 0.953
Final 2 layers val sensitivity: 0.907
Final 3 layers val loss: 0.021111
Final 3 layers val specificity: 0.804
Final 3 layers val sensitivity: 0.815


In [41]:
def tuneLstmStateSize():
    lstm_state_sizes = [64, 128, 256, 512]
    performances = []
    for s in lstm_state_sizes:
        torch.manual_seed(78263992)
        model = initModel(input_size, s, 1, nn.MaxPool1d)
        optimizer = torch.optim.Adam(model.parameters())
        loss_function = getWeightedBCE()
        performance = train(X_train, y_train, X_val, y_val, model, epochs, train_batch_size, loss_function, optimizer)
        performances.append(performance)
    return performances

In [42]:
performances = tuneLstmStateSize()

Epoch - Train Loss 0.034118 Val Loss 0.034178 Sensitivity 0.000 Specificity 1.000
Validation loss decreased (inf --> 0.034178).  Saving model ...
Epoch 0 Train Loss 0.027161 Val Loss 0.028837 Sensitivity 0.827 Specificity 0.602
Validation loss decreased (0.034178 --> 0.028837).  Saving model ...
Epoch 1 Train Loss 0.024150 Val Loss 0.025929 Sensitivity 0.883 Specificity 0.620
Validation loss decreased (0.028837 --> 0.025929).  Saving model ...
Epoch 2 Train Loss 0.020685 Val Loss 0.022713 Sensitivity 0.855 Specificity 0.713
Validation loss decreased (0.025929 --> 0.022713).  Saving model ...
Epoch 3 Train Loss 0.016458 Val Loss 0.018292 Sensitivity 0.848 Specificity 0.796
Validation loss decreased (0.022713 --> 0.018292).  Saving model ...
Epoch 4 Train Loss 0.014846 Val Loss 0.018622 Sensitivity 0.911 Specificity 0.759
EarlyStopping counter: 1 out of 7
Epoch 5 Train Loss 0.012681 Val Loss 0.014597 Sensitivity 0.924 Specificity 0.824
Validation loss decreased (0.018292 --> 0.014597).  

Epoch 10 Train Loss 0.004124 Val Loss 0.007920 Sensitivity 0.966 Specificity 0.917
EarlyStopping counter: 2 out of 7
Epoch 11 Train Loss 0.004431 Val Loss 0.009577 Sensitivity 0.969 Specificity 0.870
EarlyStopping counter: 3 out of 7
Epoch 12 Train Loss 0.004279 Val Loss 0.008750 Sensitivity 0.973 Specificity 0.898
EarlyStopping counter: 4 out of 7
Epoch 13 Train Loss 0.003136 Val Loss 0.007000 Sensitivity 0.968 Specificity 0.935
Validation loss decreased (0.007235 --> 0.007000).  Saving model ...
Epoch 14 Train Loss 0.003074 Val Loss 0.007774 Sensitivity 0.963 Specificity 0.935
EarlyStopping counter: 1 out of 7
Epoch 15 Train Loss 0.002677 Val Loss 0.008030 Sensitivity 0.959 Specificity 0.926
EarlyStopping counter: 2 out of 7
Epoch 16 Train Loss 0.003288 Val Loss 0.006653 Sensitivity 0.969 Specificity 0.944
Validation loss decreased (0.007000 --> 0.006653).  Saving model ...
Epoch 17 Train Loss 0.002584 Val Loss 0.008544 Sensitivity 0.974 Specificity 0.926
EarlyStopping counter: 1 out

In [43]:
printPerformances(performances, [64, 128, 256, 512])

Final 64 val loss: 0.006979
Final 64 val specificity: 0.973
Final 64 val sensitivity: 0.935
Final 128 val loss: 0.005656
Final 128 val specificity: 0.974
Final 128 val sensitivity: 0.917
Final 256 val loss: 0.006653
Final 256 val specificity: 0.969
Final 256 val sensitivity: 0.944
Final 512 val loss: 0.007049
Final 512 val specificity: 0.968
Final 512 val sensitivity: 0.926


## Training

In [17]:
loss_function = getWeightedBCE()
optimizer = torch.optim.Adam(model.parameters())

In [18]:
val_measures, losses = train(X_train, y_train, X_val, y_val, model, epochs,
                                         train_batch_size, loss_function, optimizer)

Epoch - Train Loss 0.034044 Val Loss 0.034111 Sensitivity 0.000 Specificity 1.000
Validation loss decreased (inf --> 0.034111).  Saving model ...
Epoch 0 Train Loss 0.032975 Val Loss 0.033037 Sensitivity 0.002 Specificity 1.000
Validation loss decreased (0.034111 --> 0.033037).  Saving model ...
Epoch 1 Train Loss 0.025963 Val Loss 0.026476 Sensitivity 0.902 Specificity 0.704
Validation loss decreased (0.033037 --> 0.026476).  Saving model ...
Epoch 2 Train Loss 0.016368 Val Loss 0.017926 Sensitivity 0.892 Specificity 0.815
Validation loss decreased (0.026476 --> 0.017926).  Saving model ...
Epoch 3 Train Loss 0.012494 Val Loss 0.013311 Sensitivity 0.908 Specificity 0.898
Validation loss decreased (0.017926 --> 0.013311).  Saving model ...
Epoch 4 Train Loss 0.010277 Val Loss 0.011103 Sensitivity 0.921 Specificity 0.926
Validation loss decreased (0.013311 --> 0.011103).  Saving model ...
Epoch 5 Train Loss 0.008169 Val Loss 0.009580 Sensitivity 0.930 Specificity 0.926
Validation loss d

## Testing

In [19]:
test_measures, test_loss = test(X_test, y_test, model, len(y_test), loss_function)
print("Test Loss {:.6f} Sensitivity {:.3f} Specificity {:.3f}"
         .format(test_loss, test_measures[0], test_measures[1]))

Test Loss 0.005780 Sensitivity 0.976 Specificity 0.935


In [20]:
from sklearn.metrics import confusion_matrix
y_pred = predict(X_test, model).cpu()
confMatrix = confusion_matrix(y_test.numpy(), y_pred.numpy())
pd.DataFrame(confMatrix, index=["True Benign", "True Malicious"], columns=["Predicted Benign", "Predicted Malicious"])

Unnamed: 0,Predicted Benign,Predicted Malicious
True Benign,101,7
True Malicious,102,4178
