In [None]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, TensorDataset

In [None]:
# Relative path of the folder holding the training files.
training_path = '../../ADFA-LD-Dataset/ADFA-LD/Training_Data_Master'
# Window length in system calls; read again later when sizing the encoding.
sequence_length = 10
# Module-level placeholder lists; load_data works on its own local lists.
data, unique = [], []

def load_data(folder_path, sequence_length=10, batch_size=64, shuffle=True, vocab_size=169):
    """Build a DataLoader of fixed-length system-call windows from a folder.

    Every regular file directly inside ``folder_path`` is read as
    whitespace-separated integers. Each file contributes all contiguous
    windows of ``sequence_length`` calls (stride 1); files shorter than one
    window contribute none, and windows never span two files. Raw call
    numbers are then replaced by dense indices ``0..k-1`` assigned in
    ascending order of the call numbers seen in this folder.

    NOTE(review): the index mapping is derived from the folder being loaded,
    so two folders containing different sets of calls get different
    mappings. Confirm this is acceptable when a model trained on one folder
    is evaluated on another.

    Args:
        folder_path: Directory containing the trace files.
        sequence_length: Number of calls per window.
        batch_size: Batch size of the returned DataLoader.
        shuffle: Whether the DataLoader reshuffles the windows each epoch.
        vocab_size: Upper bound on the number of distinct calls; returned
            unchanged so the caller can size an embedding table with it.

    Returns:
        ``(dataloader, vocab_size)`` where each batch of ``dataloader`` is a
        one-element list holding an int64 tensor of shape
        ``[batch, sequence_length]``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1, if the folder
            yields no complete window, or if it contains more than
            ``vocab_size`` distinct calls.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    windows = []
    seen_calls = set()
    # Sorted so the order of windows does not depend on the filesystem.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as file:
            calls = [int(token) for token in file.read().split()]
        seen_calls.update(calls)
        # A negative range bound yields nothing, so short files are skipped.
        windows.extend(
            calls[start:start + sequence_length]
            for start in range(len(calls) - sequence_length + 1)
        )

    # Sorting makes the call -> index assignment independent of set ordering.
    mapping = {sys_call: index for index, sys_call in enumerate(sorted(seen_calls))}
    if len(mapping) > vocab_size:
        # Indices >= vocab_size would otherwise fail later inside nn.Embedding.
        raise ValueError(
            f"found {len(mapping)} distinct system calls in {folder_path!r}, "
            f"more than vocab_size={vocab_size}"
        )
    if not windows:
        raise ValueError(
            f"no sequences of length {sequence_length} found in {folder_path!r}"
        )

    tensor = torch.tensor(
        [[mapping[sys_call] for sys_call in window] for window in windows],
        dtype=torch.long,
    )
    dataset = TensorDataset(tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader, vocab_size

# Build the training DataLoader; num_unique_syscalls is later passed to the
# Autoencoder as the size of its embedding table.
dataloader, num_unique_syscalls = load_data(training_path)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder over embedded system-call windows.

    A window of ``sequence_length`` call indices is embedded, flattened to a
    single vector of ``sequence_length * embedding_dim`` values, compressed
    by the encoder to ``encoding_dim`` values and expanded back by the
    decoder to the flattened size.

    Args:
        num_system_calls: Number of rows in the embedding table; every input
            index must be smaller than this.
        embedding_dim: Width of each call embedding.
        encoding_dim: Width of the bottleneck produced by the encoder.
        hidden_dim: Width of the outer hidden layer; the inner hidden layer
            uses half of it (rounded down).
        sequence_length: Number of calls per input window.
    """

    def __init__(self, num_system_calls, embedding_dim, encoding_dim, hidden_dim = 8, sequence_length = 10):
        super(Autoencoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length

        flat_dim = sequence_length * embedding_dim
        half_hidden = hidden_dim // 2

        self.embedding = nn.Embedding(num_system_calls, embedding_dim)

        # Encoder: flattened embeddings -> hidden -> hidden/2 -> bottleneck
        self.encoder = nn.Sequential(
            nn.Linear(flat_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Linear(half_hidden, encoding_dim)
        )

        # Decoder: mirror of the encoder, ending at the flattened input size
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, half_hidden),
            nn.ReLU(),
            nn.Linear(half_hidden, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, flat_dim)
        )

    def forward(self, x):
        """Reconstruct the embedded form of a batch of windows.

        Args:
            x: Integer tensor of shape ``[batch, sequence_length]``.

        Returns:
            Float tensor of shape ``[batch, sequence_length, embedding_dim]``.
            Flattening it with ``.view(batch, -1)`` gives the decoder output
            row for each sample.
        """
        batch_size = x.size(0)
        embedded = self.embedding(x).view(batch_size, -1)
        decoded = self.decoder(self.encoder(embedded))
        # Keep the batch axis first so rows of different samples never mix.
        return decoded.view(batch_size, self.sequence_length, self.embedding_dim)

In [None]:
from math import ceil

# Embedding width: fourth root of the vocabulary size, rounded up.
embedding_dim = ceil(num_unique_syscalls ** 0.25)
# Bottleneck width: one third of the flattened (window x embedding) size, rounded.
encoding_dim = round(sequence_length * embedding_dim / 3)
# Built here for inspection; the training cell constructs a fresh instance again.
autoencoder = Autoencoder(
    num_system_calls=num_unique_syscalls,
    embedding_dim=embedding_dim,
    encoding_dim=encoding_dim,
)


In [None]:
# Train the autoencoder to reproduce the flattened embeddings of each window.
# Re-running this cell starts again from freshly initialised weights.
autoencoder = Autoencoder(
    num_system_calls=num_unique_syscalls,
    embedding_dim=embedding_dim,
    encoding_dim=encoding_dim,
)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)
criterion = nn.MSELoss()
loss_history = []  # one entry per batch, accumulated across all epochs
loke = 0  # one-shot debug flag: becomes 1 once the first batch has been printed

num_epochs = 8
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()

        inputs = batch[0]
        # Reconstruction target: the model's own embedding of the window.
        # NOTE(review): the target is not detached, so the loss also back-propagates
        # into the embedding through the target — confirm this is intended.
        embedded_inputs = autoencoder.embedding(inputs).view(inputs.size(0), -1)
        outputs = autoencoder(inputs).view(inputs.size(0), -1)

        if not loke:
            print(embedded_inputs)
            print(outputs)
            loke += 1

        loss = criterion(outputs, embedded_inputs)
        loss.backward()
        optimizer.step()

        loss_history.append(loss.item())
    # Reports the loss of the last batch of the epoch, not an epoch average.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.8f}')


In [None]:
# Per-batch training loss, in the order the batches were processed
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(loss_history)
ax.set_xlabel('Batch')
ax.set_ylabel('Loss')
plt.show()


In [None]:
# Score a second folder of sequences with the trained autoencoder and report
# the mean per-batch reconstruction loss.
# NOTE(review): the names below say "attack", but the path points at
# Validation_Data_Master — confirm which split is meant to be scored here.
# NOTE(review): load_data derives its index mapping from the folder it is
# given, so an index here may not denote the same system call as during
# training — confirm.
attack_path = '../../ADFA-LD-Dataset/ADFA-LD/Validation_Data_Master' 
attack_data, num_unique_syscalls = load_data(attack_path)
print(num_unique_syscalls)
# Replaces the training loss history collected by the training cell.
loss_history = []

# eval() changes nothing for the Embedding/Linear/ReLU layers used here, but
# keeps this cell correct if dropout or batch-norm layers are added later.
autoencoder.eval()
# No gradients are needed for scoring, so the whole forward pass runs under
# no_grad instead of only the loss computation.
with torch.no_grad():
    for batch in attack_data:
        inputs = batch[0]
        embedded_inputs = autoencoder.embedding(inputs).view(inputs.size(0), -1)
        outputs = autoencoder(inputs).view(inputs.size(0), -1)
        loss = criterion(outputs, embedded_inputs)
        loss_history.append(loss.item())

print(np.mean(np.array(loss_history)))

plt.plot(loss_history)
plt.xlabel('Batch')
plt.ylabel('Loss')
plt.show()


