In [1]:
import numpy as np 
import pandas as pd
from sklearn.neighbors.kde import KernelDensity
from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn

In [2]:
class KDD99Loader(object):
    def __init__(self, data_path, N_train , mode="train"):
        self.mode=mode
        data = np.load(data_path)

        labels = data["kdd"][:,-1]
        features = data["kdd"][:,:-1]
        N, D = features.shape
        
        normal_data = features[labels==1]
        normal_labels = labels[labels==1]

        N_normal = normal_data.shape[0]

        attack_data = features[labels==0]
        attack_labels = labels[labels==0]

        N_attack = attack_data.shape[0]

        randIdx = np.arange(N_attack)
        np.random.shuffle(randIdx)
        self.N_train = N_train
        
        self.train = attack_data[randIdx[:self.N_train]]
        self.train_labels = attack_labels[randIdx[:self.N_train]]
        
        
        self.test = attack_data[randIdx[self.N_train:]]
        self.test_labels = attack_labels[randIdx[self.N_train:]]

        self.test = np.concatenate((self.test, normal_data),axis=0)
        self.test_labels = np.concatenate((self.test_labels, normal_labels),axis=0)
       # self.test = np.concatenate((normal_data, normal_data),axis=0)
       # self.test_labels = np.concatenate((normal_labels, normal_labels),axis=0)

    def __len__(self):
        """
        Number of images in the object dataset.
        """
        if self.mode == "train":
            return self.train.shape[0]
        else:
            return self.test.shape[0]


    def __getitem__(self, index):
        if self.mode == "train":
            return np.float32(self.train[index]), np.float32(self.train_labels[index])
        else:
            return np.float32(self.test[index]), np.float32(self.test_labels[index])  

In [3]:
class VAE(nn.Module):
    def __init__(self):
        super(VAE,self).__init__()
        self.enc_1 = nn.Linear(118,90)
        self.enc_2 = nn.Linear(90,60)
        self.enc_3 = nn.Linear(60,30)
        self.enc = nn.Linear(30,25)
        
        self.act = nn.Tanh()
        self.act_s = nn.Sigmoid()
        self.mu = nn.Linear(25,20)
        self.log_var = nn.Linear(25,20)
        
        self.z = nn.Linear(20,25)
        self.z_3 = nn.Linear(25,30)
        self.z_2 = nn.Linear(30,60)
        self.z_1 = nn.Linear(60,90)
        self.dec = nn.Linear(90,118)
    def reparameterize(self, mu, log_var):
        std = torch.exp(log_var/2)
        eps = torch.randn_like(std)
        return mu + eps * std
    def forward(self,x):
        enc_1 = self.enc_1(x)
        enc_2 = self.act(enc_1)
        enc_2 = self.enc_2(enc_2)
        enc_3 = self.act(enc_2)
        enc_3 = self.enc_3(enc_3)
        enc = self.act(enc_3)
        enc = self.enc(enc_3)
        enc = self.act(enc)
        
        mu = self.mu(enc)
        log_var = self.log_var(enc)
        o = self.reparameterize(mu,log_var)
        z = self.z(o)
        z_3 = self.z_3(z)
        z_2 = self.act(z_3)
        z_2 = self.z_2(z_2)
        z_1 = self.act(z_2)
        z_1 = self.z_1(z_1)
        dec = self.dec(z_1)
        dec = self.act_s(dec)
        return enc_1, enc_2, enc_3, enc, mu, log_var, o, z, z_3, z_2, z_1, dec

In [4]:
def get_loader(data_path, batch_size, N_train, mode='train'):
    """Build and return data loader."""
    
    dataset = KDD99Loader(data_path, N_train, mode)

    shuffle = False
    if mode == 'train':
        shuffle = True

    data_loader = DataLoader(dataset=dataset,
                             batch_size=batch_size,
                             shuffle=shuffle)
    return data_loader

In [5]:
data_path = 'kdd_cup.npz'
iter_per_epoch = 1
batch_size = 1000
learn_rate = 0.0001
All_train = 97278
Ratio = 0.02
N_train = int(All_train * Ratio)

In [6]:
def relative_euclidean_distance(a, b):
    return (a-b).norm(2, dim=1) / a.norm(2, dim=1)

In [7]:
def loss_function(recon_x, x, mu, logvar, enc, z,  enc_1, enc_2, enc_3, z_3, z_2, z_1):
    #BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
    criterion_elementwise_mean = nn.MSELoss(reduction='sum')
    BCE_x = criterion_elementwise_mean(recon_x,x)
    BCE_z = criterion_elementwise_mean(enc,z)
    BCE_z_1 = criterion_elementwise_mean(enc_1,z_1)
    BCE_z_2 = criterion_elementwise_mean(enc_2,z_2)
    BCE_z_3 = criterion_elementwise_mean(enc_3,z_3)
    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    return BCE_x + BCE_z + BCE_z_1 + BCE_z_2 + BCE_z_3 + KLD

In [8]:
vae = VAE()
optimizer = torch.optim.Adam(vae.parameters(),lr=learn_rate)
data_loader_train = get_loader(data_path, batch_size, N_train, mode='train')
for i in range(iter_per_epoch):
    for j ,(input_data, labels)  in enumerate(data_loader_train):
        enc_1, enc_2, enc_3, enc, mu, log_var, o, z, z_3, z_2, z_1, dec = vae(input_data)
        optimizer.zero_grad()
        loss = loss_function(dec, input_data, mu, log_var, enc, z, enc_1, enc_2, enc_3, z_3, z_2, z_1)
        loss.backward()
        optimizer.step()

In [9]:
batch_size = 1000
data_loader_train = get_loader(data_path, batch_size, N_train,mode='train')
train_enc = []
train_labels = []
data_loader_test = get_loader(data_path, batch_size, N_train, mode='test')
test_enc = []
test_labels = []

In [10]:
result = []
diff_quantity_result= []
for i ,(input_data, labels)  in enumerate(data_loader_train):
    enc_1, enc_2, enc_3, enc, mu, log_var, o, z, z_3, z_2, z_1, dec = vae(input_data)
    rec_euclidean = relative_euclidean_distance(input_data, dec)
    rec_cosine = F.cosine_similarity(input_data, dec, dim=1)
    
    enc = torch.cat([enc, rec_euclidean.unsqueeze(-1), rec_cosine.unsqueeze(-1)], dim=1)
    enc = enc.detach().numpy()

    train_enc.append(enc)

    train_labels.append(labels.numpy())
for i ,(input_data, labels)  in enumerate(data_loader_test):
    enc_1, enc_2, enc_3, enc, mu, log_var, o, z, z_3, z_2, z_1, dec = vae(input_data)
    rec_euclidean = relative_euclidean_distance(input_data, dec)
    rec_cosine = F.cosine_similarity(input_data, dec, dim=1)
    
    enc = torch.cat([enc, rec_euclidean.unsqueeze(-1), rec_cosine.unsqueeze(-1)], dim=1)
    enc = enc.detach().numpy()

    test_enc.append(enc)
    
    test_labels.append(labels.numpy())
    
x =train_enc[0] 
kde = KernelDensity(kernel='gaussian', bandwidth=0.0001).fit(x)
score =  kde.score_samples(x)
k = len(test_enc)
test_score = []
for i in range (k):
    score = kde.score_samples(test_enc[i])
    
    test_score.append(score)
test_labels = np.concatenate(test_labels,axis=0)
test_score = np.concatenate(test_score,axis=0)
s = len(test_labels)
c = np.sum(test_labels==1)
g = c/s
thresh = np.percentile(test_score, int(g*100))
pred = (test_score < thresh).astype(int)
gt = test_labels.astype(int)
accuracy = accuracy_score(gt,pred)
precision, recall, f_score, support = prf(gt, pred, average='binary')
temp_result = [accuracy,precision,recall,f_score]
result.append(temp_result)
end_result = np.mean(result,axis=0)
diff_quantity_result.append(end_result)
print(end_result)

[0.98957275 0.99739877 0.98964821 0.99350837]
