In [1]:
import numpy as np 
import pandas as pd
from sklearn.neighbors.kde import KernelDensity
from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn

In [2]:
class KDD99Loader(object):
    """Map-style dataset over the ``"kdd"`` array stored in an ``.npz`` archive.

    The last column of the array is read as the label and the remaining
    columns as the features.  ``N_train`` randomly chosen label-0 rows form the
    training split; every remaining label-0 row followed by every label-1 row
    forms the test split.  ``mode`` selects which split ``__len__`` and
    ``__getitem__`` expose.  Both splits are always available as the
    attributes ``train`` / ``train_labels`` and ``test`` / ``test_labels``.

    Note on naming: this class calls label-1 rows ``normal_*`` and label-0
    rows ``attack_*``; only label-0 rows are ever used for training.

    Args:
        data_path: Path of the ``.npz`` archive containing a ``"kdd"`` array.
        N_train: Number of label-0 rows placed in the training split.
        mode: ``"train"`` exposes the training split; any other value exposes
            the test split.
        seed: Optional seed for the shuffle that picks the training rows.
            With the default ``None`` the global NumPy random state is used,
            so every construction draws a *different* split.  Pass the same
            seed to two instances to obtain identical, complementary splits.
    """

    def __init__(self, data_path, N_train, mode="train", seed=None):
        self.mode = mode
        self.N_train = N_train

        # An .npz archive is opened lazily and keeps a file handle; the context
        # manager closes it, and the array is materialised exactly once.
        with np.load(data_path) as archive:
            data = archive["kdd"]

        labels = data[:, -1]
        features = data[:, :-1]

        normal_mask = labels == 1
        attack_mask = labels == 0

        normal_data = features[normal_mask]
        normal_labels = labels[normal_mask]
        attack_data = features[attack_mask]
        attack_labels = labels[attack_mask]

        # A private generator is used only when a seed is given, so the
        # unseeded behaviour (global NumPy state) is unchanged.
        rng = np.random if seed is None else np.random.RandomState(seed)
        randIdx = np.arange(attack_data.shape[0])
        rng.shuffle(randIdx)

        train_idx = randIdx[:self.N_train]
        held_out_idx = randIdx[self.N_train:]

        self.train = attack_data[train_idx]
        self.train_labels = attack_labels[train_idx]

        # Test split = label-0 rows not used for training, then all label-1 rows.
        self.test = np.concatenate((attack_data[held_out_idx], normal_data), axis=0)
        self.test_labels = np.concatenate((attack_labels[held_out_idx], normal_labels), axis=0)

    def __len__(self):
        """Return the number of rows in the split selected by ``mode``."""
        if self.mode == "train":
            return self.train.shape[0]
        return self.test.shape[0]

    def __getitem__(self, index):
        """Return ``(features, label)`` for row ``index``, both cast to float32."""
        if self.mode == "train":
            return np.float32(self.train[index]), np.float32(self.train_labels[index])
        return np.float32(self.test[index]), np.float32(self.test_labels[index])

In [3]:
class VAE(nn.Module):
    """Fully connected variational auto-encoder with Tanh hidden activations.

    With the default arguments the network is
    ``118 -> 90 -> 60 -> 25 -> (mu, log_var: 20) -> 25 -> 60 -> 90 -> 118``,
    and the decoder ends in a Sigmoid, so reconstructions lie in ``[0, 1]``.
    # NOTE(review): the Sigmoid output presumably matches features scaled to
    # [0, 1] — confirm against the data preprocessing.

    Args:
        input_dim: Number of input (and reconstructed) features.
        hidden_dims: Widths of the encoder's hidden layers, in order; the
            decoder mirrors them in reverse.  Must contain at least one width.
        latent_dim: Size of the latent code ``z``.

    Raises:
        ValueError: If ``hidden_dims`` is empty.
    """

    def __init__(self, input_dim=118, hidden_dims=(90, 60, 25), latent_dim=20):
        super(VAE, self).__init__()

        hidden_dims = tuple(hidden_dims)
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer width")

        # Sub-modules are created in the order encoder -> mu -> log_var ->
        # decoder so parameter names and initialisation order stay stable.
        encoder_layers = []
        width = input_dim
        for hidden in hidden_dims:
            encoder_layers.append(nn.Linear(width, hidden))
            encoder_layers.append(nn.Tanh())
            width = hidden
        self.encoder = nn.Sequential(*encoder_layers)

        # Two heads on the last hidden layer parameterise q(z|x).
        self.mu = nn.Linear(width, latent_dim)
        self.log_var = nn.Linear(width, latent_dim)

        decoder_layers = []
        width = latent_dim
        for hidden in reversed(hidden_dims):
            decoder_layers.append(nn.Linear(width, hidden))
            decoder_layers.append(nn.Tanh())
            width = hidden
        decoder_layers.append(nn.Linear(width, input_dim))
        decoder_layers.append(nn.Sigmoid())
        self.decoder = nn.Sequential(*decoder_layers)

    def reparameterize(self, mu, log_var):
        """Sample ``z = mu + eps * sigma`` with ``eps ~ N(0, I)``.

        ``sigma`` is recovered from the log-variance as ``exp(log_var / 2)``.
        """
        std = torch.exp(log_var / 2)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Encode ``x``, sample a latent code and decode it.

        Returns:
            Tuple ``(enc, dec, mu, log_var, z)``: the last encoder activation,
            the reconstruction, the two latent heads and the sampled code.
        """
        enc = self.encoder(x)
        mu = self.mu(enc)
        log_var = self.log_var(enc)
        z = self.reparameterize(mu, log_var)
        dec = self.decoder(z)
        return enc, dec, mu, log_var, z

In [4]:
def get_loader(data_path, batch_size, N_train, mode='train'):
    """Create a ``DataLoader`` over a freshly constructed ``KDD99Loader``.

    Args:
        data_path: Archive path forwarded to ``KDD99Loader``.
        batch_size: Number of rows per batch.
        N_train: Training-split size forwarded to ``KDD99Loader``.
        mode: Split selector forwarded to ``KDD99Loader``; batches are
            shuffled only when it equals ``'train'``.

    Returns:
        The configured ``DataLoader``.  Each call builds a new dataset object.
    """
    return DataLoader(
        dataset=KDD99Loader(data_path, N_train, mode),
        batch_size=batch_size,
        shuffle=(mode == 'train'),
    )

In [5]:
# ---- Experiment configuration ----------------------------------------------
# Data
data_path = 'kdd_cup.npz'   # .npz archive handed to the data loader
All_train = 97278           # presumably the size of the pool the training fraction refers to — TODO confirm
Ratio = 0.02                # fraction of All_train used as the training-set size

# Optimisation
batch_size = 1000           # mini-batch size during training
learn_rate = 1e-4           # Adam learning rate
iter_per_epoch = 1500       # number of outer training-loop iterations

# Derived: absolute number of training rows (truncated towards zero).
N_train = int(Ratio * All_train)

In [6]:
def loss_function(recon_x, x, mu, logvar):
    """Return the VAE objective: summed squared reconstruction error plus KL term.

    Args:
        recon_x: Reconstruction produced by the decoder.
        x: Original input, same shape as ``recon_x``.
        mu: Mean of the approximate posterior.
        logvar: Log-variance of the approximate posterior.

    Returns:
        Scalar tensor ``sum((recon_x - x)^2) + KL(q(z|x) || N(0, I))``.
    """
    # Reconstruction term: squared error summed over every element (not a BCE).
    reconstruction_error = F.mse_loss(recon_x, x, reduction='sum')

    # Closed-form KL divergence to a standard normal prior, see Appendix B of
    # Kingma and Welling, "Auto-Encoding Variational Bayes", ICLR 2014
    # (https://arxiv.org/abs/1312.6114):
    #   KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_inner = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.sum(kl_inner)

    return reconstruction_error + kl_divergence

# Average the results over 20 runs

In [None]:
# Experiment: repeat train -> encode -> KDE-score -> threshold `Average_cycle`
# times, each time with a fresh VAE and a fresh random split, then average
# [accuracy, precision, recall, f-score] over the repetitions.
# NOTE(review): the trial body below is duplicated in the training-size sweep
# cell that follows; consider factoring it into a shared function.
Average_cycle = 20
result = []
Ratio = 0.02
N_train = int(All_train * Ratio)

for cycle in range(Average_cycle):
    vae = VAE()
    optimizer = torch.optim.Adam(vae.parameters(), lr=learn_rate)

    batch_size = 1000
    data_loader_train = get_loader(data_path, batch_size, N_train, mode='train')
    # KDD99Loader reshuffles every time it is constructed.  Building further
    # loaders for evaluation would therefore score a different split than the
    # one trained on and leak training rows into the test set, so the arrays
    # of this one dataset object are reused for everything below.
    split = data_loader_train.dataset

    # Each outer iteration is one full pass over the training loader.
    for epoch in range(iter_per_epoch):
        for input_data, labels in data_loader_train:
            optimizer.zero_grad()
            enc, dec, mu, log_var, z = vae(input_data)
            loss = loss_function(dec, input_data, mu, log_var)
            loss.backward()
            optimizer.step()

    # Evaluation: encode both splits in chunks of `batch_size` rows.
    batch_size = 100000
    train_x = torch.from_numpy(split.train.astype(np.float32))
    test_x = torch.from_numpy(split.test.astype(np.float32))
    test_labels = split.test_labels.astype(np.float32)

    # Only the deterministic encoder output is needed; no gradients required.
    with torch.no_grad():
        train_enc = [vae.encoder(chunk).numpy() for chunk in torch.split(train_x, batch_size)]
        test_enc = [vae.encoder(chunk).numpy() for chunk in torch.split(test_x, batch_size)]

    # Fit the density model on *all* training encodings, not just the first chunk.
    x = np.concatenate(train_enc, axis=0)
    kde = KernelDensity(kernel='gaussian', bandwidth=0.00001).fit(x)
    test_score = np.concatenate([kde.score_samples(chunk) for chunk in test_enc], axis=0)

    # The threshold is the percentile matching the fraction of label-1 rows in
    # the test split; rows scoring below it are predicted as label 1.
    n_test = len(test_labels)
    n_positive = np.sum(test_labels == 1)
    positive_fraction = n_positive / n_test

    thresh = np.percentile(test_score, positive_fraction * 100)
    pred = (test_score < thresh).astype(int)
    gt = test_labels.astype(int)
    accuracy = accuracy_score(gt, pred)
    precision, recall, f_score, support = prf(gt, pred, average='binary')
    result.append([accuracy, precision, recall, f_score])

end_result = np.mean(result, axis=0)
print (end_result)

In [None]:
# Experiment: sweep the training-set size over 1x .. 10x `Ratio` of All_train.
# For every size, run `Average_cycle` independent trials (fresh VAE, fresh
# random split) and store the mean [accuracy, precision, recall, f-score] in
# `diff_quantity_result`.
# NOTE(review): the trial body duplicates the averaged-experiment cell above;
# consider factoring it into a shared function.
Ratio = 0.01
Average_cycle = 1
diff_quantity_result = []

for step in range(10):
    N_train = int(All_train*Ratio*(step+1))
    result = []
    print(Ratio*(step+1))

    for cycle in range(Average_cycle):
        vae = VAE()
        optimizer = torch.optim.Adam(vae.parameters(), lr=learn_rate)

        batch_size = 1000
        data_loader_train = get_loader(data_path, batch_size, N_train, mode='train')
        # KDD99Loader reshuffles every time it is constructed.  Building
        # further loaders for evaluation would score a different split than
        # the one trained on and leak training rows into the test set, so the
        # arrays of this one dataset object are reused for everything below.
        split = data_loader_train.dataset

        # Each outer iteration is one full pass over the training loader.
        for epoch in range(iter_per_epoch):
            for input_data, labels in data_loader_train:
                optimizer.zero_grad()
                enc, dec, mu, log_var, z = vae(input_data)
                loss = loss_function(dec, input_data, mu, log_var)
                loss.backward()
                optimizer.step()

        # Evaluation: encode both splits in chunks of `batch_size` rows.
        batch_size = 100000
        train_x = torch.from_numpy(split.train.astype(np.float32))
        test_x = torch.from_numpy(split.test.astype(np.float32))
        test_labels = split.test_labels.astype(np.float32)

        # Only the deterministic encoder output is needed; no gradients required.
        with torch.no_grad():
            train_enc = [vae.encoder(chunk).numpy() for chunk in torch.split(train_x, batch_size)]
            test_enc = [vae.encoder(chunk).numpy() for chunk in torch.split(test_x, batch_size)]

        # Fit the density model on *all* training encodings, not just the
        # first chunk (they differ once N_train exceeds the chunk size).
        x = np.concatenate(train_enc, axis=0)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.00001).fit(x)
        test_score = np.concatenate([kde.score_samples(chunk) for chunk in test_enc], axis=0)

        # The threshold is the percentile matching the fraction of label-1
        # rows in the test split; rows scoring below it are predicted as 1.
        n_test = len(test_labels)
        n_positive = np.sum(test_labels == 1)
        positive_fraction = n_positive / n_test

        thresh = np.percentile(test_score, positive_fraction * 100)
        pred = (test_score < thresh).astype(int)
        gt = test_labels.astype(int)
        accuracy = accuracy_score(gt, pred)
        precision, recall, f_score, support = prf(gt, pred, average='binary')
        result.append([accuracy, precision, recall, f_score])

    end_result = np.mean(result, axis=0)
    diff_quantity_result.append(end_result)
    print(end_result)