In [12]:
import numpy as np 
import pandas as pd
from sklearn.neighbors.kde import KernelDensity
from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from torch.autograd import Variable

In [13]:
class KDD99Loader(object):
    """Indexable train/test split over the ``kdd`` array of an ``.npz`` archive.

    The last column of the array is read as the label and the remaining
    columns as features. Rows with label 0 (``attack_data`` in this code) are
    shuffled; the first ``N_train`` of them form the training set and the rest
    go to the test set. Rows with label 1 (``normal_data``) are shuffled and
    appended to the test set only.

    Parameters
    ----------
    data_path : str
        Path of the ``.npz`` archive containing a ``kdd`` array.
    N_train : int
        Number of shuffled label-0 rows assigned to the training set.
    mode : str
        ``"train"`` serves the training split; any other value serves the
        test split.
    seed : int or None
        When ``None`` (default) the global NumPy RNG drives the shuffles, so
        every instance draws a different split. When given, a private
        ``RandomState(seed)`` is used, so instances built with the same seed
        share exactly the same split (e.g. a train and a test instance).
    """

    # Upper bounds on how many shuffled rows of each label group are kept.
    MAX_ATTACK_ROWS = 24319
    MAX_NORMAL_ROWS = 37278

    def __init__(self, data_path, N_train , mode="train", seed=None):
        self.mode = mode
        self.N_train = N_train

        # Read the array once and close the archive; the NpzFile returned by
        # np.load keeps the underlying file open until it is closed.
        with np.load(data_path) as archive:
            kdd = archive["kdd"]

        labels = kdd[:, -1]
        features = kdd[:, :-1]

        normal_data = features[labels == 1]
        normal_labels = labels[labels == 1]
        attack_data = features[labels == 0]
        attack_labels = labels[labels == 0]

        # Both np.random (module) and RandomState expose .shuffle; the module
        # is used when no seed is given to keep the historical behaviour.
        shuffler = np.random if seed is None else np.random.RandomState(seed)

        # Shuffle order (label-0 rows first, then label-1 rows) is kept so a
        # given RNG state always produces the same pair of permutations.
        randIdx_attack = np.arange(attack_data.shape[0])
        shuffler.shuffle(randIdx_attack)
        randIdx_attack = randIdx_attack[:self.MAX_ATTACK_ROWS]

        randIdx_normal = np.arange(normal_data.shape[0])
        shuffler.shuffle(randIdx_normal)
        randIdx_normal = randIdx_normal[:self.MAX_NORMAL_ROWS]

        train_idx = randIdx_attack[:self.N_train]
        heldout_idx = randIdx_attack[self.N_train:]

        self.train = attack_data[train_idx]
        self.train_labels = attack_labels[train_idx]

        self.normal_data = normal_data[randIdx_normal]
        self.normal_labels = normal_labels[randIdx_normal]

        # Test split = label-0 rows not used for training + all kept label-1 rows.
        self.test = np.concatenate((attack_data[heldout_idx], self.normal_data), axis=0)
        self.test_labels = np.concatenate((attack_labels[heldout_idx], self.normal_labels), axis=0)

    def __len__(self):
        """Return the number of rows in the split selected by ``self.mode``."""
        if self.mode == "train":
            return self.train.shape[0]
        return self.test.shape[0]

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index``, both cast to float32."""
        if self.mode == "train":
            return np.float32(self.train[index]), np.float32(self.train_labels[index])
        return np.float32(self.test[index]), np.float32(self.test_labels[index])

In [14]:
def get_loader(data_path, batch_size, N_train, mode='train'):
    """Create a DataLoader over a freshly constructed KDD99Loader.

    Batches are shuffled only when ``mode`` is ``'train'``; every other mode
    is iterated in dataset order.
    """
    return DataLoader(
        dataset=KDD99Loader(data_path, N_train, mode),
        batch_size=batch_size,
        shuffle=(mode == 'train'),
    )

In [15]:
# Experiment configuration shared by the cells below.

# Archive handed to get_loader / KDD99Loader, which reads its "kdd" array.
data_path = 'kdd_cup.npz'
# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
iter_per_epoch = 1500
# Mini-batch size passed to the training DataLoader.
batch_size = 1000
# Learning rate given to the Adam optimizers.
learn_rate = 0.0001
# Size of the pool training rows are drawn from; same value as the cap
# KDD99Loader applies to the shuffled label-0 row indices.
All_train = 24319
# Fraction of All_train used for training. The experiment cell that follows
# reassigns both Ratio and N_train, so these two act only as defaults.
Ratio = 0.02
N_train = int(All_train * Ratio)

In [None]:
# Sweep over training-set sizes: for each size, train an adversarial
# autoencoder several times, score the test set with a KDE fitted on the
# training encodings, and average accuracy / precision / recall / F-score.

Ratio = 0.01            # training-set size grows by this fraction of All_train per step
Average_cycle = 10      # independent train/evaluate runs averaged for each size
N_RATIO_STEPS = 10      # number of training-set sizes evaluated
N_EPOCHS = 2000         # full passes over the training loader per run
LATENT_DIM = 25         # width of the encoder output / prior sample
KDE_BANDWIDTH = 0.00001


def _zero_grads(*solvers):
    """Clear accumulated gradients of every parameter owned by the given optimizers."""
    for solver in solvers:
        solver.zero_grad()


result = []
diff_quantity_result = []
for ratio_step in range(N_RATIO_STEPS):
    N_train = int(All_train*Ratio*(ratio_step+1))
    result = []
    print(Ratio*(ratio_step+1))
    for run in range(Average_cycle):

        Encoder = nn.Sequential(
            nn.Linear(118, 90),
            nn.Tanh(),
            nn.Linear(90, 60),
            nn.Tanh(),
            nn.Linear(60, LATENT_DIM),
            nn.Tanh(),
        )

        Decoder = nn.Sequential(
            nn.Linear(LATENT_DIM, 60),
            nn.Tanh(),
            nn.Linear(60, 90),
            nn.Tanh(),
            nn.Linear(90, 118),
            nn.Sigmoid(),
        )

        # NOTE(review): the discriminator emits 118 sigmoid outputs and the
        # adversarial losses average over all of them; a single real/fake
        # output is the usual choice — confirm this width is intentional.
        Discriminator = nn.Sequential(
            nn.Linear(LATENT_DIM, 60),
            nn.Tanh(),
            nn.Linear(60, 118),
            nn.Sigmoid(),
        )

        Encoder_solver = optim.Adam(Encoder.parameters(), lr=learn_rate)
        Decoder_solver = optim.Adam(Decoder.parameters(), lr=learn_rate)
        Discriminator_solver = optim.Adam(Discriminator.parameters(), lr=learn_rate)
        all_solvers = (Encoder_solver, Decoder_solver, Discriminator_solver)

        # One dataset per run. KDD99Loader reshuffles on every construction,
        # so the same instance must supply both the training rows and the
        # held-out rows; building separate loaders would mix the splits.
        data_loader_train = get_loader(data_path, batch_size, N_train, mode='train')
        run_dataset = data_loader_train.dataset

        for epoch in range(N_EPOCHS):
            for input_data, _ in data_loader_train:
                # --- Reconstruction phase ---
                z_sample = Encoder(input_data)
                X_sample = Decoder(z_sample)

                recon_loss = F.binary_cross_entropy(X_sample, input_data)

                recon_loss.backward()
                Decoder_solver.step()
                Encoder_solver.step()
                _zero_grads(*all_solvers)

                # --- Regularization phase: discriminator ---
                z_real = torch.randn(len(input_data), LATENT_DIM)
                z_fake = Encoder(input_data)

                D_real = Discriminator(z_real)
                D_fake = Discriminator(z_fake)

                # NOTE(review): log() is taken on raw sigmoid outputs; a
                # saturated output would give log(0) — consider an epsilon.
                D_loss = -torch.mean(torch.log(D_real) + torch.log(1 - D_fake))

                D_loss.backward()
                Discriminator_solver.step()
                # Also discards the gradients D_loss pushed into the encoder.
                _zero_grads(*all_solvers)

                # --- Regularization phase: generator (encoder) ---
                z_fake = Encoder(input_data)
                D_fake = Discriminator(z_fake)

                G_loss = -torch.mean(torch.log(D_fake))

                G_loss.backward()
                Encoder_solver.step()
                _zero_grads(*all_solvers)

        # Encode the exact rows used for training and the matching held-out
        # rows; no gradients are needed for scoring.
        with torch.no_grad():
            train_enc = Encoder(torch.from_numpy(run_dataset.train.astype(np.float32))).numpy()
            test_enc = Encoder(torch.from_numpy(run_dataset.test.astype(np.float32))).numpy()
        test_labels = run_dataset.test_labels.astype(np.float32)

        # Density model over all training encodings; low log-density on the
        # test set is treated as evidence for label 1.
        kde = KernelDensity(kernel='gaussian', bandwidth=KDE_BANDWIDTH).fit(train_enc)
        test_score = kde.score_samples(test_enc)

        # Threshold at the percentile equal to the share of label-1 rows, so
        # the number of rows predicted as 1 tracks the number of true 1s.
        s = len(test_labels)
        c = np.sum(test_labels == 1)
        g = c/s

        thresh = np.percentile(test_score, g*100)
        pred = (test_score < thresh).astype(int)
        gt = test_labels.astype(int)
        accuracy = accuracy_score(gt, pred)
        precision, recall, f_score, support = prf(gt, pred, average='binary')
        temp_result = [accuracy, precision, recall, f_score]
        result.append(temp_result)
    # Mean [accuracy, precision, recall, f_score] over the runs at this size.
    end_result = np.mean(result, axis=0)
    diff_quantity_result.append(end_result)
    print(end_result)