In [1]:
import os
import pickle
import gc
import sys
import argparse
import numpy as np

import torch
from utils import *
import pandas as pd

from minisom import MiniSom
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader

from som_dagmm.model import DAGMM, SOM_DAGMM
from som_dagmm.compression_network import CompressionNetwork
from som_dagmm.estimation_network import EstimationNetwork
from som_dagmm.gmm import GMM, Mixture

from sklearn.model_selection import train_test_split

from SOM import som_train, som_pred

In [2]:
epochs = 50

# epochs = 100
batch_size = 1024
save_path = 'IDS2018_numerical_label_encode'
dataset = 'IDS2018'
features = 'numerical'
embed = 'label_encode'

In [3]:
if dataset == 'IDS2018':
    # data = load_data('data/CSE-CIC-IDS2018-5P/allN-500k.csv')
    data = load_data('data/CSE-CIC-IDS2018/CSE-CIC-IDS2018-25k.csv')

    dataB = data[(data['Label'] ==  "Benign")] #Pegando somente os benignos
    dataM = data[(data['Label'] !=  "Benign")]
    categorical_cols = []
    YB = get_labels(dataB, dataset)
    YM = get_labels(dataM, dataset)

if dataset == 'kdd':
    names = [i for i in range(0,43)] # Qtd de colunas, cada coluna está representada entre 0 a 42
    data = load_data('data/NSL-KDD/KDDTrain+.txt', names)

    data = data[(data[41] ==  "normal")]

    categorical_cols = [1,2,3] # Somente as colunas 1,2,3
    Y = get_labels(data, dataset)
    categorical_cols = [1,2,3] # Somente as colunas 1,2,3
    Y = get_labels(data, dataset)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(['Label'], axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(['Label'], axis = 1, inplace=True)


In [4]:
save_dataB = dataB
save_dataM = dataM

In [5]:
dataB = save_dataB
dataM = save_dataM

In [6]:
data = save_dataM
Y = YM

In [7]:
#Select features
if features == "categorical":
    data = data[categorical_cols]
    dataB = dataB[categorical_cols]
if features == "numerical":
    data = remove_cols(data, categorical_cols)
    dataB = remove_cols(dataB, categorical_cols)

#encode categorical variables
if embed == 'one_hot':
    data = one_hot_encoding(data, categorical_cols)
    dataB = one_hot_encoding(dataB, categorical_cols)
if embed == 'label_encode':
    data = label_encoding(data, categorical_cols)
    dataB = one_hot_encoding(dataB, categorical_cols)

In [8]:
#Definindo uma seed para serpação dos dados
#Podemos validar o modelo
seed = 123
np.random.seed(seed)

num_samples = int(0.01 * len(data))

# Gerando índices aleatórios
random_indices = np.random.choice(len(data), num_samples, replace=False)

In [9]:
#data = data.iloc[random_indices]
# Y = Y[random_indices]

print(data.shape[0])
print(len(Y))

18500
18500


In [10]:
# Remove columns with NA values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
dataB.replace([np.inf, -np.inf], np.nan, inplace=True)

data = fill_na(data)
dataB = fill_na(dataB)

# normalize data
data = normalize_cols(data)
dataB = normalize_cols(dataB)

In [11]:
#test and train split
train_data, test_data, Y_train, Y_test = split_data(data, Y, 0.5)

In [12]:
test_data = pd.concat([test_data, dataB], ignore_index=True)

In [13]:
Y_test = np.concatenate([Y_test, YB])

In [14]:
train_data = train_data.drop(train_data.index[-1])
# Y_test = Y_test[:-1]

print(len(train_data))
print(len(Y_train))
print(len(test_data))
print(len(Y_test))

9250
9250
15749
15749


In [15]:
#Convert to torch tensors
dataX = torch.tensor(data.values.astype(np.float32))
train_dataT = torch.tensor(train_data.values.astype(np.float32))
test_dataT = torch.tensor(test_data.values.astype(np.float32))

#Convert tensor to TensorDataset class.
dataset = TensorDataset(train_dataT)

#TrainLoader
dataloader = DataLoader(train_dataT, batch_size= batch_size, shuffle=True)

In [16]:
compression = CompressionNetwork(dataX.shape[1])
estimation = EstimationNetwork()
gmm = GMM(2,6)
mix = Mixture(6)
dagmm = DAGMM(compression, estimation, gmm)
net = SOM_DAGMM(dagmm)
optimizer =  optim.Adam(net.parameters(), lr=1e-4)
for epoch in range(epochs):
    print('EPOCH {}:'.format(epoch + 1))
    running_loss = 0
    for i, data in enumerate(dataloader):
        out = net(data)
        optimizer.zero_grad()
        L_loss = compression.reconstruction_loss(data[0])
        G_loss = mix.gmm_loss(out=out, L1=0.1, L2=0.005)
        loss = L_loss + G_loss
        
        if torch.isfinite(loss):
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            print(loss.item())
        else:
            print(f"Error: {loss.item()}")
        
    print(running_loss)
torch.save(net, save_path)

EPOCH 1:
SOM training started:


-1.3706756830215454
SOM training started:
-1.3049254417419434
SOM training started:
-1.5039021968841553
SOM training started:
Error: -inf
SOM training started:
-1.3272876739501953
SOM training started:
-1.383769154548645
SOM training started:
-1.3828338384628296
SOM training started:
-1.4185446500778198
SOM training started:
-1.3112006187438965
SOM training started:
-1.3828545808792114
-12.385993838310242
EPOCH 2:
SOM training started:
-1.4040403366088867
SOM training started:
-1.451389193534851
SOM training started:
-1.3883082866668701
SOM training started:
Error: nan
SOM training started:
-1.3868887424468994
SOM training started:
-1.3461500406265259
SOM training started:
-1.2540020942687988
SOM training started:
-1.3876397609710693
SOM training started:
-1.4059818983078003
SOM training started:
-1.4487940073013306
-12.473194360733032
EPOCH 3:
SOM training started:
-1.442582130432129
SOM training started:
Error: -inf
SOM training started:
-1.3439428806304932
SOM training started:
-1.2

In [17]:
net.eval()
out = net(test_dataT)
threshold = np.percentile(out, 20)
pred = (out > threshold).numpy().astype(int)

# Precision, Recall, F1
p, r, f, a = get_scores(pred, Y_test)
print("Precision:", p, "Recall:", r, "F1 Score:", f, "AUROC:", a)

SOM training started:
Precision: 0.7923018704724835 Recall: 0.5821417222751827 F1 Score: 0.6711544626093328 AUROC: 0.49153555062085713
