In [170]:
import os
import torch
import csv
import random
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.utils.data as data_utils


In [171]:
#constant declaration
NUM_EPOCHS = 50
LEARNING_RATE = 1e-3
BATCH_SIZE = 100

In [172]:
# importing benign and mixed flow datasets from tran_feature_selection

# Re-enable these three lines when using real data instead of sample_flows.csv
# %store -r benign_flows  
# %store -r mixed_flows
# %store -r features

# This is a subset of mixed data intended only for making sure the model works on our data format
# actual analysis should use benign_flows and mixed_flows that were stored by tran_feature_selection
benign_flows = pd.read_csv("sample_flows.csv", index_col=None, delimiter=',') 

features_ori = ['flowInd', 'duration', 'numHdrDesc',
            'l4Proto', 'macPairs', 'dstPortClassN', 'numPktsSnt', 'numPktsRcvd',
            'numBytesSnt', 'numBytesRcvd', 'minPktSz', 'maxPktSz', 'avePktSize', 'stdPktSize', 'pktps', 'bytps',
            'pktAsm', 'bytAsm', 'ipMindIPID', 'ipMaxdIPID', 'ipMinTTL', 'ipMaxTTL', 'ipTTLChg', 'ipOptCnt',
            'tcpPSeqCnt', 'tcpSeqSntBytes', 'tcpSeqFaultCnt', 'tcpPAckCnt', 'tcpFlwLssAckRcvdBytes', 'tcpAckFaultCnt',
            'tcpInitWinSz', 'tcpAveWinSz', 'tcpMinWinSz', 'tcpMaxWinSz', 'tcpWinSzDwnCnt', 'tcpWinSzUpCnt',
            'tcpWinSzChgDirCnt', 'tcpOptPktCnt', 'tcpOptCnt', 'tcpMSS', 'tcpWS', 'tcpTmS', 'tcpTmER', 'tcpEcI',
            'tcpBtm', 'tcpSSASAATrip', 'tcpRTTAckTripMin', 'tcpRTTAckTripMax', 'tcpRTTAckTripAve',
            'tcpRTTAckTripJitAve', 'tcpRTTSseqAA', 'tcpRTTAckJitAve', 'icmpTCcnt', 'icmpEchoSuccRatio', 'icmpPFindex',
            'connSip', 'connDip', 'connSipDip', 'connSipDprt', 'connF',  'aveIAT', 'maxIAT', 
                'stdIAT', 'tcpISeqN', 'tcpUtm', 'tcpWinSzThRt']
benign_flows1 = pd.DataFrame(benign_flows[features_ori])

features = benign_flows1.columns
dim = len(features)

# target = []
# for i in range(len(benign_flows1.index)):
#     target.append(random.randint(0,1))
    
# target = np.asarray(target)


In [173]:
#dataset loading
train_tensor = torch.tensor(benign_flows[features].values.astype(np.float32))
train_loader = torch.utils.data.DataLoader(train_tensor, batch_size = BATCH_SIZE, shuffle = False)
print(train_tensor)


tensor([[1.0000e+00, 1.9027e-01, 1.0000e+00,  ..., 5.8152e+08, 0.0000e+00,
         3.3333e-01],
        [1.0000e+00, 1.8700e-04, 1.0000e+00,  ..., 1.4947e+09, 0.0000e+00,
         0.0000e+00],
        [2.0000e+00, 1.0197e+01, 1.0000e+00,  ..., 4.0063e+08, 1.3048e+04,
         1.1111e-01],
        ...,
        [5.4600e+02, 0.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [5.4600e+02, 0.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.3200e+02, 3.0262e+02, 1.0000e+00,  ..., 9.7572e+08, 2.0200e+01,
         0.0000e+00]])


In [174]:
#encoder decoder 

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        # encoder
        self.enc1 = nn.Linear(in_features=dim, out_features=dim)
        self.enc2 = nn.Linear(in_features=dim, out_features=int(dim/2))
        self.enc3 = nn.Linear(in_features=int(dim/2), out_features=int(dim/4))
        self.enc4 = nn.Linear(in_features=int(dim/4), out_features=int(dim/8))
        
        # decoder 
        self.dec1 = nn.Linear(in_features=int(dim/8), out_features=int(dim/4))
        self.dec2 = nn.Linear(in_features=int(dim/4), out_features=int(dim/2))
        self.dec3 = nn.Linear(in_features=int(dim/2), out_features=dim)
        self.dec4 = nn.Linear(in_features=dim, out_features=dim)
        
    def forward(self, x):
        x = F.relu(self.enc1(x))
        x = F.relu(self.enc2(x))
        x = F.relu(self.enc3(x))
        x = F.relu(self.enc4(x))

        x = F.relu(self.dec1(x))
        x = F.relu(self.dec2(x))
        x = F.relu(self.dec3(x))
        x = F.relu(self.dec4(x))
        return x
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Autoencoder()
optimizer = optim.Adam(net.parameters(), lr=1e-3)

#print(net)

In [175]:
#Trainning model

loss_function = nn.BCEWithLogitsLoss()  #MSELoss too
get_loss = list()
def training(net, trainloader, epochs):
    train_loss = []
    for epoch in range(epochs):
        running_loss = 0.0
        for data in train_loader:
            input_data = data.to(device=device)
            optimizer.zero_grad()
            output = net(input_data).to(device=device)          # output is the reconstruced x 
            loss = loss_function(output,input_data).to(device=device) # input_data should be the target variable
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        loss = running_loss / len(trainloader)
        train_loss.append(loss)
        
        if epoch % 5 == 0:
            print('Epoch {} of {}, Train Loss: {:.3f}'.format(
            epoch+1, NUM_EPOCHS, loss))
    return train_loss

get_loss = training(net, train_loader, NUM_EPOCHS)
get_loss
#plotting of get_loss 


            

Epoch 1 of 50, Train Loss: -66447977139404.797
Epoch 6 of 50, Train Loss: -109192170733096144.000
Epoch 11 of 50, Train Loss: -27524978376674639872.000
Epoch 16 of 50, Train Loss: -1368524755090605604864.000
Epoch 21 of 50, Train Loss: -9081327910871231365120.000
Epoch 26 of 50, Train Loss: -15374838973708713852928.000
Epoch 31 of 50, Train Loss: -20437369105906841681920.000
Epoch 36 of 50, Train Loss: -23506081648112304128000.000
Epoch 41 of 50, Train Loss: -25896070945425105879040.000
Epoch 46 of 50, Train Loss: -27675573481113405882368.000


[-66447977139404.8,
 -417625436114124.8,
 -1852168109817856.0,
 -7475435457740800.0,
 -2.9357224152478516e+16,
 -1.0919217073309614e+17,
 -3.807411075012362e+17,
 -1.2368709267845284e+18,
 -3.732866770286726e+18,
 -1.049448941451393e+19,
 -2.752497837667464e+19,
 -6.763087319128447e+19,
 -1.565428358982859e+20,
 -3.432760330449383e+20,
 -7.139705642420801e+20,
 -1.3685247550906056e+21,
 -2.357255827901751e+21,
 -3.796081478726396e+21,
 -5.710979869893517e+21,
 -7.571059342046803e+21,
 -9.081327910871231e+21,
 -1.0495838938029688e+22,
 -1.1784042209710227e+22,
 -1.299363445588327e+22,
 -1.4184065586806244e+22,
 -1.5374838973708714e+22,
 -1.6545319063284632e+22,
 -1.766623614000063e+22,
 -1.8699009435072022e+22,
 -1.962513202155664e+22,
 -2.043736910590684e+22,
 -2.1155626364499533e+22,
 -2.1800125274561735e+22,
 -2.2397648989368026e+22,
 -2.296322252882601e+22,
 -2.3506081648112304e+22,
 -2.4028296812064496e+22,
 -2.4529927733839678e+22,
 -2.500968889083094e+22,
 -2.5464311644315906e+22