In [1]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
IDS2017URL = "/gdrive/MyDrive/IDS2017/IDS2017.csv"


In [3]:
import torch
import pandas as pd
import tensorflow as tf
import numpy as np
import torch.nn as nn
from numpy import vstack
from numpy import argmax
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch import Tensor
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
from torchvision import transforms

In [4]:
torch.random.manual_seed(2) 

<torch._C.Generator at 0x7f647bf08be8>

In [122]:
class IDSDataset(Dataset):

    def __init__(self, IDS2017URL, tranform=True):
      df = pd.read_csv(IDS2017URL)
      df = df[~df['Label'].isin(['Heartbleed', 'Web Attack - Sql Injection', 'Infiltration', 'Web Attack - XSS', 'Web Attack - Brute Force'])]
      df = df.replace([-np.inf, np.inf], np.nan)
      df = df.dropna()
      df.drop(df.columns[[37,39,61,62,63,64,65,66]],axis=1,inplace=True)
      self.X = df.iloc[:, :-2].apply(lambda x: (x-np.mean(x))/np.std(x)).values
      self.y = df.values[:, -2]
      self.X = self.X.astype('float32')
      self.y = LabelEncoder().fit_transform(self.y)

    def __len__(self):
      return len(self.X)
    
    def __getitem__(self, idx):
      return [self.X[idx], self.y[idx]]
 
    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])

class MLP(Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        # input to first hidden layer
        self.hidden1 = Linear(n_inputs, 76)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # second hidden layer
        self.hidden2 = Linear(76, 76)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # third hidden layer and output
        self.hidden3 = Linear(76, 10)
        xavier_uniform_(self.hidden3.weight)
        self.act3 = Softmax(dim=1)

    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)

        # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)

        # output layer
        X = self.hidden3(X)
        X = self.act3(X)

        return X
 
def prepare_data(path):
    # load the dataset
    dataset = IDSDataset(path)
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=1000, shuffle=True)
    test_dl = DataLoader(test, batch_size=1000, shuffle=False)
    return train_dl, test_dl

def train_model(train_dl, model):
    # define the optimization
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.0001)
    # enumerate epochs
    for epoch in range(10):
        # enumerate mini batches
        print(model.state_dict())
        for i, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            #torch.autograd.set_detect_anomaly(True)
            # compute the model output
            #print(inputs)
            #print("******************")
            yhat = model(inputs)
            #print(yhat)
            # calculate loss
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()

def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        # convert to class labels
        yhat = argmax(yhat, axis=1)
        # reshape for stacking
        actual = actual.reshape((len(actual), 1))
        yhat = yhat.reshape((len(yhat), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate accuracy
    acc = accuracy_score(actuals, predictions)
    return acc

def predict(row, model):
    # convert row to data
    row = Tensor([row])
    # make prediction
    yhat = model(row)
    # retrieve numpy array
    yhat = yhat.detach().numpy()
    return yhat

In [109]:
# class IDSDataset(Dataset):

#     def __init__(self, IDS2017URL, tranform=True):
#       df = pd.read_csv(IDS2017URL)
#       df = df[~df['Label'].isin(['Heartbleed', 'Web Attack - Sql Injection', 'Infiltration', 'Web Attack - XSS', 'Web Attack - Brute Force'])]
#       df = df.replace([-np.inf, np.inf], np.nan)
#       df = df.dropna()
#       df = pd.concat([df.iloc[:,0:37], df[:,38:],df.iloc[:, -2:]], axis=1)
#       self.X = df.iloc[:, :-2].apply(lambda x: (x-np.mean(x))/np.std(x)+0.00001).values
#       self.y = df.values[:, -2]
#       self.X = self.X.astype('float32')
#       self.y = LabelEncoder().fit_transform(self.y)

#     def __len__(self):
#       return len(self.X)
    
#     def __getitem__(self, idx):
#       return [self.X[idx], self.y[idx]]
 
#     # get indexes for train and test rows
#     def get_splits(self, n_test=0.33):
#         # determine sizes
#         test_size = round(n_test * len(self.X))
#         train_size = len(self.X) - test_size
#         # calculate the split
#         return random_split(self, [train_size, test_size])

# class MLP(Module):
#     # define model elements
#     def __init__(self, n_inputs):
#         super(MLP, self).__init__()
#         # input to first hidden layer
#         self.hidden1 = Linear(n_inputs, 76)
#         #kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
#         self.act1 = ReLU()
#         # second hidden layer
#         self.hidden2 = Linear(76, 76)
#         #kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
#         self.act2 = ReLU()
#         # third hidden layer and output
#         self.hidden3 = Linear(76, 10)
#         #xavier_uniform_(self.hidden3.weight)
#         self.act3 = Softmax(dim=1)

#     def forward(self, X):
#         # input to first hidden layer
#         X = self.hidden1(X)
#         X = self.act1(X)
#         print(X)
#         # second hidden layer
#         X = self.hidden2(X)
#         X = self.act2(X)
#         print(X)
#         # output layer
#         X = self.hidden3(X)
#         X = self.act3(X)
#         print(X)
#         return X
 
# def prepare_data(path):
#     # load the dataset
#     dataset = IDSDataset(path)
#     # calculate split
#     train, test = dataset.get_splits()
#     # prepare data loaders
#     train_dl = DataLoader(train, batch_size=1000, shuffle=True)
#     test_dl = DataLoader(test, batch_size=1000, shuffle=False)
#     return train_dl, test_dl

# def train_model(train_dl, model):
#     # define the optimization
#     criterion = CrossEntropyLoss()
#     optimizer = Adam(model.parameters(), lr=0.0001)
#     # enumerate epochs
#     for epoch in range(10):
#         # enumerate mini batches
#         print(model.state_dict())
#         for i, (inputs, targets) in enumerate(train_dl):
#             # clear the gradients
#             optimizer.zero_grad()
#             #torch.autograd.set_detect_anomaly(True)
#             # compute the model output
#             #print(inputs)
#             #print("******************")
#             yhat = model(inputs)
#             #print(yhat)
#             # calculate loss
#             loss = criterion(yhat, targets)
#             # credit assignment
#             loss.backward()
#             # update model weights
#             optimizer.step()

# def evaluate_model(test_dl, model):
#     predictions, actuals = list(), list()
#     for i, (inputs, targets) in enumerate(test_dl):
#         # evaluate the model on the test set
#         yhat = model(inputs)
#         # retrieve numpy array
#         yhat = yhat.detach().numpy()
#         actual = targets.numpy()
#         # convert to class labels
#         yhat = argmax(yhat, axis=1)
#         # reshape for stacking
#         actual = actual.reshape((len(actual), 1))
#         yhat = yhat.reshape((len(yhat), 1))
#         # store
#         predictions.append(yhat)
#         actuals.append(actual)
#     predictions, actuals = vstack(predictions), vstack(actuals)
#     # calculate accuracy
#     acc = accuracy_score(actuals, predictions)
#     return acc

# def predict(row, model):
#     # convert row to data
#     row = Tensor([row])
#     # make prediction
#     yhat = model(row)
#     # retrieve numpy array
#     yhat = yhat.detach().numpy()
#     return yhat

In [123]:
train_dl, test_dl = prepare_data(IDS2017URL)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
train_dl.dataset

<torch.utils.data.dataset.Subset at 0x7f63e684af60>

In [17]:
print(len(train_dl.dataset), len(test_dl.dataset))

1895106 933410


In [124]:
model = MLP(75)

In [125]:
train_model(train_dl, model)

OrderedDict([('hidden1.weight', tensor([[-0.2152, -0.0275,  0.2764,  ..., -0.0381, -0.0354, -0.2230],
        [ 0.0703,  0.0577,  0.2258,  ..., -0.1524, -0.0200,  0.0586],
        [-0.0541,  0.1380, -0.0487,  ..., -0.0963, -0.1906,  0.0941],
        ...,
        [ 0.2285, -0.2300,  0.0472,  ...,  0.1778,  0.0857,  0.0176],
        [-0.0816, -0.1023, -0.1284,  ...,  0.0324, -0.1446, -0.1701],
        [-0.1669,  0.1833, -0.2724,  ..., -0.2017,  0.2758,  0.1902]])), ('hidden1.bias', tensor([ 0.0360,  0.0897, -0.0755,  0.0298,  0.0615,  0.0271, -0.0245,  0.1129,
        -0.1147, -0.0234, -0.0579, -0.0177, -0.0588, -0.0828, -0.0931,  0.0154,
         0.0312, -0.0295,  0.0492,  0.0926, -0.0306, -0.0055, -0.0203,  0.0602,
        -0.0822, -0.0545,  0.0148,  0.0141, -0.0255, -0.0764, -0.0216, -0.0392,
        -0.0586,  0.0301, -0.1124,  0.0509,  0.0074, -0.1149, -0.0889, -0.0587,
         0.0873, -0.0345,  0.0242,  0.0370,  0.1016,  0.0537,  0.0937,  0.0046,
        -0.0772, -0.0061, -0.0912, 

In [137]:
torch.save(model.state_dict(), '/gdrive/MyDrive/IDS2017/2.pth')

In [126]:
acc = evaluate_model(test_dl, model)
print('Accuracy: %.3f' % acc)

Accuracy: 0.907


In [128]:
print(model.state_dict())

OrderedDict([('hidden1.weight', tensor([[-0.2546,  0.0173,  0.2271,  ..., -0.1121, -0.0352, -0.2136],
        [ 0.2257,  0.0809,  0.2913,  ..., -0.1351, -0.0210,  0.0609],
        [-0.0431,  0.0863,  0.0944,  ..., -0.1471, -0.2224,  0.0645],
        ...,
        [ 0.3013, -0.3549, -0.0537,  ...,  0.0721,  0.0459, -0.0088],
        [ 0.0434, -0.2208,  0.0058,  ..., -0.0094, -0.1522, -0.1472],
        [-0.1968,  0.1637, -0.1660,  ..., -0.2490,  0.2451,  0.1810]])), ('hidden1.bias', tensor([ 7.1303e-02,  1.0398e-01, -2.2710e-02,  7.3222e-02,  1.3060e-01,
        -4.4802e-03, -3.8276e-02,  2.0086e-01, -1.3930e-01,  1.2710e-02,
         7.4247e-03, -2.6362e-02,  1.6921e-02, -5.0679e-02, -1.0915e-01,
        -1.2110e-02,  9.6036e-02, -4.2719e-02,  1.0658e-01,  1.9617e-01,
         8.8316e-03,  3.8669e-02,  5.1121e-02,  9.6465e-02,  1.5525e-02,
        -2.1970e-02,  5.8663e-02,  1.1629e-02,  3.8936e-02, -1.2617e-02,
         2.9186e-02,  1.0753e-02, -1.1594e-01,  1.0797e-01, -1.2299e-01,
    

In [136]:
row = [24049,1541,49633,2610,80,6,264,5125872,8,7,1659,2514.0,467,0,207.375,221.2780652,1047,0,359.14285710000007,420.18386,814.0,2.0,366133.7143,1320784.35,4954614.0,73.0,171258.0,24465.42857,21945.246219999997,57169.0,1001.0,5125743.0,854290.5,2033515.304,5004981.0,2049.0,0,0,0,0,172,152,1.560710061,1.3656213030000002,0,1047,260.8125,322.5548468,104041.6292,0,0,0,1,0,0,0,0,0,278.2,207.375,359.14285710000007,0,0,0,0,0,0,8,1659,7,2514,8192,262,7,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
row = [row[:37]+row[38:39]+row[40:61]+row[67:]]
yhat = predict(row, model)
print(yhat)

[[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]]
