In [1]:
# Read data
import dask.dataframe as dd
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Column layout shared by every headerless flow CSV loaded below.
FLOW_COLUMNS = ['timestamp','duration','source_ip',
                'dest_ip','source_port','dest_port','protocol',
                'flag','fwd','stos','pkt','byt','Label']


def read_flows(csv_path):
    """Read one headerless flow CSV with dask and name its columns.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; it is read with ``header=None``.

    Returns
    -------
    dask DataFrame whose columns are ``FLOW_COLUMNS``.
    """
    flows = dd.read_csv(csv_path, header=None)
    # Assign a copy so the frames never share the constant list itself.
    flows.columns = list(FLOW_COLUMNS)
    return flows


# One frame per traffic class; the file-name prefix matches the dtN index.
dt0 = read_flows("../dataset/UGR/big_dataset/0_bacground.csv")
dt1 = read_flows("../dataset/UGR/big_dataset/1_blacklist_flows_cut.csv")
dt2 = read_flows("../dataset/UGR/big_dataset/2_botnet_flows_cut.csv")
dt3 = read_flows("../dataset/UGR/big_dataset/3_dos_flows_cut.csv")
dt4 = read_flows("../dataset/UGR/big_dataset/4_scan11_flows_cut.csv")
dt5 = read_flows("../dataset/UGR/big_dataset/5_scan44_flows_cut.csv")
dt6 = read_flows("../dataset/UGR/big_dataset/6_spam_flows_cut.csv")
dt7 = read_flows("../dataset/UGR/big_dataset/7_sshscan_flows_cut.csv")

In [2]:
# Keep only the rows labelled "background", materialise them, then draw a
# fixed-seed sample of 20,000 rows without replacement.
is_background = dt0['Label'] == "background"
normal = (
    dt0[is_background]
    .compute()
    .sample(n=20000, replace=False, random_state=1)
)

In [3]:
# blacklist: materialise the frame, then take a fixed-seed sample of
# 20,000 rows without replacement.
a1 = (
    dt1.compute()
    .sample(n=20000, replace=False, random_state=1)
)

In [4]:
# botnet: materialise the frame, then take a fixed-seed sample of
# 2,000 rows without replacement.
a2 = (
    dt2.compute()
    .sample(n=2000, replace=False, random_state=1)
)

In [5]:
# dos: materialise the frame, then take a fixed-seed sample of
# 4,500 rows without replacement.
a3 = (
    dt3.compute()
    .sample(n=4500, replace=False, random_state=1)
)

In [6]:
# scan11: materialise the frame, then take a fixed-seed sample of
# 20,000 rows without replacement.
a4 = (
    dt4.compute()
    .sample(n=20000, replace=False, random_state=1)
)

In [7]:
# scan44: materialise the frame, then take a fixed-seed sample of
# 20,000 rows without replacement.
a5 = (
    dt5.compute()
    .sample(n=20000, replace=False, random_state=1)
)

In [10]:
# spam: materialise the frame, then take a fixed-seed sample of
# 400 rows without replacement.
a6 = (
    dt6.compute()
    .sample(n=400, replace=False, random_state=1)
)

In [9]:
# sshscan: materialise every row of dt7; unlike the other classes,
# this one is not subsampled.
a7 = dt7.compute()

In [12]:
# Merge data
# Stack the background sample and every attack sample into a single frame
# with a fresh 0..n-1 index.
data1 = pd.concat([normal, a1, a2, a3, a4, a5, a6, a7], ignore_index=True, axis=0)

# Seed the shuffle with the same random_state used for the sampling steps so
# the row order of the exported file is identical on every run.
data_train = shuffle(data1, random_state=1).reset_index(drop=True)

print("-----------------------------------")
print(data_train['Label'].value_counts())
data_train.to_csv("../dataset/UGR/original_0.csv", index=False, header=True)

-----------------------------------
scan11             20000
background         20000
blacklist          20000
scan44             20000
dos                 4500
nerisbotnet         2000
anomaly-spam         400
anomaly-sshscan      109
Name: Label, dtype: int64


In [1]:
# Convert data
import argparse
import platform, logging, os
import re
import csv
import ipaddress

# Pattern for one raw flow line: a 2016-07 timestamp, a decimal duration,
# two dotted-quad IPv4 addresses, two numeric ports, an upper-case protocol
# name, a 6-character flag field of capitals/dots, four integer fields and
# a label ending in "s" or "d".
# Written as a raw string so that \d and \. reach the regex engine verbatim
# instead of being parsed as (invalid) string escape sequences.
REG_EXPR = r'^2016-07-\d{2} \d{2}:\d{2}:\d{2},\d+\.\d+,\d+\.\d+\.\d+\.\d+,\d+\.\d+\.\d+\.\d+,\d+,\d+,[A-Z]+,[A-Z\.]{6},\d+,\d+,\d+,\d+,.+[sd]$'

def convert(row):
    """Flatten one flow record into the list of values written to the output CSV.

    ``row`` is indexed from 0 to 12. The returned list holds, in order:
    the '-'-separated pieces of the date, the ':'-separated pieces of the
    time (both taken from ``row[0]``), ``row[1]`` unchanged, ``row[2]`` and
    ``row[3]`` as integer IP addresses, ``row[4]`` and ``row[5]`` unchanged,
    a numeric protocol code for ``row[6]``, one code point per character of
    ``row[7]``, and finally ``row[8]`` .. ``row[12]`` unchanged.
    """
    # TCP -> 1, UDP -> 2, ICMP -> 3, GRE -> 4, IPIP -> 5; anything else -> 6
    protocol_codes = {'TCP': 1, 'UDP': 2, 'ICMP': 3, 'GRE': 4, 'IPIP': 5}

    date_part, time_part = row[0].split(' ')

    # date pieces first, then time pieces, then the second field as-is
    outrow = list(date_part.split('-'))
    outrow.extend(time_part.split(':'))
    outrow.append(row[1])

    # IP addresses become their integer value
    outrow.extend(int(ipaddress.ip_address(row[idx])) for idx in (2, 3))

    # the two port fields are passed through untouched
    outrow.extend(row[idx] for idx in (4, 5))

    outrow.append(protocol_codes.get(row[6], 6))

    # each flag character becomes its own column holding its code point
    outrow.extend(map(ord, row[7]))

    # remaining fields (indices 8..12) are copied verbatim
    outrow.extend(row[idx] for idx in range(8, 13))
    return outrow
    
def main (data_in, data_out):
    """Convert every data row of ``data_in`` with ``convert`` and write it to ``data_out``.

    Parameters
    ----------
    data_in : str
        Path of the input CSV, whose first row is a header that is skipped.
        An empty value selects "../dataset/UGR/original_0.csv".
    data_out : str
        Path of the output CSV, written without a header row. An empty value
        selects "../dataset/UGR/original_1.csv".
    """
    data_out = data_out or "../dataset/UGR/original_1.csv"
    data_in = data_in or "../dataset/UGR/original_0.csv"

    # newline='' on both handles lets the csv module do its own newline handling.
    with open(data_in, newline='') as in_dt, open(data_out, 'w', newline='') as out_dt:
        print("Starting convert data...")
        writer = csv.writer(out_dt)
        read = csv.reader(in_dt)

        # Skip the header row; the default keeps an empty input file from
        # raising StopIteration.
        next(read, None)

        print("Converting...")
        for row in read:
            # csv.reader yields [] for blank lines; there is nothing to convert.
            if not row:
                continue
            # convert() always returns a list, so call it once and write the result.
            writer.writerow(convert(row))
        print("Completed!")


if __name__ == "__main__":
    main("","")

Starting convert data...
Converting...
Completed!


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Names of the scaled feature columns, in file order (everything between the
# dropped date parts and the trailing Label column).
feature_columns = ['hour','minute','second','duration','source_ip','dest_ip',
                   'source_port','dest_port','protocol',
                   'flag1','flag2','flag3','flag4','flag5','flag6',
                   'fwd','stos','pkt','byt']

# Load the converted flows. The frame is called `flows` rather than `dd` so
# that the `dask.dataframe as dd` alias from the first cell is not overwritten.
flows = pd.read_csv("../dataset/UGR/original_1.csv", header=None)
flows.columns = ['year','month','day'] + feature_columns + ['Label']
flows = flows.drop(['year','month','day'], axis = 1)

# Guard against the column layout drifting out of sync with the slicing below.
assert list(flows.columns) == feature_columns + ['Label']

# Split into the feature matrix and the label vector by position.
dataset = flows.values
n_features = len(feature_columns)
X = dataset[:, 0:n_features]
y = dataset[:, n_features]

# Rescale every feature column to the [0, 1] range.
min_max_scaler = MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)

# Convert the scaled array and the labels back to DataFrames.
data_x = pd.DataFrame(X_scale, columns = feature_columns)
data_y = pd.DataFrame(y, columns = ['Label'])

In [10]:
# Put the scaled features and the labels side by side. ignore_index=True
# discards the incoming column labels, so they are assigned again right after.
output_columns = [
    'hour', 'minute', 'second', 'duration',
    'source_ip', 'dest_ip', 'source_port', 'dest_port', 'protocol',
    'flag1', 'flag2', 'flag3', 'flag4', 'flag5', 'flag6',
    'fwd', 'stos', 'pkt', 'byt',
    'Label',
]
dataset = pd.concat([data_x, data_y], ignore_index=True, axis=1)
dataset.columns = output_columns

In [1]:
# Show how many rows each label has, then write the final frame to disk
# with a header row and without the index column.
label_counts = dataset['Label'].value_counts()
print(label_counts)
dataset.to_csv(
    "../dataset/UGR/original.csv",
    index=False,
    header=True,
)

NameError: name 'dataset' is not defined