In [8]:
import numpy as np
import pickle
from pandas import DataFrame, read_csv
from sklearn import preprocessing
import helperModule as hm
import warnings

warnings.filterwarnings("ignore")

In [9]:
# ------------------------------------------------------------------
# Read in the data and split it
# ------------------------------------------------------------------

# hm.readData() supplies the train and test sets; hm.splitData() then breaks
# them into four pieces bound to x_* / y_* (presumably features and labels --
# confirm against helperModule).
(data_train, data_test) = hm.readData()
(x_train, y_train, x_test, y_test) = hm.splitData(data_train, data_test)


In [10]:
# ------------------------------------------------------------------
# Collapse individual attack types into attack groups
# ------------------------------------------------------------------

'''
38 different attack types are mapped to their corresponding group
-----------
normal                      ==>   NORMAL
back,land ..                ==>   DOS
satan,ipsweep,nmap, ..      ==>   PROBE
ftp_write,guess_passwd, ..  ==>   R2L
rootkit,perl ..             ==>   U2R
-----------
'''

# Both label sets are passed through a single helper call and rebound to the
# values it returns.
(y_train, y_test) = hm.mapAttackTypes(y_train, y_test)


In [11]:
# ------------------------------------------------------------------
# Encode categorical features as integers
# ------------------------------------------------------------------

'''
Categorial features (protocol_types, service, flags) are encoded into integers.
-----------
protocol_types:
['tcp' 'udp' 'icmp']                          ==> [0, 1, 2]
service:
['ftp_data', 'telnet', ... 'rje', 'harvest']  ==> [0, 1, .... 67, 68]
flags:
['SF', 'S0', ...  ,'S2', 'OTH']               ==> [ 0, 1 ... , 9, 10]
-----------
'''

# hm.encodeFeatures is invoked once per split (training split first) and each
# result replaces the object it was computed from.
# NOTE(review): this relies on the helper using a fixed category-to-integer
# table like the one listed above; if it fits an encoder per call, the train
# and test splits could receive different codes -- confirm in helperModule.
x_train, x_test = map(hm.encodeFeatures, (x_train, x_test))


In [12]:
# ------------------------------------------------------------------
# Encode attack-group labels as integers
# ------------------------------------------------------------------

'''
5 different attack groups are encoded into integers
-----------
NORMAL         ==>   0
DOS            ==>   1
PROBE          ==>   2
R2L            ==>   3
U2R            ==>   4
-----------
'''

# One helper call per label set, training labels first; each result replaces
# the labels it was derived from.
y_train, y_test = map(hm.encodeLabels, (y_train, y_test))

In [13]:
# ------------------------------------------------------------------
# Binarize the labels
# ------------------------------------------------------------------

# hm.binarizeLabels is applied to each label set in turn (train, then test).
# NOTE(review): the output format (e.g. one-hot rows vs. a 0/1 vector) is
# decided inside helperModule and is not visible here -- confirm before
# relying on the shape of y_train / y_test.
y_train, y_test = map(hm.binarizeLabels, (y_train, y_test))


In [14]:
# ------------------------------------------------------------------
# Scale the features
# ------------------------------------------------------------------

'''
Scale non-categorial features into to values between 0 and 1
'''

# Train and test features are handed to the helper together in one call and
# rebound to the pair it returns.
(x_train, x_test) = hm.scaleFeatures(x_train, x_test)


#########################
######### Turn the DataFrames into NumPy matrices
#########################

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas
# 1.0, so calling it raises AttributeError on current installations. The
# `.values` attribute is the replacement named in the deprecation notice; it
# returns the same NumPy array and is available on old and new pandas alike.
x_train = x_train.values
x_test = x_test.values




In [18]:
# Bundle the four preprocessed splits under fixed keys so they can be stored
# and later reloaded as a single object.
data = {
    'x_train': x_train,
    'y_train': y_train,
    'x_test': x_test,
    'y_test': y_test,
}

# Write the bundle through a context manager: the file handle is flushed and
# closed as soon as the dump finishes (or fails), instead of being left open
# until the garbage collector happens to reclaim it.
with open("nsl-kdd.p", "wb") as pickle_file:
    pickle.dump(data, pickle_file)