In [1]:
import pickle
import helperModule as hm
import warnings

# Silence every warning for the rest of the session (no category filter is given).
# NOTE(review): this also hides deprecation warnings raised by the code below — confirm this is intended.
warnings.filterwarnings("ignore")

In [2]:
#########################
############ Read in data
#########################

# Load the data through the helper module; the call returns two objects,
# unpacked here as the training set and the test set.
data_train, data_test = hm.readDataKDD99()


In [3]:
#########################
##### Replace Attack Type
#########################

'''
Strip the trailing "." from the attack type names
-----------
buffer_overflow.  ==>   buffer_overflow
ftp_write.        ==>   ftp_write
-----------
'''
# Same clean-up for both sets: train first, then test.
data_train, data_test = map(hm.replaceAttackTypes, (data_train, data_test))



In [4]:
#########################
## Drop Attack Type Index
#########################

'''
Drop the "attack_type_index" column from both sets
'''

# Apply the helper to the training set first, then to the test set.
data_train, data_test = (
    hm.dropAttackIndex(frame) for frame in (data_train, data_test)
)


In [5]:
#########################
# Mapping of attack types
#########################

'''
Each of the 38 attack types is replaced by the group it belongs to
-----------
normal                      ==>   NORMAL
back,land ..                ==>   DOS
satan,ipsweep,nmap, ..      ==>   PROBE
ftp_write,guess_passwd, ..  ==>   R2L
rootkit,perl ..             ==>   U2R
-----------
'''

# Training set is mapped first, then the test set.
data_train, data_test = map(hm.mapAttackTypes, (data_train, data_test))

In [6]:
##########################
# Feature Encoding
##########################

'''
Categorical features (protocol_types, service, flags) are encoded into integers.
-----------
protocol_types:
['tcp' 'udp' 'icmp']                          ==> [0, 1, 2]
service:
['ftp_data', 'telnet', ... 'rje', 'harvest']  ==> [0, 1, .... 67, 68]
flags:
['SF', 'S0', ...  ,'S2', 'OTH']               ==> [ 0, 1 ... , 9, 10]
-----------
'''

# Both sets are handed to the helper in a single call.
# NOTE(review): presumably so train and test share one encoding — confirm in helperModule.
data_train, data_test = hm.encodeFeatures(data_train, data_test)

In [7]:
##########################
# Label Encoding
##########################

'''
5 different attack groups are encoded into integers
-----------
NORMAL         ==>   0
DOS            ==>   1
PROBE          ==>   2
U2R            ==>   3
R2L            ==>   4
-----------
'''
# Both sets are handed to the helper in a single call.
# NOTE(review): the group-to-integer table above is not visible from this notebook — verify against helperModule.
data_train, data_test = hm.encodeLabels(data_train, data_test)

In [8]:
#########################
############## Split data
#########################

'''
Separate each set into its features (x) and its labels (y)

'''

# splitData is called once per set (train, then test); each call yields an (x, y) pair.
(x_train, y_train), (x_test, y_test) = map(hm.splitData, (data_train, data_test))


In [9]:
#########################
####### Binarize Labels
#########################

'''
The 5 integer-encoded attack groups are turned into binary indicator vectors
-----------
0         ==>   [1,0,0,0,0]
1         ==>   [0,1,0,0,0]
2         ==>   [0,0,1,0,0]
3         ==>   [0,0,0,1,0]
4         ==>   [0,0,0,0,1]
-----------
'''

# The label vectors are binarized independently: train first, then test.
y_train, y_test = (
    hm.binarizeLabels(labels) for labels in (y_train, y_test)
)

In [10]:
#########################
######### Feature Scaling
#########################

'''
Scale features to values between 0 and 1
'''

# Both feature sets are handed to the helper in a single call.
# NOTE(review): presumably the scaling is derived from the training set only — confirm in helperModule.
x_train, x_test = hm.scaleFeatures(x_train, x_test)


In [11]:
#########################
####### Feature Selection
#########################

'''
Keep only a fixed subset of 11 feature columns, selected by position
'''

# Single source of truth for the kept column positions, so the train and
# test selections cannot drift apart.
SELECTED_FEATURE_COLUMNS = [0, 1, 2, 4, 5, 7, 24, 32, 34, 35, 39]

# NOTE: positional selection is not idempotent; re-running this cell on the
# already reduced frames raises an IndexError (position 39 no longer exists).
x_train = x_train.iloc[:, SELECTED_FEATURE_COLUMNS]
x_test = x_test.iloc[:, SELECTED_FEATURE_COLUMNS]

In [12]:
#########################
## Feature Transformation
#########################

'''
Turn features into numpy arrays
'''

# The `.ix` indexer was removed in pandas 1.0 and raises AttributeError there.
# `.values` on the whole frame returns the same array that `.ix[:, :].values`
# produced, and is available in old and new pandas versions alike.
x_train = x_train.values
x_test = x_test.values


In [13]:
# Bundle the preprocessed features and labels into one dict for serialization.
data = {
    'x_train': x_train,
    'x_test': x_test,
    'y_train': y_train,
    'y_test': y_test,
}

# The context manager guarantees the file is flushed and closed even if
# pickling fails. pickle.dump returns None, so its result is not kept.
with open("kdd99reduced.p", "wb") as pickle_file:
    pickle.dump(data, pickle_file)