In [11]:
import numpy as np
from pandas import DataFrame, read_csv
import os
from sklearn import preprocessing
    

In [18]:
# Read in data
#
# All CSV files are expected in an `NSL_KDD` folder inside the current working
# directory. Paths are assembled with os.path.join rather than hand-written
# backslashes: a non-raw literal such as '\NSL_KDD' starts with the escape
# sequence `\N`, which Python 3 rejects at parse time, and backslash separators
# only work on Windows.

data_dir = os.path.join(os.getcwd(), 'NSL_KDD')
field_names_path = os.path.join(data_dir, 'Field Names.csv')

# Column names for the train/test files come from the first column of the
# field-name file; the same file is read again with named columns so that the
# name -> type pairing is available as a frame.
header = list(read_csv(field_names_path, header=None)[0])
field_types = read_csv(field_names_path, names=['field_name','field_type'])
intrusions = read_csv(os.path.join(data_dir, 'Attack Types.csv'), names=['attack_type','attack_group','set'])
data_train = read_csv(os.path.join(data_dir, 'KDDTrain+.csv'), names=header)
data_test = read_csv(os.path.join(data_dir, 'KDDTest+.csv'), names=header)

# Positional split: features are every column except the last two, the target
# is the second-to-last column (kept as a one-column DataFrame). `.iloc`
# replaces the removed `.ix` indexer, and `.copy()` makes each split an
# independent frame so later column assignments do not act on a slice of the
# raw data.
x_train = data_train.iloc[:, :-2].copy()
y_train = data_train.iloc[:, -2:-1].copy()

x_test = data_test.iloc[:, :-2].copy()
y_test = data_test.iloc[:, -2:-1].copy()


In [13]:
# Mapping of attack types

# (attack_type, attack_group) pairs, in the order they appear in `intrusions`.
attack_tuples = list(zip(intrusions.attack_type, intrusions.attack_group))

# Lookup table from a specific attack name to its group; if a name occurs more
# than once, the last pair wins.
attack_mapping = dict(attack_tuples)

# Substitute group names for attack names in the `attack_type` column only.
column_replacements = {'attack_type': attack_mapping}
y_train = y_train.replace(column_replacements)
y_test = y_test.replace(column_replacements)


In [14]:
# Encoding the data
#
# Every string-valued column is replaced by integer codes. Each column gets its
# own LabelEncoder, fitted on the values seen in the training split and then
# applied to both splits, so train and test share one code table per column.
# NOTE(review): LabelEncoder.transform is documented to raise ValueError for
# labels it was not fitted on -- confirm the test split contains no values
# absent from the training split.

protocol_types = x_train.protocol_type.unique()
services = x_train.service.unique()
flags = x_train.flag.unique()
attack_types = y_train.attack_type.unique()

# One fitted encoder per column, kept in a dict so the integer codes can be
# mapped back to their labels later (encoders[name].inverse_transform / .classes_).
encoders = {
    'protocol_type': preprocessing.LabelEncoder().fit(protocol_types),
    'service': preprocessing.LabelEncoder().fit(services),
    'flag': preprocessing.LabelEncoder().fit(flags),
    'attack_type': preprocessing.LabelEncoder().fit(attack_types),
}

# `assign` builds a new frame instead of writing a column into a frame that was
# sliced out of the raw data, which avoids SettingWithCopyWarning.
for column in ('protocol_type', 'service', 'flag'):
    encoder = encoders[column]
    x_train = x_train.assign(**{column: encoder.transform(x_train[column])})
    x_test = x_test.assign(**{column: encoder.transform(x_test[column])})

y_train = y_train.assign(attack_type=encoders['attack_type'].transform(y_train.attack_type))
y_test = y_test.assign(attack_type=encoders['attack_type'].transform(y_test.attack_type))

# Kept for compatibility: `le` previously ended this cell fitted on the attack labels.
le = encoders['attack_type']



In [20]:
# Normalizing

# Pair each field name with the type listed next to it in `field_types`.
field_tuples = [
    (name, kind)
    for name, kind in zip(field_types.field_name, field_types.field_type)
]

# Display the pairs as the cell output.
field_tuples




[('duration', 'continuous'),
 ('protocol_type', 'symbolic'),
 ('service', 'symbolic'),
 ('flag', 'symbolic'),
 ('src_bytes', 'continuous'),
 ('dst_bytes', 'continuous'),
 ('land', 'continuous'),
 ('wrong_fragment', 'continuous'),
 ('urgent', 'continuous'),
 ('hot', 'continuous'),
 ('num_failed_logins', 'continuous'),
 ('logged_in', 'continuous'),
 ('num_compromised', 'continuous'),
 ('root_shell', 'continuous'),
 ('su_attempted', 'continuous'),
 ('num_root', 'continuous'),
 ('num_file_creations', 'continuous'),
 ('num_shells', 'continuous'),
 ('num_access_files', 'continuous'),
 ('num_outbound_cmds', 'continuous'),
 ('is_host_login', 'continuous'),
 ('is_guest_login', 'continuous'),
 ('count', 'continuous'),
 ('srv_count', 'continuous'),
 ('serror_rate', 'continuous'),
 ('srv_serror_rate', 'continuous'),
 ('rerror_rate', 'continuous'),
 ('srv_rerror_rate', 'continuous'),
 ('same_srv_rate', 'continuous'),
 ('diff_srv_rate', 'continuous'),
 ('srv_diff_host_rate', 'continuous'),
 ('dst_ho