In [1]:
import numpy as np
from pandas import DataFrame, read_csv
import os
from sklearn import preprocessing
    

In [11]:
# Read in data

# 'Field Names.csv' is read twice: once to obtain the bare column names used
# as the header of the data files, and once as a (field_name, field_type)
# lookup table that later cells use to select fields by type.
header = list(read_csv('Field Names.csv', header=None)[0])
field_types = read_csv('Field Names.csv', names=['field_name','field_type'])
intrusions = read_csv('Attack Types.csv', names=['attack_type','attack_group','set'])
data_train = read_csv('KDDTrain+.csv', names=header)
data_test = read_csv('KDDTest+.csv', names=header)

# Positional split: every column except the last two forms the feature frame,
# the second-to-last column is the label frame; the last column is not used here.
# `.iloc` replaces the `.ix` indexer (removed in pandas 1.0), and `.copy()`
# detaches the result from the parent frame so that later column assignments
# act on an independent frame instead of a slice of data_train/data_test.
x_train = data_train.iloc[:, :-2].copy()
y_train = data_train.iloc[:, -2:-1].copy()

x_test = data_test.iloc[:, :-2].copy()
y_test = data_test.iloc[:, -2:-1].copy()


In [12]:
# Mapping of attack types

# Pair every attack_type with its attack_group and turn the pairs into a
# lookup table (attack_type -> attack_group).
attack_tuples = list(zip(intrusions.attack_type, intrusions.attack_group))
attack_mapping = dict(attack_tuples)

# Rewrite the 'attack_type' column of both label frames through the lookup
# table; values without an entry in the mapping are left as they are.
y_train, y_test = (
    labels.replace({'attack_type': attack_mapping})
    for labels in (y_train, y_test)
)


In [13]:
# Encoding the data

# Distinct categorical values observed in the training split.
protocol_types = x_train.protocol_type.unique()
services = x_train.service.unique()
flags = x_train.flag.unique()
attack_types = y_train.attack_type.unique()

le = preprocessing.LabelEncoder()

# One (train frame, test frame, column) entry per categorical column.
# 'attack_type' stays last so that `le` ends up fitted on the attack labels,
# exactly as it did when the four stanzas were written out by hand.
for train_frame, test_frame, column in [
    (x_train, x_test, 'protocol_type'),
    (x_train, x_test, 'service'),
    (x_train, x_test, 'flag'),
    (y_train, y_test, 'attack_type'),
]:
    # Fit on the values of BOTH splits: LabelEncoder.transform raises
    # ValueError on a value it was not fitted on, so fitting on train only
    # breaks as soon as the test split contains an unseen category. When the
    # test values are a subset of the train values the resulting codes are
    # identical to a train-only fit (classes are sorted and de-duplicated).
    le.fit(np.concatenate([train_frame[column].unique(),
                           test_frame[column].unique()]))
    train_frame[column] = le.transform(train_frame[column])
    test_frame[column] = le.transform(test_frame[column])



In [29]:
# Normalizing the data

# Names of all fields whose declared type is 'continuous'.
field_tuples = list(zip(field_types.field_name, field_types.field_type))
fields = [name for name, kind in field_tuples if kind == 'continuous']

# Per-field extremes taken over BOTH splits.
# The minimum previously kept the larger of the two per-split minima because
# the comparison used `>`; min()/max() state the intent directly.
field_max_values = {
    field: max(np.amax(x_train[field]), np.amax(x_test[field]))
    for field in fields
}
field_min_values = {
    field: min(np.amin(x_train[field]), np.amin(x_test[field]))
    for field in fields
}
                      



    
    


{'num_access_files': 0, 'src_bytes': 0, 'srv_count': 0, 'num_compromised': 0, 'rerror_rate': 0.0, 'urgent': 0, 'dst_host_same_srv_rate': 0.0, 'duration': 0, 'srv_rerror_rate': 0.0, 'srv_serror_rate': 0.0, 'is_host_login': 0, 'wrong_fragment': 0, 'serror_rate': 0.0, 'num_outbound_cmds': 0, 'is_guest_login': 0, 'dst_host_rerror_rate': 0.0, 'dst_host_srv_serror_rate': 0.0, 'diff_srv_rate': 0.0, 'hot': 0, 'dst_host_srv_count': 0, 'logged_in': 0, 'num_shells': 0, 'dst_host_srv_diff_host_rate': 0.0, 'srv_diff_host_rate': 0.0, 'dst_host_same_src_port_rate': 0.0, 'root_shell': 0, 'su_attempted': 0, 'dst_host_count': 0, 'num_file_creations': 0, 'count': 0, 'land': 0, 'same_srv_rate': 0.0, 'dst_bytes': 0, 'dst_host_diff_srv_rate': 0.0, 'dst_host_srv_rerror_rate': 0.0, 'num_root': 0, 'num_failed_logins': 0, 'dst_host_serror_rate': 0.0}
