In [1]:
import pandas as pd
import numpy as np

import helperModule as hm


In [2]:
# 'Field Names.csv' is read without a header row: column 0 holds the field
# names and column 1 the field types.
fields = pd.read_csv('Field Names.csv', header=None)
field_names = fields[0].tolist()
field_types = fields[1]

# Column names are supplied explicitly, so the first row of each CSV is read
# as data. The training file is read first, then the test file.
data_train, data_test = (
    pd.read_csv(csv_path, names=field_names)
    for csv_path in ('KDD99Train.csv', 'KDD99Test.csv')
)


In [3]:
# Plain-text dump of the field table: column 0 is the field name, column 1 its type.
print(fields)

                              0           1
0                      duration  continuous
1                 protocol_type    symbolic
2                       service    symbolic
3                          flag    symbolic
4                     src_bytes  continuous
5                     dst_bytes  continuous
6                          land  continuous
7                wrong_fragment  continuous
8                        urgent  continuous
9                           hot  continuous
10            num_failed_logins  continuous
11                    logged_in  continuous
12              num_compromised  continuous
13                   root_shell  continuous
14                 su_attempted  continuous
15                     num_root  continuous
16           num_file_creations  continuous
17                   num_shells  continuous
18             num_access_files  continuous
19            num_outbound_cmds  continuous
20                is_host_login  continuous
21               is_guest_login 

In [4]:
# Number of observations (rows) in each set.
size_train = len(data_train)
size_test = len(data_test)

# Number of columns in each set.
num_fields_train = len(data_train.columns)
num_fields_test = len(data_test.columns)

# Distinct field types in order of first appearance, and the number of fields
# of each type. value_counts() tallies in one pass instead of rebuilding and
# rescanning a list once per type.
unique_field_types = field_types.unique()
type_tally = field_types.value_counts()
field_type_count = {field: int(type_tally.get(field, 0)) for field in unique_field_types}

print('There are a total of %d observations in the training set.' % size_train)
print('There are a total of %d observations in the test set.' % size_test)
# Joining keeps the two-type output unchanged and no longer fails when the
# number of distinct types is not exactly two.
print('Fields are of type: ' + ' and of type: '.join(map(str, unique_field_types)))
# Counts are looked up by type name rather than by position in
# unique_field_types, so each label always matches the number printed next to
# it regardless of which type appears first in the field list.
# (The "continous" spelling is kept so the output matches what was recorded.)
print('Number of fields of type "continous": %s' % field_type_count.get('continuous', 0))
print('Number of fields of type "symbolic": %s'  % field_type_count.get('symbolic', 0))

There are a total of 494021 observations in the training set.
There are a total of 311029 observations in the test set.
Fields are of type: continuous and of type: symbolic
Number of fields of type "continous": 39
Number of fields of type "symbolic": 4


In [5]:
# Distinct attack labels of each set, in order of first appearance.
train_attacks = data_train['attack_type'].unique().tolist()
test_attacks = data_test['attack_type'].unique().tolist()
num_attacks_train, num_attacks_test = len(train_attacks), len(test_attacks)

# Partition the labels into test-only, train-only and shared, keeping the
# order in which they appear in their source list.
unique_attack_test = [label for label in test_attacks if label not in train_attacks]
unique_attack_train = [label for label in train_attacks if not (label in test_attacks)]
common_attacks_train_test = [label for label in train_attacks if label in test_attacks]

# Comma-separated renderings used in the report below.
train_only_text = ', '.join(str(label) for label in unique_attack_train)
test_only_text = ', '.join(str(label) for label in unique_attack_test)
shared_text = ', '.join(str(label) for label in common_attacks_train_test)

print('There are a total of %d attack types in the training set.' % num_attacks_train)
print('There are a total of %d attack types in the test set.' % num_attacks_test)
print('Attacks that occur only in the training set are: [%s] ' % train_only_text)
print(' ')
print('Attacks that occur only in the test set are: [%s] ' % test_only_text)
print(' ')
print('Attacks that occur both sets: [%s] ' % shared_text)



There are a total of 23 attack types in the training set.
There are a total of 38 attack types in the test set.
Attacks that occur only in the training set are: [warezclient., spy.] 
 
Attacks that occur only in the test set are: [snmpgetattack., named., xlock., xsnoop., sendmail., saint., apache2., udpstorm., xterm., mscan., processtable., ps., httptunnel., worm., mailbomb., sqlattack., snmpguess.] 
 
Attacks that occur both sets: [normal., buffer_overflow., loadmodule., perl., neptune., smurf., guess_passwd., pod., teardrop., portsweep., ipsweep., land., ftp_write., back., imap., satan., phf., nmap., multihop., warezmaster., rootkit.] 


In [44]:

# Keep only the second-to-last column (the 'attack_type' label) of each set.
# .iloc replaces the .ix indexer, which was removed in pandas 1.0; the slice is
# positional in both. .copy() makes the frames independent of data_train /
# data_test so the column assignments below do not write into a slice.
attacks_train = data_train.iloc[:, -2:-1].copy()
attacks_test = data_test.iloc[:, -2:-1].copy()

# Remove every '.' from the labels (e.g. 'smurf.' -> 'smurf'). regex=False
# makes the dot a literal on every pandas version instead of depending on the
# version-specific default for `regex`.
attacks_train['attack_type'] = attacks_train['attack_type'].str.replace('.', '', regex=False)
attacks_test['attack_type'] = attacks_test['attack_type'].str.replace('.', '', regex=False)

# hm.mapAttackTypes is defined in helperModule (not visible here); the report
# below expects it to yield the labels NORMAL, DOS, PROBE, U2R and R2L.
attacks_train, attacks_test = hm.mapAttackTypes(attacks_train, attacks_test)
attack_type_groups = attacks_train['attack_type'].unique()

# One-pass tallies; groups are taken from the training set, and a group that
# is absent from a set is counted as 0.
train_tally = attacks_train['attack_type'].value_counts()
test_tally = attacks_test['attack_type'].value_counts()
attack_count_train = {attack: int(train_tally.get(attack, 0)) for attack in attack_type_groups}
attack_count_test = {attack: int(test_tally.get(attack, 0)) for attack in attack_type_groups}

# (group key, wording used in the report line)
report_groups = (
    ('NORMAL', 'normal connetions'),
    ('DOS', 'dos attacks'),
    ('PROBE', 'probe attacks'),
    ('U2R', 'u2r attacks'),
    ('R2L', 'r2l attacks'),
)
# (set name, per-group counts, total observations in that set)
report_sets = (
    ('train', attack_count_train, size_train),
    ('test', attack_count_test, size_test),
)
# NOTE(review): the %f value is a fraction in [0, 1] although the text says
# "percent". Wording and scale are kept as-is so the output matches what was
# recorded; multiply by 100 when the notebook output is regenerated.
report_line = 'In the %s set there are %d %s that represent %f percent of the connections'

for position, (group, wording) in enumerate(report_groups):
    if position:
        # Blank line between groups, none after the last one.
        print('')
    for set_name, group_counts, set_size in report_sets:
        occurrences = group_counts.get(group, 0)
        print(report_line % (set_name, occurrences, wording, float(occurrences) / float(set_size)))


In the train set there are 97278 normal connetions that represent 0.196911 percent of the connections
In the test set there are 60593 normal connetions that represent 0.194815 percent of the connections

In the train set there are 391458 dos attacks that represent 0.792391 percent of the connections
In the test set there are 229853 dos attacks that represent 0.739008 percent of the connections

In the train set there are 4107 probe attacks that represent 0.008313 percent of the connections
In the test set there are 4166 probe attacks that represent 0.013394 percent of the connections

In the train set there are 52 u2r attacks that represent 0.000105 percent of the connections
In the test set there are 228 u2r attacks that represent 0.000733 percent of the connections

In the train set there are 1126 r2l attacks that represent 0.002279 percent of the connections
In the test set there are 16189 r2l attacks that represent 0.052050 percent of the connections
