In [2]:
"""
This python script does the following:
1. Loads NSL_KDD_Dataset from a file
2. Convert every continuous variable x into categorical by binnig. 
   Here a quantile based binning is used
   0.0 <= x < 0.2 : 1
   0.2 <= x < 0.4 : 2
   0.4 <= x < 0.6 : 3
   0.6 <= x < 0.8 : 4
   0.8 <= x < 1.0 : 5
3. Convert all categorical variables to boolean using one hot encoding
4. Write the resulting output into a file after compressing using pickle
"""

'\nThis python script does the following:\n1. Loads NSL_KDD_Dataset from a file\n2. Convert every continuous variable x into categorical by binnig. Here a quantile based binning is used\n   - x < 0.2 : low\n   - 0.2 <= x < 0.4: low_medium\n   - 0.4 <= x\n'

In [61]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [62]:
# File containing NSL_KDD_DataSet
f = '/Users/harikoduvely/Projects/RL/NSL_KDD_DataSet/Small_Training_Set.csv'

In [63]:
# Column Names for NSL_KDD_DataSet
nsl_kdd_columns = ['duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'attack_type',
'no_correctly_classified'] 

In [64]:
# Load Data
df_nsl = pd.read_csv(f,header=None, names=nsl_kdd_columns) 

In [65]:
# Dropping the column 'no_correctly_classified'
df_nsl = df_nsl.drop(['no_correctly_classified'], axis=1)

In [66]:
# Split Numeric and Categorical Variables
df_nsl_cat = df_nsl.select_dtypes(include=[object])

In [67]:
df_nsl_num = df_nsl.select_dtypes(exclude=[object])

In [68]:
# Keeping the attack_type in a separate dataframe (no need to one hot encode this)
df_nsl_attack_type = df_nsl_cat.drop(['protocol_type','service','flag'], axis=1)

In [69]:
df_nsl_cat = df_nsl_cat.drop(['attack_type'], axis=1)

In [70]:
# Keeping the binary variables in a separate dataframe (no need to bin these)
df_nsl_bin = df_nsl_num[['logged_in','is_host_login','is_guest_login','root_shell','urgent','su_attempted']]

In [71]:
df_nsl_cont= df_nsl_num.drop(['logged_in','is_host_login','is_guest_login','root_shell','urgent','su_attempted'], axis=1)

In [72]:
# Discretizing continuous variables using binning
def quantile_discret(x, q2, q4, q6, q8):
    # _1  : low 
    # _2 : Low_Medium
    # _3  : Medium
    # _4 : High_Medium
    # _5  : High
    if x < q2:
        y = 1
    elif (x >= q2 and x < q4):
        y = 2
    elif (x >= q4 and x < q6):
        y = 3
    elif (x >= q6 and x < q8):
        y = 4
    else:
        y = 5

    return y

In [73]:
colnames_nsl_cont = list(df_nsl_cont)

In [74]:
for name in colnames_nsl_cont:
    new_name = 'disc_' + name
    q2 = df_nsl_cont[name].quantile(0.2)
    q4 = df_nsl_cont[name].quantile(0.4)
    q6 = df_nsl_cont[name].quantile(0.6)
    q8 = df_nsl_cont[name].quantile(0.8)
    df_nsl_cont[new_name] = df_nsl_cont[name].apply(quantile_discret, args=(q2, q4, q6, q8))

In [75]:
df_nsl_disc = df_nsl_cont.drop(colnames_nsl_cont, axis=1)

In [76]:
df_nsl_disc.head()

Unnamed: 0,disc_duration,disc_src_bytes,disc_dst_bytes,disc_land,disc_wrong_fragment,disc_hot,disc_num_failed_logins,disc_num_compromised,disc_num_root,disc_num_file_creations,...,disc_dst_host_count,disc_dst_host_srv_count,disc_dst_host_same_srv_rate,disc_dst_host_diff_srv_rate,disc_dst_host_same_src_port_rate,disc_dst_host_srv_diff_host_rate,disc_dst_host_serror_rate,disc_dst_host_srv_serror_rate,disc_dst_host_rerror_rate,disc_dst_host_srv_rerror_rate
0,5,5,3,5,5,5,5,5,5,5,...,2,3,3,3,4,4,4,4,5,5
1,5,3,3,5,5,5,5,5,5,5,...,5,1,1,5,5,4,4,4,5,5
2,5,3,3,5,5,5,5,5,5,5,...,5,3,3,4,3,4,5,5,5,5
3,5,4,5,5,5,5,5,5,5,5,...,1,5,5,2,4,5,4,4,5,5
4,5,4,4,5,5,5,5,5,5,5,...,5,5,5,2,3,4,4,4,5,5


In [91]:
for name in list(df_nsl_num):
    print(name, df_nsl_num[name].describe())

('duration', count     1011.000000
mean       311.960435
std       2554.352933
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      41285.000000
Name: duration, dtype: float64)
('src_bytes', count    1.011000e+03
mean     7.874637e+03
std      1.754888e+05
min      0.000000e+00
25%      0.000000e+00
50%      4.100000e+01
75%      2.675000e+02
max      5.131424e+06
Name: src_bytes, dtype: float64)
('dst_bytes', count      1011.000000
mean       1449.090010
std        5854.309216
min           0.000000
25%           0.000000
50%           0.000000
75%         537.500000
max      148722.000000
Name: dst_bytes, dtype: float64)
('land', count    1011.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: land, dtype: float64)
('wrong_fragment', count    1011.000000
mean        0.033630
std         0.312854
min         0.000000
25%         0.000000
50%         0.000000
75%         0.0

In [87]:
df_nsl['duration'].describe()

count     1011.000000
mean       311.960435
std       2554.352933
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      41285.000000
Name: duration, dtype: float64

In [59]:
enc = preprocessing.OneHotEncoder()

In [60]:
enc.fit(df_nsl_disc)

ValueError: Found array with 0 feature(s) (shape=(1011, 0)) while a minimum of 1 is required.