# Preprocessing

## 1. Acquire the Dataset

In [1]:
# Import the required libraries
import os
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [2]:
# Load the data set from the CSV file 'kddcup.data.corrected' (path is relative to the
# notebook's working directory) into a pandas DataFrame.
data_as_dataframe = pd.read_csv ('kddcup.data.corrected')

In [3]:
# Preview the first 10 rows to confirm that the file was parsed correctly and that the
# columns (features) look as expected.
data_as_dataframe.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.
5,0,tcp,http,SF,238,1282,0,0,0,0,...,5,1.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,normal.
6,0,tcp,http,SF,235,1337,0,0,0,0,...,6,1.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,normal.
7,0,tcp,http,SF,234,1364,0,0,0,0,...,7,1.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0,normal.
8,0,tcp,http,SF,239,1295,0,0,0,0,...,8,1.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,normal.
9,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.


## 2. Inspect the Data

In [4]:
# Inspect the data type of each feature (verbose=True requests the full per-column summary).
# Columns with an unsuitable dtype can then be converted to a more appropriate one.
data_as_dataframe.info (verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 42 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   duration                     int64  
 1   protocol_type                object 
 2   service                      object 
 3   flag                         object 
 4   src_bytes                    int64  
 5   dst_bytes                    int64  
 6   land                         int64  
 7   wrong_fragment               int64  
 8   urgent                       int64  
 9   hot                          int64  
 10  num_failed_logins            int64  
 11  logged_in                    int64  
 12  num_compromised              int64  
 13  root_shell                   int64  
 14  su_attempted                 int64  
 15  num_root                     int64  
 16  num_file_creations           int64  
 17  num_shells                   int64  
 18  num_access_files             int64  
 19  

In [5]:
# Several columns are loaded as 'object' or 'int64' although they really hold
# category labels or 0/1 flags. Convert them to the 'category' dtype.
# The columns are listed once and converted in a loop instead of repeating the
# same assignment for every column.
categorical_columns = [
    'protocol_type', 'service', 'flag',                       # string labels
    'land', 'logged_in', 'is_host_login', 'is_guest_login',   # 0/1 flags
    'target',                                                 # class label
]
for column_name in categorical_columns:
    data_as_dataframe[column_name] = data_as_dataframe[column_name].astype('category')

In [6]:
# Print the column summary again to verify that the converted columns now report
# the 'category' dtype.
data_as_dataframe.info (verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 42 columns):
 #   Column                       Dtype   
---  ------                       -----   
 0   duration                     int64   
 1   protocol_type                category
 2   service                      category
 3   flag                         category
 4   src_bytes                    int64   
 5   dst_bytes                    int64   
 6   land                         category
 7   wrong_fragment               int64   
 8   urgent                       int64   
 9   hot                          int64   
 10  num_failed_logins            int64   
 11  logged_in                    category
 12  num_compromised              int64   
 13  root_shell                   int64   
 14  su_attempted                 int64   
 15  num_root                     int64   
 16  num_file_creations           int64   
 17  num_shells                   int64   
 18  num_access_files      

In [7]:
# Check the summary statistics for unusual or invalid values, e.g. negative values for a
# duration, byte size or count, or values above 1 for a rate.
data_as_dataframe.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,...,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0
mean,48.34243,1834.621,1093.623,0.0006487792,7.961733e-06,0.01243766,3.205108e-05,0.008088304,6.81851e-05,3.674646e-05,...,232.9811,189.2142,0.7537132,0.03071111,0.605052,0.006464107,0.1780911,0.1778859,0.0579278,0.05765941
std,723.3298,941431.1,645012.3,0.04285434,0.007215084,0.4689782,0.007299408,3.856481,0.008257146,0.008082432,...,64.02094,105.9128,0.411186,0.1085432,0.4809877,0.04125978,0.3818382,0.3821774,0.2309428,0.2309777
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,49.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,1379964000.0,1309937000.0,3.0,14.0,77.0,5.0,7479.0,1.0,2.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Quick scan of the categorical columns only: number of rows, number of distinct values,
# most frequent value and its frequency.
data_as_dataframe.describe(include = 'category')

Unnamed: 0,protocol_type,service,flag,land,logged_in,is_host_login,is_guest_login,target
count,4898431,4898431,4898431,4898431,4898431,4898431,4898431,4898431
unique,3,70,11,2,2,2,2,23
top,icmp,ecr_i,SF,0,0,0,0,smurf.
freq,2833545,2811660,3744328,4898403,4195364,4898429,4894340,2807886


## 3. Clean Up the Data

In [9]:
# Display every row that contains at least one NaN value.
# An empty result means there are no such rows and nothing has to be removed.
has_missing_value = data_as_dataframe.isna().any(axis=1)
data_as_dataframe[has_missing_value]

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target


## 4. Standardize/Categorize Variables

In [10]:
# List the distinct values of the 'protocol_type' feature.
data_as_dataframe['protocol_type'].unique()

['tcp', 'udp', 'icmp']
Categories (3, object): ['tcp', 'udp', 'icmp']

In [11]:
# List the distinct values of the 'service' feature.
data_as_dataframe['service'].unique()

['http', 'smtp', 'domain_u', 'auth', 'finger', ..., 'aol', 'tftp_u', 'http_8001', 'tim_i', 'red_i']
Length: 70
Categories (70, object): ['http', 'smtp', 'domain_u', 'auth', ..., 'tftp_u', 'http_8001', 'tim_i', 'red_i']

In [12]:
# List the distinct values of the 'flag' feature.
data_as_dataframe['flag'].unique()

['SF', 'S2', 'S1', 'S3', 'OTH', ..., 'RSTO', 'S0', 'RSTR', 'RSTOS0', 'SH']
Length: 11
Categories (11, object): ['SF', 'S2', 'S1', 'S3', ..., 'S0', 'RSTR', 'RSTOS0', 'SH']

In [14]:
# List the distinct values of the 'land' feature.
data_as_dataframe['land'].unique()

[0, 1]
Categories (2, int64): [0, 1]

In [15]:
# List the distinct values of the 'logged_in' feature.
data_as_dataframe['logged_in'].unique()

[1, 0]
Categories (2, int64): [1, 0]

In [16]:
# List the distinct values of the 'is_host_login' feature.
data_as_dataframe['is_host_login'].unique()

[0, 1]
Categories (2, int64): [0, 1]

In [17]:
# List the distinct values of the 'is_guest_login' feature.
data_as_dataframe['is_guest_login'].unique()

[0, 1]
Categories (2, int64): [0, 1]

In [19]:
# List the distinct class labels found in the 'target' column.
data_as_dataframe['target'].unique()

['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.', ..., 'multihop.', 'warezmaster.', 'warezclient.', 'spy.', 'rootkit.']
Length: 23
Categories (23, object): ['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', ..., 'warezmaster.', 'warezclient.', 'spy.', 'rootkit.']

## 5. Analyze Variables (Reduce Dimensions)

In [23]:
# First, it was noticed that the feature named 'num_outbound_cmds' contains nothing but zero values.
# The summary statistics are printed here to confirm it.
data_as_dataframe['num_outbound_cmds'].describe()

count    4898431.0
mean           0.0
std            0.0
min            0.0
25%            0.0
50%            0.0
75%            0.0
max            0.0
Name: num_outbound_cmds, dtype: float64

In [24]:
# 'num_outbound_cmds' is constant, so it carries no information: drop it.
# errors='ignore' keeps this cell re-runnable: executing it a second time is a
# no-op instead of raising KeyError because the column is already gone.
data_as_dataframe = data_as_dataframe.drop(columns=['num_outbound_cmds'], errors='ignore')

In [25]:
# Univariate feature selection: score each numeric feature against the class
# label with f_classif and keep the best-scoring ones.
NUM_FEATURES_TO_KEEP = 20
fs = SelectKBest(score_func=f_classif, k=NUM_FEATURES_TO_KEEP)
# Only numeric columns are candidates; object/category columns are excluded.
x_numerical = data_as_dataframe.select_dtypes(exclude=[object,'category'])
y = data_as_dataframe['target']
# NOTE(review): the selector is fitted on the full data set, i.e. before the
# split in section 6. Consider fitting it on the training portion only, so that
# no information from the held-out rows influences the selection.
X_selected = fs.fit_transform(x_numerical, y)
# By default fit_transform returns a plain array without column labels, so the
# names of the columns that were kept are recorded separately.
selected_feature_names = x_numerical.columns[fs.get_support()]
print(X_selected.shape)

(4898431, 20)


## 6. Split the Dataset