# Preprocessing

## 1. Acquire the Dataset

In [None]:
# Import the required libraries
import os
import math
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Read the CSV at ../data/kddcup.data.corrected into the DataFrame used by every later cell
df = pd.read_csv('../data/kddcup.data.corrected')

In [None]:
# Preview the first ten rows to confirm the file parsed correctly and that the
# number of columns (features) is as expected
df.head(n=10)

## 2. Inspect the Data

In [None]:
# Record the dataset dimensions (rows = data points, columns = features), then
# list every column's dtype so that mistyped columns can be converted later.
num_of_data_points, num_of_features = df.shape
df.info(verbose=True)

In [None]:
# Several columns were loaded as 'object' or 'int64' although they actually hold
# category labels or boolean flags. Re-type each of them as a pandas categorical.
categorical_columns = [
    'protocol_type',
    'service',
    'flag',
    'land',
    'logged_in',
    'is_host_login',
    'is_guest_login',
    'target',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')

In [None]:
# Print the column dtypes again to confirm the categorical conversion took effect
df.info(verbose=True)

In [None]:
# Summary statistics for the numerical columns. Use them to spot unusual or
# invalid values, e.g. negative durations, byte sizes or counts, or rates above 1.
df.describe()

In [None]:
# Same quick summary, restricted to the columns typed as 'category'
df.describe(include='category')

## 3. Clean Up the Data

In [None]:
# Display every row that contains at least one missing (NaN) value.
# This cell only shows them; an empty result means there is nothing to remove.
rows_with_nan = df.isna().any(axis=1)
df[rows_with_nan]

## 4. Standardize/Categorize Variables

In [None]:
# Distinct values present in the 'protocol_type' column
df['protocol_type'].unique()

In [None]:
# Distinct values present in the 'service' column
df['service'].unique()

In [None]:
# Distinct values present in the 'flag' column
df['flag'].unique()

In [None]:
# Distinct values present in the 'land' column
df['land'].unique()

In [None]:
# Distinct values present in the 'logged_in' column
df['logged_in'].unique()

In [None]:
# Distinct values present in the 'is_host_login' column
df['is_host_login'].unique()

In [None]:
# Distinct values present in the 'is_guest_login' column
df['is_guest_login'].unique()

In [None]:
# Distinct class labels present in the 'target' column
df['target'].unique()

## 5. Analyze Variables (Reduce Dimensions)

In [None]:
# First, it was noticed that the feature 'num_outbound_cmds' contains nothing but
# zeros. Its summary statistics confirm this.
df['num_outbound_cmds'].describe()

In [None]:
# A column that is zero everywhere carries no information, so remove it
df = df.drop(columns='num_outbound_cmds')

In [None]:
# Univariate feature selection over the numerical (non-object, non-category) columns.
# NOTE(review): an earlier note counted 34 numerical/continuous and 7 categorical
# features; re-check those numbers now that 'num_outbound_cmds' has been dropped.
x_numerical = df.select_dtypes(exclude=[object, 'category'])
y = df['target']

# Keep only a fixed share of the numerical features, rounding up.
percentage_of_features = 0.5
num_of_numerical_features = len(x_numerical.columns)
num_of_selected_numerical_features = math.ceil(percentage_of_features * num_of_numerical_features)

# Score every feature with the Analysis of Variance (ANOVA) F-test, which is best
# suited for numerical input and categorical output, and keep the k best.
fs = SelectKBest(score_func=f_classif, k=num_of_selected_numerical_features)
x_numerical_selected = fs.fit_transform(x_numerical, y)

# Display the retained columns, with their names, via the selector's boolean mask
selected_mask = fs.get_support(indices=False)
x_numerical.loc[:, selected_mask]

In [None]:
# Pairwise Pearson correlation between the numerical (non-object, non-category) features
corr = df.select_dtypes(exclude=[object, 'category']).corr(method='pearson')

# The correlation matrix is symmetric, so hide the upper triangle (diagonal included)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(240, 360, as_cmap=True)

# Draw the heatmap on the axes created above, with the mask and a square aspect ratio.
# The colour scale spans the full correlation range [-1, 1]. Capping it at 0.3 would
# paint every correlation above 0.3 in the same saturated colour, hiding the strongly
# correlated (redundant) feature pairs this dimension-reduction step needs to spot.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

## 6. Split the Dataset