# Preprocessing

## 1. Acquire the Dataset

In [None]:
# Import the required libraries
import os
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Read the KDD Cup CSV file from the working directory into a DataFrame.
dataset_path = 'kddcup.data.corrected'
data_as_dataframe = pd.read_csv(dataset_path)

In [None]:
# Display the first 10 rows to confirm the data loaded correctly and that the
# number of columns (features) is as expected.
data_as_dataframe.head(10)

## 2. Inspect the Data

In [None]:
# Inspect the data type of each feature (verbose=True requests the full
# per-column summary). Some columns may be converted to a more appropriate
# dtype later.
data_as_dataframe.info (verbose=True)

In [None]:
# Several columns show up as 'object' or 'int64' even though they actually hold
# category labels or boolean flags. Re-cast each of them to the 'category' dtype.
categorical_columns = (
    'protocol_type',
    'service',
    'flag',
    'land',
    'logged_in',
    'is_host_login',
    'is_guest_login',
    'target',
)
for column_name in categorical_columns:
    data_as_dataframe[column_name] = data_as_dataframe[column_name].astype('category')

In [None]:
# Print the per-column summary again to check that the dtype conversion is correct.
data_as_dataframe.info (verbose=True)

In [None]:
# Check the summary statistics for any unusual or invalid values (e.g., negative
# values for duration, byte size, or count, and values above 1 for rates).
data_as_dataframe.describe()

In [None]:
# Quick scan of the categorical columns only (include='category' restricts the
# summary to columns with the 'category' dtype).
data_as_dataframe.describe(include = 'category')

## 3. Clean Up the Data

In [None]:
# Flag every row that has at least one NaN in any column, then display those rows.
# This cell only shows them; if any appear, they still need to be removed separately.
has_missing_value = data_as_dataframe.isna().any(axis=1)
data_as_dataframe[has_missing_value]

## 4. Standardize/Categorize Variables

In [None]:
# List the distinct values present in the 'protocol_type' column.
data_as_dataframe['protocol_type'].unique()

In [None]:
# List the distinct values present in the 'service' column.
data_as_dataframe['service'].unique()

In [None]:
# List the distinct values present in the 'flag' column.
data_as_dataframe['flag'].unique()

In [None]:
# List the distinct values present in the 'land' column.
data_as_dataframe['land'].unique()

In [None]:
# List the distinct values present in the 'logged_in' column.
data_as_dataframe['logged_in'].unique()

In [None]:
# List the distinct values present in the 'is_host_login' column.
data_as_dataframe['is_host_login'].unique()

In [None]:
# List the distinct values present in the 'is_guest_login' column.
data_as_dataframe['is_guest_login'].unique()

In [None]:
# List the distinct values present in the 'target' (label) column.
data_as_dataframe['target'].unique()

## 5. Analyze Variables (Reduce Dimensions)

In [None]:
# First, it was noticed that the feature named 'num_outbound_cmds' contains nothing but zero values.
# Show its summary statistics to check that.
data_as_dataframe['num_outbound_cmds'].describe()

In [None]:
# Remove the 'num_outbound_cmds' column and rebind the name to the resulting frame.
data_as_dataframe = data_as_dataframe.drop(labels=['num_outbound_cmds'], axis='columns')

In [None]:
# Separate the numeric predictors (every column that is neither object nor
# category dtype) from the class label.
x_numerical = data_as_dataframe.select_dtypes(exclude=[object, 'category'])
y = data_as_dataframe['target']

# Score each numeric feature against the label with f_classif and keep the
# 20 highest-scoring features.
fs = SelectKBest(f_classif, k=20)
X_selected = fs.fit_transform(x_numerical, y)

# Show the shape of the reduced feature matrix (the second dimension should be 20).
print(X_selected.shape)

## 6. Split the Dataset