# Dataset Processing

## Read the training and testing data and save them into pandas dataframes.

In [20]:
import pandas as pd

# Load both gzip-compressed CSV archives; neither file has a header row,
# so pandas labels the columns 0, 1, 2, ...
df_training, df_testing = (
    pd.read_csv(archive, header=None)
    for archive in ('kddcup.data.gz', 'corrected.gz')
)

In [22]:
# Column 41 is the last of the 42 columns and carries the class label;
# pull it out of each frame as a plain array of values.
trlabels, tslabels = (frame.iloc[:, 41].values for frame in (df_training, df_testing))

# Everything except the label column is kept as the feature set.
training = df_training.drop(columns=df_training.columns[41])
testing = df_testing.drop(columns=df_testing.columns[41])

# Sanity check: with the label column removed, the shapes should be
# (4898431, 41) for training and (311029, 41) for testing.
assert training.shape == (4898431, 41)
assert testing.shape == (311029, 41)
print(trlabels, tslabels, sep='\n')

['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']
['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']


## Convert the categorical values into numeric values.

In [35]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

def cat_to_num(trcolumn, tscolumn):
    """
    Encode two categorical columns with one shared label encoding.

    A single encoder is fitted on the union of the distinct values found in
    both columns, so a given category is mapped to the same number in both
    outputs, including categories that occur in only one of the columns.

    Args:
        trcolumn (ndarray): values of the first (training) column.
        tscolumn (ndarray): values of the second (testing) column.

    Returns:
        tuple: (encoded trcolumn, encoded tscolumn), a pair of ndarrays.
    """
    # Vocabulary = every distinct value seen in either column.
    vocabulary = {*np.unique(trcolumn), *np.unique(tscolumn)}
    shared_encoder = LabelEncoder().fit(list(vocabulary))
    return tuple(shared_encoder.transform(column) for column in (trcolumn, tscolumn))


In [39]:
# Work on copies so the categorical originals (`training`, `testing`) stay untouched.
num_training, num_testing = training.copy(), testing.copy()

# Columns 1 through 3 are the categorical features; encode each one with a
# mapping shared by the training and testing frames.
for col_pos in (1, 2, 3):
    train_codes, test_codes = cat_to_num(
        num_training.iloc[:, col_pos].values,
        num_testing.iloc[:, col_pos].values,
    )
    num_training.isetitem(col_pos, train_codes)
    num_testing.isetitem(col_pos, test_codes)

# The labels are categorical as well; encode them with the same helper.
(num_trlabels, num_tslabels) = cat_to_num(trlabels, tslabels)

The data is now available in two forms:
* Form One (Categorical):
    * training
    * testing
    * trlabels
    * tslabels

* Form Two (Numerical):
    * num_training
    * num_testing
    * num_trlabels
    * num_tslabels