# Dataset Processing

## Read the training and testing data and save them into pandas dataframes.

In [1]:
import pandas as pd

# Collect the data from the zipped files
df_training = pd.read_csv('kddcup.data.gz', header=None)
df_testing = pd.read_csv('corrected.gz', header=None)

In [2]:
# Split the data from labels
trlabels = df_training.iloc[:, 41].values
tslabels = df_testing.iloc[:, 41].values

training = df_training.drop(df_training.columns[41], axis=1)
testing = df_testing.drop(df_testing.columns[41], axis=1)

# The data after dropping the headers should be of shape (4898431, 41) and (311029, 41)
assert (training.shape == (4898431, 41))
assert (testing.shape == (311029, 41))
print(trlabels)
print(tslabels)

['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']
['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']


## Convert the categorical values into numeric values.

In [3]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

def cat_to_num(trcolumn, tscolumn):
    """
    Converts 2 categorical columns of the same types into numerical columns

    Args:
        trcolumn (ndarray): ndarray of values of the first column.
        tscolumn (ndarray): ndarray of values of the second column.

    Returns:
        tuple: a tuple of 2 ndarrays
    """
    encoder = LabelEncoder()
    categories = set(np.unique(trcolumn)).union(set(np.unique(tscolumn)))
    encoder.fit(list(categories))
    return encoder.transform(trcolumn), encoder.transform(tscolumn)


In [4]:
# Copy the data into another dataframe to convert its categorical values into numerical.
num_training = training.copy()
num_testing = testing.copy()

# Convert the categorical features.
for i in range(1, 4):
    values = cat_to_num(num_training.iloc[:, i].values, num_testing.iloc[:, i].values)
    num_training.isetitem(i, values[0])
    num_testing.isetitem(i, values[1])

# Convert the labels.
num_trlabels, num_tslabels = cat_to_num(trlabels, tslabels)

The data is now available in two forms:
* Form One (Categorical):
    * training
    * testing
    * trlabels
    * tslabels

* Form Two (Numerical):
    * training
    * testing
    * trlabels
    * tslabels

# Normalized Cut

In [6]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split as tts
import numpy as np


def vecsort(vectors, values):
    """
    Sorts vectors based on values.

    Args:
        vectors (nparray): nparray of vectors to be sorted.
        values (nparray): nparray of values to be used to sort.

    Returns:
        nparray: nparray of sorted vectors with respect to values.
    """
    return vectors[:, np.argsort(values)[::-1]]


def sim_mat(data, sigma=1.0):
    """
    Constructs the similiarity graph of a given dataset.

    Args:
        data (ndarray): ndarray of the dataset.
        sigma (float, optional): Defaults to 1.0.

    Returns:
        ndarray: the similarity graph.
    """
    print(data)
    n = data.shape[0]
    S = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            diff = data[i, :] - data[j, :]
            S[i, j] = np.exp(-np.dot(diff, diff) / (2 * sigma ** 2))
            S[j, i] = S[i, j]
    return S


def ncut(data, k, sigma=1.0):
    """
    Splits the data into a training set and testing set with ratio 0.5% for training dataset, then applies the normalized cut algorithm on the reduced training dataset.

    Args:
        data (pd.DataFrame): pd.DataFrame containing the original dataset.
        k (int): number of clusters.
        sigma (float, optional): Defaults to 1.0.

    Returns:
        nparray: nparray of labels after applying the normalized cut algorithm.
    """
    training = tts(data, random_state=42, train_size=0.005)[0]

    training = training.astype('float32')

    # Construct the similarity graph
    S = cosine_similarity(training)

    # Construct the degree matrix
    degrees = np.sum(S, axis=1)
    D = np.diag(degrees)

    # Compute Laplacian Matrix
    L = D - S

    # Compute sorted eigenvectors of the Laplacian Matrix
    values, vectors = np.linalg.eig(L)
    eigvectors = vecsort(vectors, values)

    # Perform K-means clustering on eigenvectors
    # TODO: This should be using the developed kmeans algorithm instead of python's built-in method.
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(eigvectors[:, 1:k])

    return kmeans.labels_


print(ncut(num_training, 23))


MemoryError: Unable to allocate 8.94 GiB for an array with shape (24492, 24492) and data type complex128