# Dataset Processing

## Read the training and testing data and save them into pandas dataframes.

In [20]:
import pandas as pd

# Load both archives in one pass. header=None makes pandas treat the first line
# as data and label the columns with integers instead of names.
df_training, df_testing = (
    pd.read_csv(archive, header=None)
    for archive in ('kddcup.data.gz', 'corrected.gz')
)

In [22]:
# The column at position 41 holds the label; extract it as an array per split.
trlabels = df_training.iloc[:, 41].values
tslabels = df_testing.iloc[:, 41].values

# Keep every column except the label column as the feature frames.
training = df_training.drop(columns=df_training.columns[41])
testing = df_testing.drop(columns=df_testing.columns[41])

# With the label column removed, the feature frames should have
# shape (4898431, 41) and (311029, 41) respectively.
assert training.shape == (4898431, 41)
assert testing.shape == (311029, 41)
print(trlabels, tslabels, sep='\n')

['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']
['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']


## Convert the categorical values into numeric values.

In [35]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

def cat_to_num(trcolumn, tscolumn):
    """
    Encode two categorical columns as integers using one shared mapping.

    A single encoder is fitted on the union of the categories found in both
    columns, so a given category receives the same code in each result.

    Args:
        trcolumn (ndarray): values of the first column.
        tscolumn (ndarray): values of the second column.

    Returns:
        tuple: the encoded first column and the encoded second column,
            in that order.
    """
    seen_in_first = set(np.unique(trcolumn))
    seen_in_second = set(np.unique(tscolumn))

    # Fit once on every category from either column, then reuse the encoder.
    shared_encoder = LabelEncoder().fit(list(seen_in_first | seen_in_second))

    encoded_first = shared_encoder.transform(trcolumn)
    encoded_second = shared_encoder.transform(tscolumn)
    return encoded_first, encoded_second


In [39]:
# Work on copies so the original (categorical) frames stay untouched.
num_training = training.copy()
num_testing = testing.copy()

# Encode the categorical feature columns (positions 1 to 3), using one mapping
# per column that is shared by the training and testing frames.
for col in (1, 2, 3):
    encoded_train, encoded_test = cat_to_num(
        num_training.iloc[:, col].values,
        num_testing.iloc[:, col].values,
    )
    num_training.isetitem(col, encoded_train)
    num_testing.isetitem(col, encoded_test)

# Encode the labels with a mapping shared by both label arrays.
num_trlabels, num_tslabels = cat_to_num(trlabels, tslabels)

The data is now available in two forms:
* Form One (Categorical):
    * training
    * testing
    * trlabels
    * tslabels

* Form Two (Numerical):
    * num_training
    * num_testing
    * num_trlabels
    * num_tslabels

## Clustering Using K-Means

In [None]:
import numpy as np

def k_means(X, k, epsilon, max_iter=None):
    """
    Cluster the rows of X into k groups with Lloyd's k-means algorithm.

    Args:
        X (array-like): 2-D numeric data of shape (n_samples, n_features).
            It is converted to a float array, so integer input no longer
            causes the centroid means to be truncated.
        k (int): number of clusters; must not exceed n_samples.
        epsilon (float): convergence threshold. Iteration stops once the
            Frobenius norm of the change in the centroid matrix between two
            consecutive iterations is at most epsilon.
        max_iter (int, optional): upper bound on the number of iterations.
            None (the default) means iterate until convergence.

    Returns:
        ndarray: float array of shape (k, n_features) holding the centroids.

    Raises:
        ValueError: raised by np.random.choice when k is larger than the
            number of samples.
    """
    X = np.asarray(X, dtype=float)
    n_samples, n_features = X.shape

    # Randomly choose k distinct data points as the initial centroids.
    # Indexing with an index array yields a copy, so X is never modified.
    centroids = X[np.random.choice(n_samples, k, replace=False)]
    distances = np.empty((n_samples, k))

    n_iter = 0
    # Convergence is tested at the END of each pass so that at least one
    # update always runs, even when the initial centroids lie within epsilon
    # of the origin.
    while max_iter is None or n_iter < max_iter:
        old_centroids = centroids.copy()

        # Euclidean distance from every sample to each centroid, computed one
        # centroid at a time to keep the temporary arrays at (n_samples, n_features).
        for i in range(k):
            distances[:, i] = np.linalg.norm(X - centroids[i], axis=1)

        # Assign each sample to its nearest centroid.
        labels = np.argmin(distances, axis=1)

        # Move each centroid to the mean of its members; a centroid that
        # attracted no samples keeps its previous position.
        for i in range(k):
            members = X[labels == i]
            if len(members) > 0:
                centroids[i] = members.mean(axis=0)

        n_iter += 1
        if np.linalg.norm(centroids - old_centroids) <= epsilon:
            break

    return centroids

In [None]:
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
# NOTE(review): train_data, test_data and y_test are not defined in any cell
# visible above (the preprocessing cells produce num_training / num_testing /
# num_tslabels) -- confirm they exist when the notebook is run on a fresh kernel.
# k_means draws its initial centroids from NumPy's global RNG, so the results
# differ between runs unless the RNG is seeded beforehand.

# Drop the last column of each frame (presumably the label column -- TODO confirm).
# Each frame is indexed through its own columns.
train = np.array(train_data.drop(train_data.columns[-1], axis=1))
test = np.array(test_data.drop(test_data.columns[-1], axis=1))

# contingency_matrix orders its rows by the sorted unique true labels, so row
# position r corresponds to true_classes[r].
true_classes = np.unique(y_test)

results = []
for K in [7, 15, 23, 31, 45]:
    centroids = k_means(train, K, 0.001)

    # Distance of every test sample to each centroid, one centroid at a time:
    # broadcasting all at once would allocate an (n_test, K, n_features) array.
    distances = np.stack(
        [np.linalg.norm(test - centroid, axis=1) for centroid in centroids],
        axis=1,
    )
    labels = np.argmin(distances, axis=1)

    # Columns of the contingency matrix follow the sorted cluster ids that
    # actually occur in `labels`; clusters with no test sample get no column.
    contingency = contingency_matrix(y_test, labels)
    cluster_ids = np.unique(labels)

    # Start with each cluster mapped to its majority class, then override with
    # the optimal one-to-one matching. Clusters left unmatched (more clusters
    # than classes) keep their majority class instead of defaulting to class 0.
    cluster_to_class = true_classes[np.argmax(contingency, axis=0)]
    row_ind, col_ind = linear_sum_assignment(-contingency)
    cluster_to_class[col_ind] = true_classes[row_ind]

    # Translate each sample's cluster id to its matrix column, then to a class.
    y_pred = cluster_to_class[np.searchsorted(cluster_ids, labels)]
    results.append(y_pred)

# Show the predictions once, after all values of K have been processed.
print(results)

## Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score ,mutual_info_score
# Values of K in the same order as the entries of `results`.
K_range = [7, 15, 23, 31, 45]

# Entropy H(T) of the true labels in nats (mutual_info_score also uses the
# natural logarithm). Every count returned by np.unique is positive, so the
# logarithm is always defined.
_, class_counts = np.unique(y_test, return_counts=True)
class_probs = class_counts / class_counts.sum()
label_entropy = -np.sum(class_probs * np.log(class_probs))

for K, y_pred in zip(K_range, results):
    print("K",K)
    # scikit-learn metrics take (y_true, y_pred) in that order; swapping them
    # exchanges the roles of precision and recall.
    precision = precision_score(y_test, y_pred, average='weighted')
    print("precision",precision)
    recall = recall_score(y_test, y_pred, average='weighted')
    print("recall",recall)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("f1_score",f1)
    # Conditional entropy H(T | prediction) = H(T) - I(T; prediction).
    # Clamp at zero to absorb floating-point round-off.
    conditional_entropy = max(0.0, label_entropy - mutual_info_score(y_test, y_pred))
    print("conditional_entropy",conditional_entropy)