# Download Datset and Understand the Format

In [12]:
import pandas as pd
# Collect the data from the zipped files
df_training = pd.read_csv('kddcup.data.gz', header=None)
df_testing = pd.read_csv('corrected.gz', header=None)
df_training_10 = pd.read_csv('kddcup.data_10_percent.gz', header=None)

In [13]:
# Split the data from labels
trlabels = df_training.iloc[:, 41].values
tslabels = df_testing.iloc[:, 41].values

training = df_training.drop(df_training.columns[41], axis=1)
testing = df_testing.drop(df_testing.columns[41], axis=1)

# The data after dropping the headers should be of shape (4898431, 41) and (311029, 41)
assert (training.shape == (4898431, 41))
assert (testing.shape == (311029, 41))
print(trlabels)
print(tslabels)

['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']
['normal.' 'normal.' 'normal.' ... 'normal.' 'normal.' 'normal.']


# Convert the categorical values into numeric values.

In [15]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

def cat_to_num(trcolumn, tscolumn):
    """
    Converts 2 categorical columns of the same types into numerical columns

    Args:
        trcolumn (ndarray): ndarray of values of the first column.
        tscolumn (ndarray): ndarray of values of the second column.

    Returns:
        tuple: a tuple of 2 ndarrays
    """
    encoder = LabelEncoder()
    categories = set(np.unique(trcolumn)).union(set(np.unique(tscolumn)))
    encoder.fit(list(categories))
    return encoder.transform(trcolumn), encoder.transform(tscolumn)

In [16]:
# Copy the data into another dataframe to convert its categorical values into numerical.
num_training = training.copy()
num_testing = testing.copy()

# Convert the categorical features.
for i in range(1, 4):
    values = cat_to_num(num_training.iloc[:, i].values, num_testing.iloc[:, i].values)
    num_training.isetitem(i, values[0])
    num_testing.isetitem(i, values[1])

# Convert the labels.
num_trlabels, num_tslabels = cat_to_num(trlabels, tslabels)

The data is now available in two forms:

Form One (Categorical):

training

*   trlabels
*   tslabels
*   training
*   testing

Form Two (Numerical):

*   trlabels
*   tslabels
*   training
*   testing



# Clustering Using K-Means 

In [17]:
def k_means(X, k, epsilon):
    
    n_samples, n_features = X.shape
    
    # Randomly choose k data points as the initial centroids
    centroids = X[np.random.choice(n_samples, k, replace=False)]
    distances = np.zeros((n_samples, k))
    labels = np.zeros(n_samples)
    old_centroids = np.zeros((k, n_features))
    
    # Continue until the centroids don't change by more than epsilon
    while np.linalg.norm(centroids - old_centroids) > epsilon:
        old_centroids = centroids.copy()
        
        # Calculate the Euclidean distances from each sample to each centroid
        for i in range(k):
            distances[:, i] = np.linalg.norm(X - centroids[i], axis=1)
        
        # Assign each sample to the nearest centroid
        labels = np.argmin(distances, axis=1)
        
        # Update the centroids to be the mean of the samples assigned to them
        for i in range(k):
            X_i = X[labels == i]
            if len(X_i) == 0:
                centroids[i] = old_centroids[i]
            else:
                centroids[i] = np.mean(X_i, axis=0)
        
    return centroids

In [13]:
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment

labels = []
train = np.array(num_training)
test = np.array(num_testing)
results = []
for K in [7]:
    centroids = k_means(train, K, 0.001)
    print(pd.DataFrame(centroids).shape)
    distances = np.linalg.norm(test[:, np.newaxis, :] - centroids, axis=2)
    labels = np.argmin(distances, axis=1)
    print(labels)
    contingency = contingency_matrix(num_tslabels, labels)
    row_ind, col_ind = linear_sum_assignment(-contingency)
    y_pred = np.zeros_like(labels)
    for i, j in zip(row_ind, col_ind):
        y_pred[labels == j] = i
    results.append(y_pred)
    print(results)


(7, 41)
[5 5 5 ... 5 5 5]
[array([0, 0, 0, ..., 0, 0, 0], dtype=int64)]


# Normalized Cut

In [26]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split as tts
import numpy as np

np.random.seed(42)


def vecsort(vectors, values):
    """
    Sorts vectors based on values.

    Args:
        vectors (nparray): nparray of vectors to be sorted.
        values (nparray): nparray of values to be used to sort.

    Returns:
        nparray: nparray of sorted vectors with respect to values.
    """
    return vectors[:, np.argsort(values)[::]]


def norm(data):
    """
    Normalizes a data matrix.

    Args:
        data (nparray): array of numbers.

    Returns:
        normalized: nparray of the normalized array result after normalizing the data matrix.
    """
    normalized = []
    for row in data:
        normalized.append(row / np.linalg.norm(row))
    return np.array(normalized)


def ncut(training_data, k):
    """
    Splits the data into a training set and testing set with ratio 0.5% for training dataset, then applies the normalized cut algorithm on the reduced training dataset.

    Args:
        data (pd.DataFrame): pd.DataFrame containing the original dataset.
        k (int): number of clusters.

    Returns:
        nparray: nparray of labels after applying the normalized cut algorithm.
    """
    training, testing = tts(training_data, random_state=42, train_size=0.0015)

    # Convert the data into numpy arrays
    training = np.array(training)
    testing = np.array(testing)

    # Construct the similarity graph
    S = cosine_similarity(training)

    # Construct the degree matrix
    degrees = np.sum(S, axis=1)
    D = np.diag(degrees)

    # Compute Laplacian Matrix
    L = D - S

    # Compute sorted eigenvectors of the Laplacian Matrix then normalize them
    values, vectors = np.linalg.eigh(L)
    eigvectors = vecsort(vectors, values)

    normalized = norm(eigvectors[:, :k])

    # Perform K-means clustering on eigenvectors
    centroids = k_means(normalized, k, 0.01)
    distances = np.linalg.norm(normalized[:, np.newaxis, :] - centroids, axis=2)
    labels = np.argmin(distances, axis=1)

    return labels


# K-Means VS Normalized Cut

In [2]:
kmeans_clustering = pd.DataFrame(k_means(np.array(num_training), 11, 0.01))

# print(kmeans_clustering)
# print(ncut_clustering)

NameError: name 'pd' is not defined

In [28]:
ncut_clustering = pd.DataFrame(ncut(num_training, 11))
print(ncut_clustering)

(11, 11)
      0
0     2
1     2
2     2
3     7
4     2
...  ..
7342  9
7343  2
7344  2
7345  7
7346  2

[7347 rows x 1 columns]


# DBSCAN

In [29]:
vis = []
my_dict = {}
def dbscan(data, eps, min_samples):
    X = data.values
    global vis
    global my_dict
    vis = [0] * len(X)
    my_dict = {i: [] for i in range(len(X))}
    labels = [0] * len(X)
    cluster_id = 0
    for i in range(len(X)):
        if labels[i] != 0:
            continue            
        # Find all neighbors of the current point within eps distance
        neighbors = get_neighbors(X, i, eps)       
        # If the point is not a core point, mark it as an outlier
        if len(neighbors) < min_samples:
            labels[i] = -1
            continue       
        # Expand the cluster starting from the current core point
        cluster_id += 1
        labels[i] = cluster_id
        
        expand_cluster(X, labels, i, neighbors, eps, min_samples, cluster_id)
    
    return labels

def expand_cluster(X, labels, i, neighbors, eps, min_samples, cluster_id):
    """
    This function expands cluster for the ith 
    """
    # Loop over each neighbor of the core point
    for j in neighbors:
        if labels[j] == -1:
            labels[j] = cluster_id
        elif labels[j] == 0:
            labels[j] = cluster_id        
            # Find all neighbors of the current point within eps distance
            new_neighbors = get_neighbors(X, j, eps)        
            # If the point is a core point, add its neighbors to the list of neighbors
            if len(new_neighbors) >= min_samples:
                neighbors += new_neighbors

def get_neighbors(X, i, eps):
    """
    Thsis functions gets the neighbour for ith instance within given epsilon
    """
    global vis
    global my_dict
    if vis[i] == 1:
        return my_dict[i]
    neighbors = []
    for j in range(len(X)):
        if i == j:
            continue
            
        dist = np.linalg.norm(X[i] - X[j])
        if dist <= eps:
            neighbors.append(j)
    vis[i] = 1
    my_dict[i] = neighbors
    return neighbors

In [31]:
from sklearn.model_selection import train_test_split
# df_copy = train_data.drop(train_data.columns[-1], axis=1)
# df_copy.std()
train_data_copy, test_data = train_test_split(num_training, test_size=0.95, random_state=42)
train_labels = np.array(train_data_copy.iloc[:, -1])
print(f"Training set size: {len(train_data_copy)}")

Training set size: 244921


In [33]:
pred_labels = dbscan(train_data_copy, 15, 3)
print(pred_labels)

MemoryError: 

# Evaluation

In [None]:
pred_labels = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
true_labels = [5, 5, 5, 5, 5, 6, 5, 7, 6, 6, 6, 6, 7, 5, 5, 7, 7, 7]

In [None]:
def clusterize(pred_labels, true_labels):
    if len(pred_labels) != len(true_labels):
        raise ValueError("The two list should be equal")
    clusters_set = set(pred_labels)
    # num_clusters = len(set(pred_labels))
    clusters = {}
    for cluster in clusters_set:
        clusters[cluster] = []
    for i in range(len(pred_labels)):
        clusters[pred_labels[i]].append(true_labels[i])
    return clusters, clusters_set

In [None]:
clusterize(pred_labels, true_labels)

({0: [5, 5, 5, 5, 5, 6], 1: [5, 7, 6, 6, 6, 6, 7], 2: [5, 5, 7, 7, 7]},
 {0, 1, 2})

In [None]:
def precision(pred_labels, true_labels):
    clusters, clusters_set = clusterize(pred_labels, true_labels)
    res = 0
    for cluster in clusters_set:
        most_common = max(set(clusters[cluster]), key = clusters[cluster].count)
        count = clusters[cluster].count(most_common)
        res += (len(clusters[cluster]) / len(true_labels)) * (count / len(clusters[cluster]))
    return res

In [None]:
precision(pred_labels, true_labels)

0.6666666666666666

In [None]:
def recall(pred_labels, true_labels):
    clusters, clusters_set = clusterize(pred_labels, true_labels)
    res = 0
    r = len(clusters_set)
    for cluster in clusters_set:
        most_common = max(set(clusters[cluster]), key = clusters[cluster].count)
        count = clusters[cluster].count(most_common)
        count_total = true_labels.count(most_common)
        res += (len(clusters[cluster]) / len(true_labels)) * (count / count_total)
    return res

In [None]:
recall(pred_labels, true_labels)

0.686111111111111

In [None]:
def f1(pred_labels, true_labels):
    clusters, clusters_set = clusterize(pred_labels, true_labels)
    res = 0
    r = len(clusters_set)
    for cluster in clusters_set:
        most_common = max(set(clusters[cluster]), key = clusters[cluster].count)
        count = clusters[cluster].count(most_common)
        count_total = true_labels.count(most_common)
        precision =  count / len(clusters[cluster])
        recall = count / count_total
        f1 = (2 * precision * recall) / (precision + recall)
        print(f"cluster: {cluster} pre: {precision}   rec: {recall}   f1: {f1}")
        res += (float(f1) / float(r))
    return res

In [None]:
f1(pred_labels, true_labels)

cluster: 0 pre: 0.8333333333333334   rec: 0.625   f1: 0.7142857142857143
cluster: 1 pre: 0.5714285714285714   rec: 0.8   f1: 0.6666666666666666
cluster: 2 pre: 0.6   rec: 0.6   f1: 0.6


0.6603174603174603

In [None]:
from math import log2
def conditional_entropy(pred_labels, true_labels):
    clusters, clusters_set = clusterize(pred_labels, true_labels)
    res = 0
    true_labels_set = set(true_labels)
    for cluster in clusters_set:
        temp = 0
        for t in true_labels_set:
            t_count = clusters[cluster].count(t)
            if t_count != 0:
                temp += -(t_count / len(clusters[cluster])) * log2(t_count / len(clusters[cluster]))
        res += (len(clusters[cluster]) / len(true_labels)) * temp
    return res

In [None]:
conditional_entropy(pred_labels, true_labels)

1.022576219809261