# Clustering of hosts using K-means algorithm 

We will use k-means to cluster **all** hosts from the subnets and evaluate the clustering performance.

### Imports

In [1]:
import pandas as pd
import numpy as np

### Import dataset

We have a preprocessed pandas dataframe consisting of anonymized host behaviour. Each host belongs to a certain **unit** within the subnet.

In [2]:
# Load the preprocessed host-behaviour dataframe from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
df = pd.read_pickle('balanced_preprocessed_dataset.pkl')
# Display the first rows as the cell output.
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [3]:
# Distinct unit names present in the ('Label', 'Label', 'unit') column.
df['Label', 'Label', 'unit'].unique()

array(['CEITEC', 'CESNET', 'CTT', 'ESF', 'FF', 'FI', 'FIeduroamnaFI',
       'FNBrno', 'FSS', 'FSpS', 'IBA', 'LF', 'PedF', 'PravF', 'PrirF',
       'RMU', 'SKM', 'Teiresias', 'Telc', 'UKB', 'UVT', 'VPN'],
      dtype=object)

In [4]:
# Number of hosts (rows) per unit - shows how unevenly the units are represented.
df['Label', 'Label', 'unit'].value_counts()

UVT              3500
SKM              3246
PrirF            2599
CEITEC           2223
LF               1475
FI               1289
FF               1089
FIeduroamnaFI    1020
FSS               802
PedF              740
UKB               649
ESF               554
PravF             536
RMU               429
FSpS              410
Teiresias         130
Telc               99
IBA                96
VPN                83
CTT                19
FNBrno              7
CESNET              3
Name: (Label, Label, unit), dtype: int64

### Split features and labels

In [5]:
# Features: every column except those under the top-level 'Label' and 'Unit_encoding' groups.
x = df.drop(['Label', 'Unit_encoding'], level=0, axis=1)
# Labels: the 'Unit_encoding' column (presumably an integer code of the host's unit - confirm).
y = df['Unit_encoding']
x.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,14,15,16,17,18,19,20,21,22,23
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.018135,-0.017941,-0.017531,-0.01764,-0.015971,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.018115,-0.017918,-0.017516,-0.017627,-0.015957,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307


## K-means implementation (LB_Keogh as a metric)

In [6]:
from DTW import LB_Keogh
from timeit import default_timer as timer

In [7]:
def k_means(dataframe, print_progress=True, n_iter=10, k=5, r=5,
            random_state=None, distance=None):
    """Cluster the rows of ``dataframe`` with k-means and return their cluster ids.

    Every row is treated as one series. Initial centroids are ``k`` rows
    sampled from ``dataframe``; each iteration assigns every row to its
    closest centroid and then moves each non-empty cluster's centroid to the
    mean of its members. A cluster that receives no rows keeps its previous
    centroid.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per host, one column per series element.
    print_progress : bool, default True
        Print the duration of every iteration.
    n_iter : int, default 10
        Number of assignment/update iterations (there is no convergence test).
    k : int, default 5
        Number of clusters.
    r : int, default 5
        Reach parameter forwarded to the distance function.
    random_state : int or numpy.random.RandomState, optional
        Seed for the initial centroid sample; pass a value to make the
        clustering reproducible. ``None`` keeps the unseeded behaviour.
    distance : callable, optional
        ``distance(series, centroid, series_length, r)`` returning a number.
        Defaults to ``LB_Keogh`` as imported by the notebook.

    Returns
    -------
    list of int
        Cluster index (``0 .. k-1``) assigned to each row in the last
        iteration, in row order. Empty when ``n_iter`` is 0.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``k`` exceeds the number of rows.

    Notes
    -----
    The final cluster sizes are always printed, regardless of
    ``print_progress``.
    """
    if distance is None:
        distance = LB_Keogh  # looked up at call time from the notebook's DTW import
    size, series_length = dataframe.shape
    data = dataframe.values
    # Float centroids: assigning a mean into an integer array would silently truncate it.
    centroids = dataframe.sample(n=k, random_state=random_state).values.astype(float)
    members = {}      # cluster id -> row positions assigned in the current iteration
    assignments = []  # cluster id of every row in the current iteration
    for iternum in range(n_iter):
        start = timer()
        members = {}
        assignments = []
        for i in range(size):  # for every host in the dataset
            min_dist = np.inf
            cluster = 0
            for c in range(k):  # find the closest centroid; ties keep the lowest index
                dist = distance(data[i], centroids[c], series_length, r)
                if dist < min_dist:
                    min_dist = dist
                    cluster = c
            assignments.append(cluster)
            # Plain list append instead of np.append, which re-copies the whole array each time.
            members.setdefault(cluster, []).append(i)
        for cluster, rows in members.items():  # recompute centroids of non-empty clusters
            centroids[cluster] = np.mean(data[rows], axis=0)
        if print_progress:
            print("Finished iteration number " + str(iternum + 1) + ", time: " + str(timer() - start))
    for cluster, rows in members.items():
        print('Cluster ' + str(cluster) + ', number of assigned hosts: ' + str(len(rows)))
    return assignments

## Apply K-means

In [8]:
# Number of k-means iterations to run.
n_iters = 30
# One cluster per distinct unit, so the clusters can be compared against the unit labels.
num_clusts = df['Label', 'Label', 'unit'].nunique()
# Measure the wall-clock time of the whole clustering run.
start = timer()
# Per-host cluster ids from the custom k-means (progress printing enabled).
predictions_dtw = k_means(x, True, n_iters, num_clusts)
print("K-means finished in: " + str(timer() - start) + " seconds")

Finished iteration number 1, time: 29.295739072000288
Finished iteration number 2, time: 20.8072960100003
Finished iteration number 3, time: 21.882607993999954
Finished iteration number 4, time: 20.865493464000338
Finished iteration number 5, time: 20.537845019000088
Finished iteration number 6, time: 20.274761675999798
Finished iteration number 7, time: 20.271782238000014
Finished iteration number 8, time: 20.25250736099997
Finished iteration number 9, time: 20.34529250500009
Finished iteration number 10, time: 19.83924134000017
Finished iteration number 11, time: 20.386332999999922
Finished iteration number 12, time: 19.95099080099999
Finished iteration number 13, time: 19.97147336299986
Finished iteration number 14, time: 19.649057111000275
Finished iteration number 15, time: 20.549734988999717
Finished iteration number 16, time: 20.139452852999966
Finished iteration number 17, time: 20.07763465900007
Finished iteration number 18, time: 19.954326693999974
Finished iteration number 1

In [9]:
# Hosts per unit (ground truth), for reference next to the cluster sizes printed by k_means.
# NOTE(review): this repeats the earlier value_counts() cell verbatim - confirm whether the
# cluster sizes of predictions_dtw were meant to be shown here instead.
df['Label', 'Label', 'unit'].value_counts()

UVT              3500
SKM              3246
PrirF            2599
CEITEC           2223
LF               1475
FI               1289
FF               1089
FIeduroamnaFI    1020
FSS               802
PedF              740
UKB               649
ESF               554
PravF             536
RMU               429
FSpS              410
Teiresias         130
Telc               99
IBA                96
VPN                83
CTT                19
FNBrno              7
CESNET              3
Name: (Label, Label, unit), dtype: int64

## Compare with sklearn implementation (metric is euclidean distance)

In [10]:
from sklearn.cluster import KMeans

# Same number of clusters as the custom k-means. The seed is fixed so the centroid
# initialisation, and therefore every score computed from this model, is reproducible.
kmeans = KMeans(n_clusters=num_clusts, random_state=0)

In [11]:
# Fit sklearn's k-means on the feature matrix and get the cluster id assigned to each host.
predictions_euclid = kmeans.fit_predict(x)
# Cluster ids together with the number of hosts assigned to each of them.
np.unique(predictions_euclid, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21], dtype=int32),
 array([ 976, 2224,    1,    1,    1,    1,   27,    1,    1,   11,    2,
        1160, 1866,   82, 4941,    1, 2294,    1, 2927,    1, 4478,    1]))

# Performance evaluation

## Adjusted Rand index

In [12]:
from sklearn import metrics

#### With DTW using LB_Keogh as metric

In [13]:
# Ground truth for the evaluation: the unit encoding of every host.
true_labels = y
# Predicted clusters from the custom k-means.
# NOTE: pred_labels is rebound further down for the sklearn result, so these cells must run in order.
pred_labels = predictions_dtw

In [14]:
# Adjusted Rand index of the custom k-means clustering against the true units.
# predictions_dtw is passed directly instead of the shared `pred_labels` name, which a
# later cell rebinds - re-running this cell out of order would otherwise score the wrong clustering.
dtw_score = metrics.adjusted_rand_score(true_labels, predictions_dtw)
dtw_score

0.1432387989094331

#### Using builtin sklearn - euclidean distance metric

In [15]:
# Rebind pred_labels to the sklearn clustering; from here on it no longer refers to predictions_dtw.
pred_labels = predictions_euclid

In [16]:
# Adjusted Rand index of the sklearn clustering against the true units.
# predictions_euclid is passed directly instead of the shared `pred_labels` name, which is
# bound to a different clustering earlier - the score no longer depends on cell execution order.
ed_score = metrics.adjusted_rand_score(true_labels, predictions_euclid)
ed_score

0.12498040314904307

In [17]:
# Positive value: the LB_Keogh-based clustering agrees better with the true units than sklearn's.
print("Score difference: " + str(dtw_score - ed_score))

Score difference: 0.01825839576039004


## Silhouette Coefficient

#### With DTW using LB_Keogh as metric

_Silhouette distances computed with the Euclidean metric (the `silhouette_score` default):_

In [18]:
# Silhouette coefficient of the custom k-means clustering.
# No `metric` argument is passed, so the silhouette uses silhouette_score's default metric,
# not the LB_Keogh distance that produced these clusters.
silhouette_dtw = metrics.silhouette_score(x, predictions_dtw)
silhouette_dtw

0.13180142243859644

_dtw_ — TODO: a silhouette score using the DTW (LB_Keogh) distance is not computed in this notebook.

#### Using builtin sklearn - euclidean distance metric

In [19]:
# Silhouette coefficient of the sklearn k-means clustering, computed on the same feature matrix.
silhouette_euclid = metrics.silhouette_score(x, predictions_euclid)
silhouette_euclid

0.20083985829691794