# Clustering of hosts using K-means algorithm 

We will use k-means to cluster hosts from **2** subnets and evaluate the clustering performance.  

### Imports

In [1]:
import pandas as pd
import numpy as np

### Import dataset

We have a preprocessed pandas dataframe consisting of anonymized host behaviour. Each host belongs to a certain **unit** within the subnet.

In [2]:
# Load the preprocessed host-behaviour dataset and preview the first rows.
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary
# code — only load pickles that come from a trusted source.
df = pd.read_pickle('balanced_preprocessed_dataset.pkl')
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [3]:
# List the distinct unit names found in the ('Label', 'Label', 'unit') column.
df['Label', 'Label', 'unit'].unique()

array(['CEITEC', 'CESNET', 'CTT', 'ESF', 'FF', 'FI', 'FIeduroamnaFI',
       'FNBrno', 'FSS', 'FSpS', 'IBA', 'LF', 'PedF', 'PravF', 'PrirF',
       'RMU', 'SKM', 'Teiresias', 'Telc', 'UKB', 'UVT', 'VPN'],
      dtype=object)

In [4]:
# Count rows per unit to see how the dataset is distributed across units.
df['Label', 'Label', 'unit'].value_counts()

UVT              3500
SKM              3246
PrirF            2599
CEITEC           2223
LF               1475
FI               1289
FF               1089
FIeduroamnaFI    1020
FSS               802
PedF              740
UKB               649
ESF               554
PravF             536
RMU               429
FSpS              410
Teiresias         130
Telc               99
IBA                96
VPN                83
CTT                19
FNBrno              7
CESNET              3
Name: (Label, Label, unit), dtype: int64

### More data preprocessing

We are only going to use hosts that belong to the units listed in `groups` below.

In [5]:
# Units kept for the clustering experiment (the two with the most rows in the
# counts above); len(groups) is later used as the number of clusters.
groups = ['UVT', 'SKM']

In [6]:
# Keep only the rows whose unit is one of the selected groups.
# .copy() makes the result an independent frame, so assigning the
# 'Unit_encoding' column further down does not write into a filtered slice
# of the original frame (which raises pandas' SettingWithCopyWarning).
df = df[df['Label', 'Label', 'unit'].isin(groups)].copy()
print(df['Label', 'Label', 'unit'].value_counts())
df.head()

UVT    3500
SKM    3246
Name: (Label, Label, unit), dtype: int64


Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
13291,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.015274,-0.014957,-0.016592,-0.01675,-0.016306,24,a8ce8c3977ebd10bba6849d15645105eed4d5e78006cfb...,SKM,centrum Slapanice,16
13292,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,5a5fa9cbb0325b8b154b37dfee873af049c3f7fbad8141...,SKM,centrum Slapanice,16
13293,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.015289,-0.01495,-0.016603,-0.016765,-0.016319,24,0eb624c7b418aec4d9779fe9a69f171a3c452e8a163e96...,SKM,centrum Slapanice,16
13294,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.014638,-0.014446,-0.016041,-0.016122,-0.016018,24,8d2375a96a2a821d927c290ed665767957ffbbdc55e547...,SKM,centrum Slapanice,16
13295,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,ceab45b2cd2a4b0f2af9c73236e4fe98f1bcd1ca8b3bff...,SKM,centrum Slapanice,16


In [7]:
# Sanity check: only the selected units should remain after filtering.
df['Label', 'Label', 'unit'].unique()

array(['SKM', 'UVT'], dtype=object)

#### Add numeric labels to groups 

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# Replace the 'Unit_encoding' column with integer codes fitted on the remaining
# units only (in the recorded output SKM maps to 0 and UVT to 1).
df['Unit_encoding'] = LabelEncoder().fit_transform(df['Label', 'Label','unit'].astype(str))
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
13291,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.015274,-0.014957,-0.016592,-0.01675,-0.016306,24,a8ce8c3977ebd10bba6849d15645105eed4d5e78006cfb...,SKM,centrum Slapanice,0
13292,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,5a5fa9cbb0325b8b154b37dfee873af049c3f7fbad8141...,SKM,centrum Slapanice,0
13293,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.015289,-0.01495,-0.016603,-0.016765,-0.016319,24,0eb624c7b418aec4d9779fe9a69f171a3c452e8a163e96...,SKM,centrum Slapanice,0
13294,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.014638,-0.014446,-0.016041,-0.016122,-0.016018,24,8d2375a96a2a821d927c290ed665767957ffbbdc55e547...,SKM,centrum Slapanice,0
13295,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,ceab45b2cd2a4b0f2af9c73236e4fe98f1bcd1ca8b3bff...,SKM,centrum Slapanice,0


In [10]:
# Count rows per encoded label; these should match the per-unit counts above.
df['Unit_encoding'].value_counts()

1    3500
0    3246
Name: Unit_encoding, dtype: int64

Clean up the index

In [11]:
# Filtering kept the original row labels (e.g. 13291, ...); renumber the rows
# from 0 and discard the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)

### Separate features and labels

In [12]:
# Feature matrix: drop every column whose top-level name is 'Label' or
# 'Unit_encoding', leaving only the numeric behaviour columns.
x = df.drop(columns=['Label', 'Unit_encoding'], level=0)
# Ground-truth vector: the integer-encoded unit of each row.
y = df['Unit_encoding']
x.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,14,15,16,17,18,19,20,21,22,23
0,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.018114,-0.017923,-0.017517,-0.017624,-0.01595,-0.015274,-0.014957,-0.016592,-0.01675,-0.016306
1,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
2,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.017943,-0.017941,-0.017526,-0.017639,-0.015971,-0.015289,-0.01495,-0.016603,-0.016765,-0.016319
3,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.017672,-0.017679,-0.017014,-0.017126,-0.015627,-0.014638,-0.014446,-0.016041,-0.016122,-0.016018
4,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632


## K-means implementation (LB_Keogh as a metric)

In [13]:
from DTW import LB_Keogh
from timeit import default_timer as timer

In [14]:
def k_means(dataframe, print_progress=True, n_iter=10, k=5, r=5,
            random_state=None, distance=None):
    """Cluster the rows of ``dataframe`` with a fixed number of k-means iterations.

    Every row is treated as one series. Initial centroids are ``k`` rows sampled
    from the frame; each iteration assigns every row to its closest centroid and
    then replaces each non-empty cluster's centroid with the mean of its rows.
    There is no early stopping: exactly ``n_iter`` iterations are run.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame with one row per host.
    print_progress : bool
        When true, print the wall-clock time of each iteration.
    n_iter : int
        Number of assignment/update iterations.
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(dataframe)``.
    r : int
        Reach parameter passed unchanged to the distance function
        (previously hard-coded to 5).
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` so the initial centroids, and
        therefore the whole run, can be reproduced. ``None`` keeps the
        original unseeded behaviour.
    distance : callable or None
        Called as ``distance(series, centroid, series_length, r)`` and must
        return a comparable number. ``None`` uses the ``LB_Keogh`` function
        imported by the notebook.

    Returns
    -------
    list of int
        Cluster index of every row as assigned in the final iteration
        (empty when ``n_iter`` is 0).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    size, series_length = dataframe.shape
    if not 1 <= k <= size:
        raise ValueError(
            "k must be between 1 and the number of rows (" + str(size) + "), got " + str(k)
        )
    if distance is None:
        distance = LB_Keogh  # looked up at call time, only when no metric is supplied

    data = dataframe.values
    # astype(float) always returns a fresh, writable float array: centroid means
    # are not truncated for integer input, and the in-place centroid updates
    # below never hit a read-only view of the frame's data.
    centroids = dataframe.sample(n=k, random_state=random_state).values.astype(float)

    members = {}       # cluster index -> list of row positions (last iteration)
    output_array = []  # will hold final assignments of each host
    for iternum in range(n_iter):  # perform n_iter iterations
        start = timer()
        members = {}
        assignments = []
        for i in range(size):  # for every host in our dataset
            min_dist = np.inf
            cluster = 0
            for c in range(k):  # find closest centroid; ties keep the lowest index
                dist = distance(data[i], centroids[c], series_length, r)
                if dist < min_dist:
                    min_dist = dist
                    cluster = c
            assignments.append(cluster)
            # Plain list appends keep membership bookkeeping linear; the
            # previous np.append reallocated the whole array for every host.
            members.setdefault(cluster, []).append(i)
        output_array = assignments  # after the loop this is the last iteration's result
        for cluster, rows in members.items():  # recompute centroids of non-empty clusters
            centroids[cluster] = np.mean(data[rows], axis=0)
        if print_progress:
            print("Finished iteration number " + str(iternum + 1) + ", time: " + str(timer() - start))
    for cluster, rows in members.items():
        print('Cluster ' + str(cluster) + ', number of assigned hosts: ' + str(len(rows)))
    return output_array

## Apply K-means

In [15]:
# Run the LB_Keogh k-means with one cluster per selected unit and report the
# wall-clock time of the whole call.
n_iters = 30
num_clusts = len(groups)
start = timer()
predictions_dtw = k_means(x, print_progress=True, n_iter=n_iters, k=num_clusts)
print(f"K-means finished in: {timer() - start} seconds")

Finished iteration number 1, time: 0.7272722500001692
Finished iteration number 2, time: 0.6915609470001982
Finished iteration number 3, time: 0.6610153890001129
Finished iteration number 4, time: 0.6589604199998575
Finished iteration number 5, time: 0.773480162999931
Finished iteration number 6, time: 0.6588509940002041
Finished iteration number 7, time: 0.6529551499997979
Finished iteration number 8, time: 0.652922297999794
Finished iteration number 9, time: 0.6588369619998957
Finished iteration number 10, time: 0.6500966810003774
Finished iteration number 11, time: 0.6493085759998394
Finished iteration number 12, time: 0.651694507999764
Finished iteration number 13, time: 0.6521489960000508
Finished iteration number 14, time: 0.6493100479997338
Finished iteration number 15, time: 0.653083245000289
Finished iteration number 16, time: 0.6529343829997742
Finished iteration number 17, time: 0.6528396700000485
Finished iteration number 18, time: 0.6551179939997382
Finished iteration numb

In [16]:
# Ground-truth unit sizes, shown for comparison with the cluster sizes printed above.
df['Label', 'Label', 'unit'].value_counts()

UVT    3500
SKM    3246
Name: (Label, Label, unit), dtype: int64

## Compare with sklearn implementation (metric is euclidean distance)

In [17]:
from sklearn.cluster import KMeans

# Pin the seed and the number of restarts so the Euclidean baseline is
# reproducible: centroid initialisation is random, and the default value of
# n_init is not the same in every scikit-learn release.
kmeans = KMeans(n_clusters=num_clusts, n_init=10, random_state=42)

In [18]:
# Fit scikit-learn's KMeans on the same features and get one cluster index per row.
predictions_euclid = kmeans.fit_predict(x)
# Show how many rows landed in each cluster.
np.unique(predictions_euclid, return_counts=True)

(array([0, 1], dtype=int32), array([6745,    1]))

# Performance evaluation

## Adjusted Rand index

In [19]:
from sklearn import metrics

#### With DTW using LB_Keogh as metric

In [20]:
# Ground truth is the encoded unit; predictions are the LB_Keogh k-means assignments.
true_labels = y
# pred_labels is rebound further down for the scikit-learn run, so these cells
# must be executed in order.
pred_labels = predictions_dtw

In [21]:
# Adjusted Rand index between the true units and the LB_Keogh k-means clusters.
dtw_score = metrics.adjusted_rand_score(true_labels, pred_labels) 
dtw_score

0.03107117943207805

#### Using builtin sklearn - euclidean distance metric

In [22]:
# Rebind pred_labels to the scikit-learn (Euclidean) assignments; the scoring
# cell that follows reads this name, so the cells must be run in order.
pred_labels = predictions_euclid

In [23]:
# Adjusted Rand index between the true units and the scikit-learn KMeans clusters.
ed_score = metrics.adjusted_rand_score(true_labels, pred_labels) 
ed_score

-2.1515379149058398e-05

In [24]:
# Positive values mean the LB_Keogh clustering scored higher than the Euclidean one.
print(f"Score difference: {dtw_score - ed_score}")

Score difference: 0.03109269481122711


## Silhouette Coefficient 

#### With DTW using LB_Keogh as metric

_Silhouette computed with the Euclidean distance (the `silhouette_score` default)_

In [25]:
# Silhouette of the LB_Keogh k-means assignments. silhouette_score is called
# without a `metric` argument, so scikit-learn's default (Euclidean) is what
# measures the distances here, not LB_Keogh.
silhouette_dtw = metrics.silhouette_score(x, predictions_dtw)
silhouette_dtw

0.11584237563942723

_dtw_

#### Using builtin sklearn - euclidean distance metric

In [26]:
# Silhouette of the scikit-learn KMeans assignments, using silhouette_score's
# default (Euclidean) distance.
silhouette_euclid = metrics.silhouette_score(x, predictions_euclid)
silhouette_euclid

0.973805071353198