# Classification of hosts using time series

We are going to implement the **KNN algorithm**, but instead of simply using Euclidean distance as a similarity measure, we are going to use **dynamic time warping**. We will pick 2 units from our subnets.

In [1]:
import pandas as pd
import numpy as np

## Import dataset

The dataset has already been preprocessed and is ready to be used.

In [2]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
DATASET_PATH = '../../dataset/balanced_preprocessed_dataset.pkl'
df = pd.read_pickle(DATASET_PATH)
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [3]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(20998, 653)

### Preprocessing

#### Binary classifier - UVT and SKM rows only

In [4]:
# Units kept for the binary classification task; rows of every other unit are filtered out below
groups = ['UVT', 'SKM']

In [5]:
# Boolean mask: True for rows whose unit label is one of the selected groups
in_selected_groups = df['Label', 'Label', 'unit'].isin(groups)
df = df[in_selected_groups]

In [6]:
from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoder instead of discarding it, so the integer codes shown
# in later reports can be mapped back to unit names through
# unit_encoder.classes_ or unit_encoder.inverse_transform(...).
unit_encoder = LabelEncoder()
df['Unit_encoding'] = unit_encoder.fit_transform(df['Label', 'Label', 'unit'].astype(str))
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
13291,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.015274,-0.014957,-0.016592,-0.01675,-0.016306,24,a8ce8c3977ebd10bba6849d15645105eed4d5e78006cfb...,SKM,centrum Slapanice,0
13292,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,5a5fa9cbb0325b8b154b37dfee873af049c3f7fbad8141...,SKM,centrum Slapanice,0
13293,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.015289,-0.01495,-0.016603,-0.016765,-0.016319,24,0eb624c7b418aec4d9779fe9a69f171a3c452e8a163e96...,SKM,centrum Slapanice,0
13294,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.014638,-0.014446,-0.016041,-0.016122,-0.016018,24,8d2375a96a2a821d927c290ed665767957ffbbdc55e547...,SKM,centrum Slapanice,0
13295,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,ceab45b2cd2a4b0f2af9c73236e4fe98f1bcd1ca8b3bff...,SKM,centrum Slapanice,0


### Dividing dataset into train/test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Top-level column groups used as model input; the encoded unit is the target
feature_groups = ['mean', 'max', 'min']
x = df[feature_groups]
y = df['Unit_encoding']
x.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,min,min,min,min,min,min,min,min,min,min
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,14,15,16,17,18,19,20,21,22,23
13291,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.020301,-0.017671,-0.018476,-0.020891,-0.015201,-0.015691,-0.017608,-0.022394,-0.020095,-0.022354
13292,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.020752,-0.01804,-0.018899,-0.021301,-0.015771,-0.015889,-0.01791,-0.022626,-0.02043,-0.02281
13293,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.020659,-0.017956,-0.018811,-0.02121,-0.015701,-0.015802,-0.01784,-0.022561,-0.02038,-0.02273
13294,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.020658,-0.017958,-0.0188,-0.021137,-0.015696,-0.015812,-0.017843,-0.022542,-0.020374,-0.022734
13295,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.020752,-0.01804,-0.018899,-0.021301,-0.015771,-0.015889,-0.01791,-0.022626,-0.02043,-0.02281


In [9]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported afterwards, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

### Apply KNN

In [10]:
from timeit import default_timer as timer
from sklearn.neighbors import KNeighborsClassifier
from DTW import LB_Keogh

k = 5  # number of neighbours used by the classifier
r = 5  # LB_Keogh reach parameter

start = timer()  # the measured time covers both fitting and prediction
size = x.shape[1]  # number of feature columns, passed into LB_Keogh

# Keyword arguments handed to the LB_Keogh metric callable
lb_keogh_params = {'size': size, 'r': r}
knn = KNeighborsClassifier(n_neighbors=k, metric=LB_Keogh, metric_params=lb_keogh_params)
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)

elapsed = timer() - start
print("KNN finished in:" + str(elapsed) + " seconds")

KNN finished in:183.04791440500048 seconds


## Evaluate results

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
# Per-class precision/recall/F1 summary, then the confusion matrix
# (rows are true labels, columns are predicted labels).
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(report)
print(matrix)

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       660
           1       0.92      0.87      0.89       690

   micro avg       0.89      0.89      0.89      1350
   macro avg       0.89      0.89      0.89      1350
weighted avg       0.89      0.89      0.89      1350

[[605  55]
 [ 92 598]]
