# Classification of hosts using time series

We are going to implement **KNN algorithm** but instead of simply using euclidean distance as a similiarity measure, we are going to use **dynamic time warping**. We will choose 5 units from our subnet.

In [1]:
import pandas as pd
import numpy as np

## Import dataset

The dataset has already been preprocessed ready to be used

In [2]:
df = pd.read_pickle('balanced_preprocessed_dataset.pkl')
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [3]:
df.shape

(20998, 653)

### Preprocessing

#### Binary classifier - UVT and SKM rows only

In [4]:
groups = ['UVT', 'SKM', 'CEITEC', 'FI', 'PrirF']

In [5]:
df = df[df['Label', 'Label', 'unit'].isin(groups)]

In [6]:
from sklearn.preprocessing import LabelEncoder
df['Unit_encoding'] = LabelEncoder().fit_transform(df['Label', 'Label','unit'].astype(str))
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


### Dividing dataset into train/test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x = df[['mean', 'max', 'min']]
y = df['Unit_encoding']
x.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,min,min,min,min,min,min,min,min,min,min
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,14,15,16,17,18,19,20,21,22,23
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.02073,-0.018019,-0.018878,-0.02128,-0.015753,-0.015868,-0.017892,-0.022608,-0.020415,-0.022789
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.020752,-0.01804,-0.018899,-0.021301,-0.015771,-0.015889,-0.01791,-0.022626,-0.02043,-0.02281
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.020752,-0.01804,-0.018899,-0.021301,-0.015771,-0.015889,-0.01791,-0.022626,-0.02043,-0.02281
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.020752,-0.01804,-0.018899,-0.021301,-0.015771,-0.015889,-0.01791,-0.022626,-0.02043,-0.02281
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.020322,-0.01775,-0.018128,-0.021017,-0.015511,-0.015545,-0.017444,-0.022333,-0.020035,-0.022432


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

### Apply KNN

In [10]:
from timeit import default_timer as timer
from sklearn.neighbors import KNeighborsClassifier
from DTW import LB_Keogh
k = 5
r = 5 # LB_Keogh reach parameter
start = timer()
size = x.shape[1] # pass into LB_Keogh
knn = KNeighborsClassifier(n_neighbors=k, metric=LB_Keogh, metric_params={'size' : size, 'r' : r})
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)
print("KNN finished in:" + str(timer() - start) + " seconds")

KNN finished in:496.92642027700094 seconds


## Evaluate

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.67      0.77       445
           1       0.42      0.09      0.15       270
           2       0.76      0.44      0.55       536
           3       0.53      0.93      0.68       621
           4       0.66      0.74      0.70       700

   micro avg       0.65      0.65      0.65      2572
   macro avg       0.65      0.58      0.57      2572
weighted avg       0.67      0.65      0.62      2572

[[300  11  15  89  30]
 [  9  25  12 131  93]
 [ 16  14 234 163 109]
 [  2   0   8 580  31]
 [  9   9  40 122 520]]
