# Classification of hosts within subnet

The goal of this notebook is to perform classification of hosts from **2 subnets** (**SKM** and **UVT**), with the help of Support Vector Machine classifier. 

### Imports

In [1]:
import pandas as pd

### Import dataset

We have a preprocessed pandas dataframe consisting of anonymized host behaviour. Each host belongs to a certain **unit** within the subnet.

In [2]:
df = pd.read_pickle('../../dataset/balanced_preprocessed_dataset.pkl')
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [3]:
df['Label', 'Label', 'unit'].unique()

array(['CEITEC', 'CESNET', 'CTT', 'ESF', 'FF', 'FI', 'FIeduroamnaFI',
       'FNBrno', 'FSS', 'FSpS', 'IBA', 'LF', 'PedF', 'PravF', 'PrirF',
       'RMU', 'SKM', 'Teiresias', 'Telc', 'UKB', 'UVT', 'VPN'],
      dtype=object)

### More data preprocessing

We are only going to use hosts with unit label **UVT** and **SKM**

In [4]:
df = df[df['Label', 'Label', 'unit'].isin(['UVT', 'SKM'])]
print(df['Label', 'Label', 'unit'].value_counts())
df.head()

UVT    3500
SKM    3246
Name: (Label, Label, unit), dtype: int64


Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
13291,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.015274,-0.014957,-0.016592,-0.01675,-0.016306,24,a8ce8c3977ebd10bba6849d15645105eed4d5e78006cfb...,SKM,centrum Slapanice,16
13292,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,5a5fa9cbb0325b8b154b37dfee873af049c3f7fbad8141...,SKM,centrum Slapanice,16
13293,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.015289,-0.01495,-0.016603,-0.016765,-0.016319,24,0eb624c7b418aec4d9779fe9a69f171a3c452e8a163e96...,SKM,centrum Slapanice,16
13294,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.014638,-0.014446,-0.016041,-0.016122,-0.016018,24,8d2375a96a2a821d927c290ed665767957ffbbdc55e547...,SKM,centrum Slapanice,16
13295,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,ceab45b2cd2a4b0f2af9c73236e4fe98f1bcd1ca8b3bff...,SKM,centrum Slapanice,16


### Cleanup unit labels (0 - SKM, 1 - UVT) 

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Unit_encoding'] = le.fit_transform(df['Label', 'Label', 'unit'].astype(str))
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
13291,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.015274,-0.014957,-0.016592,-0.01675,-0.016306,24,a8ce8c3977ebd10bba6849d15645105eed4d5e78006cfb...,SKM,centrum Slapanice,0
13292,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,5a5fa9cbb0325b8b154b37dfee873af049c3f7fbad8141...,SKM,centrum Slapanice,0
13293,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.015289,-0.01495,-0.016603,-0.016765,-0.016319,24,0eb624c7b418aec4d9779fe9a69f171a3c452e8a163e96...,SKM,centrum Slapanice,0
13294,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.014638,-0.014446,-0.016041,-0.016122,-0.016018,24,8d2375a96a2a821d927c290ed665767957ffbbdc55e547...,SKM,centrum Slapanice,0
13295,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,24,ceab45b2cd2a4b0f2af9c73236e4fe98f1bcd1ca8b3bff...,SKM,centrum Slapanice,0


In [6]:
df['Unit_encoding'].value_counts()

1    3500
0    3246
Name: Unit_encoding, dtype: int64

Cleanup the index

In [7]:
df = df.reset_index(drop=True)

## Divide dataset into train/test split

In [8]:
x = df.drop(['Label', 'Unit_encoding'], level=0, axis=1)
y = df['Unit_encoding']
x.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,14,15,16,17,18,19,20,21,22,23
0,-0.007496,-0.005742,-0.008903,-0.006316,-0.012004,-0.006973,-0.006919,-0.012518,-0.009461,-0.011721,...,-0.018114,-0.017923,-0.017517,-0.017624,-0.01595,-0.015274,-0.014957,-0.016592,-0.01675,-0.016306
1,-0.008895,-0.008315,-0.010002,-0.006953,-0.01274,-0.008184,-0.008276,-0.013226,-0.010829,-0.012773,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
2,-0.007623,-0.006104,-0.009037,-0.006413,-0.012109,-0.00714,-0.007082,-0.012616,-0.009489,-0.011722,...,-0.017943,-0.017941,-0.017526,-0.017639,-0.015971,-0.015289,-0.01495,-0.016603,-0.016765,-0.016319
3,-0.007596,-0.00603,-0.009027,-0.006398,-0.012092,-0.007123,-0.007039,-0.012571,-0.009517,-0.011771,...,-0.017672,-0.017679,-0.017014,-0.017126,-0.015627,-0.014638,-0.014446,-0.016041,-0.016122,-0.016018
4,-0.008925,-0.009641,-0.010201,-0.007432,-0.012726,-0.008296,-0.00854,-0.013556,-0.010859,-0.013467,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632


In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

## Apply SVM classifier

In [10]:
from sklearn.svm import SVC

In [11]:
svc = SVC(gamma='auto')  
svc.fit(x_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
prediction = svc.predict(x_test)

## Evaluate classifier performance

In [13]:
from sklearn.metrics import classification_report, confusion_matrix  
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       651
           1       0.93      0.90      0.91       699

   micro avg       0.91      0.91      0.91      1350
   macro avg       0.91      0.91      0.91      1350
weighted avg       0.91      0.91      0.91      1350



### Perform grid search to compare performance using variety of parameter combinations

In [14]:
param_grid = {'C': [0.001, 0.01, 0.1, 0.5], 'gamma': [1, 0.1], 'kernel': ['poly'], 'degree':[3]}

In [15]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [16]:
grid.fit(x_train, y_train)
grid_prediction = grid.predict(x_test)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.9066147859922179, total=   3.7s
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.7s remaining:    0.0s


[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.9177320733740967, total=   4.1s
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.9s remaining:    0.0s


[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.9010011123470523, total=   3.9s
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=poly, score=0.9321845469705392, total=   4.7s
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=poly, score=0.9332962757087271, total=   4.7s
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=poly, score=0.9260289210233593, total=   4.8s
[CV] C=0.01, degree=3, gamma=1, kernel=poly ..........................
[CV]  C=0.01, degree=3, gamma=1, kernel=poly, score=0.9066147859922179, total=   3.7s
[CV] C=0.01, degree=3, gamma=1, kernel=poly ..........................
[CV]  C=0.01, degree=3, gamma=1, kernel=poly, score=0.9177320733740967, total=   4.1s
[CV] C=0.01, degree=3, gamma=1, kernel=poly ..........................
[CV]  C=0.01, degree=3, gamma=1, kernel=poly, sc

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  2.5min finished


## Pick the best parameters

In [17]:
grid.best_params_

{'C': 0.001, 'degree': 3, 'gamma': 0.1, 'kernel': 'poly'}

In [20]:
print(classification_report(y_test,grid_prediction))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94       651
           1       0.95      0.93      0.94       699

   micro avg       0.94      0.94      0.94      1350
   macro avg       0.94      0.94      0.94      1350
weighted avg       0.94      0.94      0.94      1350



In [21]:
print(confusion_matrix(y_test,grid_prediction))

[[618  33]
 [ 52 647]]
