# Classification of hosts within subnet

We are going to perform SVM on hosts from all units of given subnet

### Imports

In [1]:
import pandas as pd

### Import dataset

We have a preprocessed pandas dataframe consisting of anonymized host behaviour. Each host belongs to a certain **unit** within the subnet.

In [2]:
df = pd.read_pickle('balanced_preprocessed_dataset.pkl')
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [3]:
df['Label', 'Label', 'unit'].unique()

array(['CEITEC', 'CESNET', 'CTT', 'ESF', 'FF', 'FI', 'FIeduroamnaFI',
       'FNBrno', 'FSS', 'FSpS', 'IBA', 'LF', 'PedF', 'PravF', 'PrirF',
       'RMU', 'SKM', 'Teiresias', 'Telc', 'UKB', 'UVT', 'VPN'],
      dtype=object)

In [4]:
df['Label', 'Label', 'unit'].value_counts()

UVT              3500
SKM              3246
PrirF            2599
CEITEC           2223
LF               1475
FI               1289
FF               1089
FIeduroamnaFI    1020
FSS               802
PedF              740
UKB               649
ESF               554
PravF             536
RMU               429
FSpS              410
Teiresias         130
Telc               99
IBA                96
VPN                83
CTT                19
FNBrno              7
CESNET              3
Name: (Label, Label, unit), dtype: int64

Cleanup the index

In [5]:
df = df.reset_index(drop=True)

### Divide dataset into train/test split

In [6]:
x = df.drop(['Label', 'Unit_encoding'], level=0, axis=1)
y = df['Unit_encoding']
x.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,14,15,16,17,18,19,20,21,22,23
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.018135,-0.017941,-0.017531,-0.01764,-0.015971,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.018115,-0.017918,-0.017516,-0.017627,-0.015957,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307


In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

## Apply SVM

In [8]:
from sklearn.svm import SVC

In [9]:
svc = SVC(gamma='auto')  
svc.fit(x_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
prediction = svc.predict(x_test)

## Evaluate classifier performance

In [11]:
from sklearn.metrics import classification_report, confusion_matrix  
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.44      0.81      0.57       448
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.74      0.26      0.39       106
           4       0.33      0.50      0.40       220
           5       0.56      0.29      0.38       262
           6       0.78      0.95      0.86       197
           7       0.00      0.00      0.00         2
           8       0.52      0.50      0.51       173
           9       0.00      0.00      0.00        64
          10       0.00      0.00      0.00        25
          11       0.26      0.02      0.04       303
          12       0.59      0.11      0.18       149
          13       0.93      0.54      0.69       114
          14       0.37      0.73      0.49       515
          15       1.00      0.01      0.02        86
          16       0.76      0.84      0.80       657
          17       0.00    

  'precision', 'predicted', average, warn_for)


In [12]:
print(confusion_matrix(y_test,prediction))

[[364   0   0   0   5   3   1   0  12   0   0   0   0   0  40   0  12   0
    0   0  11   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
    0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
    0   0   0   0]
 [  4   0   0  28  26   6   0   0   0   0   0   1   0   0  34   0   3   0
    0   0   4   0]
 [ 16   0   0   6 111   5   1   0   5   0   0   1   3   1  51   0  16   0
    0   0   4   0]
 [ 40   0   0   0  15  76  13   0  17   0   0   0   1   0  55   0  34   0
    0   0  11   0]
 [  0   0   0   0   3   0 188   0   0   0   0   1   0   0   2   0   1   0
    0   0   2   0]
 [  1   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
    0   0   0   0]
 [  4   0   0   0  18   6   8   0  86   0   0   1   0   0  35   0  12   0
    0   0   3   0]
 [ 49   0   0   0   2   1   0   0   2   0   0   0   0   1   7   0   2   0
    0   0   0   0]
 [  3   0   0   0   4   0   0   0   0   0   0   1   3   1   7   0   2 

### Try various parameter combinations using grid search

In [13]:
param_grid = {'C': [0.001, 0.01, 0.1, 1], 'gamma': ['auto', 1, 0.1], 'kernel': ['poly'], 'degree':[3]}

In [14]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [15]:
grid.fit(x_train, y_train)
grid_prediction = grid.predict(x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] C=0.001, degree=3, gamma=auto, kernel=poly ......................
[CV]  C=0.001, degree=3, gamma=auto, kernel=poly, score=0.16850927246790298, total= 3.0min
[CV] C=0.001, degree=3, gamma=auto, kernel=poly ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.4min remaining:    0.0s


[CV]  C=0.001, degree=3, gamma=auto, kernel=poly, score=0.1680357142857143, total= 3.0min
[CV] C=0.001, degree=3, gamma=auto, kernel=poly ......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.8min remaining:    0.0s


[CV]  C=0.001, degree=3, gamma=auto, kernel=poly, score=0.16833631484794276, total= 3.1min
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................
[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.5962910128388017, total= 1.8min
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................
[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.5891071428571428, total= 1.8min
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................
[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.5898032200357782, total= 1.7min
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=poly, score=0.5987874465049928, total= 1.9min
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=poly, score=0.5885714285714285, total= 1.8min
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=p

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 122.7min finished


In [16]:
grid.best_params_

{'C': 0.01, 'degree': 3, 'gamma': 0.1, 'kernel': 'poly'}

In [17]:
print(classification_report(y_test,grid_prediction))

              precision    recall  f1-score   support

           0       0.47      0.84      0.61       448
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.43      0.41      0.42       106
           4       0.48      0.53      0.50       220
           5       0.49      0.52      0.50       262
           6       0.84      0.88      0.86       197
           7       0.00      0.00      0.00         2
           8       0.62      0.58      0.60       173
           9       0.58      0.17      0.27        64
          10       0.25      0.16      0.20        25
          11       0.30      0.13      0.18       303
          12       0.51      0.47      0.49       149
          13       0.79      0.60      0.68       114
          14       0.53      0.63      0.58       515
          15       0.44      0.33      0.38        86
          16       0.82      0.85      0.84       657
          17       0.67    

  'precision', 'predicted', average, warn_for)


In [18]:
print(confusion_matrix(y_test,grid_prediction))

[[378   0   1   2   5  10   0   0   4   3   1   3   2   1  20   0   5   0
    0   7   6   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
    0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0
    0   0   0   0]
 [  4   0   0  43  16   0   0   0   1   0   1   6   3   1  21   2   7   1
    0   0   0   0]
 [ 14   0   0  12 117  10   1   0   5   0   0  16   2   1  29   1   9   1
    0   1   1   0]
 [ 24   0   0   6  17 135   6   0  10   0   1   7   5   2  24   2  10   0
    1   1  11   0]
 [  0   0   0   1   1   4 173   0   0   0   0   3   3   0   7   0   0   0
    0   0   5   0]
 [  1   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
    0   0   0   0]
 [  4   0   0   3   9  15   5   0 101   0   1   2   2   0  18   2  10   0
    0   1   0   0]
 [ 37   0   0   0   3   3   0   0   1  11   0   1   0   0   2   0   1   0
    0   3   2   0]
 [  2   0   0   0   2   3   0   0   0   0   4   4   1   0   3   0   4 