# Classification of hosts within subnet

The goal of this notebook is to perform classification of hosts from **5 subnets** using Support Vector Machine classifier. 

### Imports

In [1]:
import pandas as pd

### Import dataset

We have a preprocessed pandas dataframe consisting of anonymized host behaviour. Each host belongs to a certain **unit** within the subnet.

In [2]:
df = pd.read_pickle('../../dataset/balanced_preprocessed_dataset.pkl')
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [3]:
df['Label', 'Label', 'unit'].unique()

array(['CEITEC', 'CESNET', 'CTT', 'ESF', 'FF', 'FI', 'FIeduroamnaFI',
       'FNBrno', 'FSS', 'FSpS', 'IBA', 'LF', 'PedF', 'PravF', 'PrirF',
       'RMU', 'SKM', 'Teiresias', 'Telc', 'UKB', 'UVT', 'VPN'],
      dtype=object)

In [4]:
df['Label', 'Label', 'unit'].value_counts()

UVT              3500
SKM              3246
PrirF            2599
CEITEC           2223
LF               1475
FI               1289
FF               1089
FIeduroamnaFI    1020
FSS               802
PedF              740
UKB               649
ESF               554
PravF             536
RMU               429
FSpS              410
Teiresias         130
Telc               99
IBA                96
VPN                83
CTT                19
FNBrno              7
CESNET              3
Name: (Label, Label, unit), dtype: int64

### More data preprocessing

In [5]:
groups = ['UVT', 'SKM', 'CEITEC', 'FI', 'PrirF']

We are only going to use hosts within grop

In [6]:
df = df[df['Label', 'Label', 'unit'].isin(groups)]
print(df['Label', 'Label', 'unit'].value_counts())
df.head()

UVT       3500
SKM       3246
PrirF     2599
CEITEC    2223
FI        1289
Name: (Label, Label, unit), dtype: int64


Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [7]:
df['Label', 'Label', 'unit'].unique()

array(['CEITEC', 'FI', 'PrirF', 'SKM', 'UVT'], dtype=object)

### Cleanup numeric unit labels of groups 

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Unit_encoding'] = le.fit_transform(df['Label', 'Label', 'unit'].astype(str))
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,Label,Label,Label,Label,Unit_encoding
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Label,Label,Label,Label,Unnamed: 21_level_1
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,19,20,21,22,23,anon_net_range,addr_range,unit,subunit,Unnamed: 21_level_2
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632,28,ef160f55b36bd48b37f22bc9c48819b1a0259c2dd27ccc...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,93b8f5a052053b0db4731b671f78b8c5e5817d38d51ba9...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,ac600c8985d0f198d532737ea9d58db00905c6c6bebb6b...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632,28,48bab257d30b1c6eaa225275fe60fc5e1dfe61afe54ace...,CEITEC,frontendy diskovych poli a aplikacni servery ...,0
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307,26,1cd00c373ace404b829e822bf076631b564bf2bc70db82...,CEITEC,CRS,0


In [9]:
df['Unit_encoding'].value_counts()

4    3500
3    3246
2    2599
0    2223
1    1289
Name: Unit_encoding, dtype: int64

#### Cleanup the index

In [10]:
df = df.reset_index(drop=True)

### Divide dataset into train/test split

In [11]:
x = df.drop(['Label', 'Unit_encoding'], level=0, axis=1)
y = df['Unit_encoding']
x.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,FlowDuration,...,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes,Bytes
Unnamed: 0_level_2,00,01,02,03,04,05,06,07,08,09,...,14,15,16,17,18,19,20,21,22,23
0,-0.007622,-0.006103,-0.009019,-0.006422,-0.012099,-0.007141,-0.007069,-0.012616,-0.009572,-0.011851,...,-0.018135,-0.017941,-0.017531,-0.01764,-0.015971,-0.01529,-0.014971,-0.016604,-0.016766,-0.01632
1,-0.009677,-0.011074,-0.01079,-0.008082,-0.01361,-0.009863,-0.009743,-0.01434,-0.01151,-0.014119,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
2,-0.009484,-0.010427,-0.010896,-0.00808,-0.013609,-0.009918,-0.009838,-0.01432,-0.011545,-0.013922,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
3,-0.009932,-0.011126,-0.010762,-0.007984,-0.013595,-0.009918,-0.00947,-0.014341,-0.011979,-0.01423,...,-0.018135,-0.017942,-0.017531,-0.017641,-0.015971,-0.015291,-0.014971,-0.016604,-0.016766,-0.01632
4,-0.007484,-0.00579,-0.00893,-0.006322,-0.012029,-0.007007,-0.006911,-0.012538,-0.009441,-0.011706,...,-0.018115,-0.017918,-0.017516,-0.017627,-0.015957,-0.015274,-0.014955,-0.016588,-0.016752,-0.016307


In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

## Apply SVM classifier

In [13]:
from sklearn.svm import SVC

In [14]:
svc = SVC(gamma='auto')  
svc.fit(x_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
prediction = svc.predict(x_test)

## Evaluate classifier performance

In [16]:
from sklearn.metrics import classification_report, confusion_matrix  
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82       442
           1       0.77      0.33      0.46       245
           2       0.61      0.83      0.70       507
           3       0.80      0.85      0.83       655
           4       0.87      0.71      0.78       723

   micro avg       0.76      0.76      0.76      2572
   macro avg       0.77      0.72      0.72      2572
weighted avg       0.78      0.76      0.75      2572



In [17]:
print(confusion_matrix(y_test,prediction))

[[383   0  36  12  11]
 [ 37  80  67  47  14]
 [ 10  13 422  37  25]
 [  9   6  55 557  28]
 [ 52   5 113  41 512]]


### Try various parameter combinations using grid search

In [18]:
param_grid = {'C': [0.001, 0.01, 0.1], 'gamma': ['auto', 1, 0.1], 'kernel': ['poly'], 'degree':[3]}

In [19]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [20]:
grid.fit(x_train, y_train)
grid_prediction = grid.predict(x_test)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=0.001, degree=3, gamma=auto, kernel=poly ......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.001, degree=3, gamma=auto, kernel=poly, score=0.2720116618075802, total= 1.0min
[CV] C=0.001, degree=3, gamma=auto, kernel=poly ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV]  C=0.001, degree=3, gamma=auto, kernel=poly, score=0.2741324001166521, total= 1.1min
[CV] C=0.001, degree=3, gamma=auto, kernel=poly ......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.1min remaining:    0.0s


[CV]  C=0.001, degree=3, gamma=auto, kernel=poly, score=0.2732049036777583, total= 1.0min
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................
[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.7486880466472303, total=  26.1s
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................
[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.7389909594634004, total=  27.2s
[CV] C=0.001, degree=3, gamma=1, kernel=poly .........................
[CV]  C=0.001, degree=3, gamma=1, kernel=poly, score=0.7393461762988909, total=  27.4s
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=poly, score=0.7895043731778426, total=  30.7s
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=poly, score=0.784193642461359, total=  30.8s
[CV] C=0.001, degree=3, gamma=0.1, kernel=poly .......................
[CV]  C=0.001, degree=3, gamma=0.1, kernel=pol

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 27.6min finished


### Pick the highest performing parameters

In [21]:
grid.best_params_

{'C': 0.01, 'degree': 3, 'gamma': 0.1, 'kernel': 'poly'}

In [22]:
print(classification_report(y_test,grid_prediction))

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       442
           1       0.56      0.65      0.60       245
           2       0.74      0.80      0.77       507
           3       0.88      0.87      0.88       655
           4       0.92      0.74      0.82       723

   micro avg       0.81      0.81      0.81      2572
   macro avg       0.78      0.79      0.78      2572
weighted avg       0.82      0.81      0.81      2572



In [23]:
print(confusion_matrix(y_test,prediction))

[[383   0  36  12  11]
 [ 37  80  67  47  14]
 [ 10  13 422  37  25]
 [  9   6  55 557  28]
 [ 52   5 113  41 512]]
