### 1. Load mon_features.pkl

In [1]:
import pandas as pd
import numpy as np
import pickle

# Location of the pickled feature table and the label columns it carries.
# NOTE: pd.read_pickle unpickles the file, so only load files from a trusted source.
FILE_PATH = '/content/mon_features.pkl'
LABEL_COLUMN = ['website_label', 'monitored_label']

# Feature subset used as model input, listed one name per line.
FEATURES_VER2 = [
    'total_transmission_time',
    'std_inter_packet_time',
    'num_outgoing_packets',
    'avg_incoming_burst_size',
    'cumul_packets_30pct',
    'cumul_packets_10pct',
    'incoming_order_skew',
    'outgoing_order_skew',
    'cumul_max',
    'bigram_OO',
    'avg_outgoing_order_first_30',
    'num_incoming_first_30',
    'incoming_packet_ratio',
]

features_df = pd.read_pickle(FILE_PATH)

# Inputs are the selected feature columns; the target is the first label column.
X = features_df[FEATURES_VER2]
y = features_df[LABEL_COLUMN[0]]

# Show the features, then the target, each on its own line.
print(X, y, sep='\n')

       total_transmission_time  std_inter_packet_time  num_outgoing_packets  \
0                        10.14               0.041168                 121.0   
1                        10.16               0.163930                  80.0   
2                        11.11               0.066661                 118.0   
3                        13.36               0.047809                 122.0   
4                        10.64               0.038760                 115.0   
...                        ...                    ...                   ...   
18995                    43.91               0.143962                 619.0   
18996                    15.60               0.019465                 552.0   
18997                    14.93               0.016411                 579.0   
18998                    19.91               0.033281                 690.0   
18999                    13.76               0.011074                 757.0   

       avg_incoming_burst_size  cumul_packets_30pct

### 2. SVM

Suppress scikit-learn `ConvergenceWarning` messages

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Install a warnings filter that hides ConvergenceWarning messages.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

Split the dataset into training and testing sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify` argument is passed, so per-class test support can vary.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

Standardize each feature to zero mean and unit standard deviation

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse those statistics
# for the test split so no test-set information reaches the model.
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train.values)
X_test_scale = scaler.transform(X_test.values)

# Scale the full dataset with a separate scaler instance. Calling
# `scaler.fit_transform(X.values)` here would refit `scaler` on every row and
# overwrite the train-only statistics, so any later `scaler.transform(...)`
# would no longer match the scaling the model was trained with.
# NOTE(review): X_scale is standardized with statistics from all rows (test
# rows included); if it is meant for cross-validation, a Pipeline that scales
# inside each fold would presumably avoid leakage; confirm the intended use.
X_scale = StandardScaler().fit_transform(X.values)

Train and evaluate an SVM with an RBF kernel

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# RBF-kernel SVM with class-balanced weights; fit() returns the estimator itself.
clf_rbf = SVC(
    kernel='rbf',
    C=100,
    gamma=0.1,
    class_weight='balanced',
).fit(X_train_scale, y_train)
y_pred_rbf = clf_rbf.predict(X_test_scale)

# Score the held-out predictions, then report.
rbf_accuracy = accuracy_score(y_test, y_pred_rbf)
rbf_report = classification_report(y_test, y_pred_rbf)

print("============ Before Hyperparameter tuning ============")
print("SVM Accuracy: {}".format(rbf_accuracy))
print("Classification Report:\n", rbf_report)

SVM Accuracy: 0.816
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76        51
           1       0.89      0.72      0.80        47
           2       0.84      0.96      0.89        48
           3       0.88      1.00      0.94        37
           4       0.80      0.80      0.80        44
           5       0.83      0.88      0.85        56
           6       0.87      0.90      0.88        58
           7       0.76      0.79      0.78        53
           8       0.67      0.69      0.68        49
           9       0.66      0.66      0.66        38
          10       0.81      0.85      0.83        55
          11       0.69      0.80      0.74        45
          12       0.90      0.95      0.92        55
          13       0.63      0.60      0.61        45
          14       0.69      0.71      0.70        52
          15       0.95      0.81      0.88        48
          16       0.74      0.80    

### 3. Hyperparameter tuning with successive-halving grid search

In [6]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Candidate hyperparameters: every combination of C, gamma and kernel is tried.
# NOTE(review): gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels, so
# the 'linear' candidates that differ only in gamma are effectively duplicates.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# Successive halving evaluates candidates on random subsamples of the training
# rows in its early iterations. A fixed random_state makes that subsampling,
# and therefore the selected parameters, repeatable across runs.
search = HalvingGridSearchCV(
    SVC(class_weight='balanced'),
    param_grid,
    cv=5,
    factor=3,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train_scale, y_train)

print(search.best_params_)
print(search.best_estimator_)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, class_weight='balanced', gamma=0.1)


In [7]:
# Predict on the held-out split through the fitted search object.
y_pred_tuned = search.predict(X_test_scale)

# Score the held-out predictions, then report.
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_report = classification_report(y_test, y_pred_tuned)

print("============ After Hyperparameter tuning ============")
print("SVM Accuracy: {}".format(tuned_accuracy))
print("Classification Report:\n", tuned_report)

SVM Accuracy: 0.816
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76        51
           1       0.89      0.72      0.80        47
           2       0.84      0.96      0.89        48
           3       0.88      1.00      0.94        37
           4       0.80      0.80      0.80        44
           5       0.83      0.88      0.85        56
           6       0.87      0.90      0.88        58
           7       0.76      0.79      0.78        53
           8       0.67      0.69      0.68        49
           9       0.66      0.66      0.66        38
          10       0.81      0.85      0.83        55
          11       0.69      0.80      0.74        45
          12       0.90      0.95      0.92        55
          13       0.63      0.60      0.61        45
          14       0.69      0.71      0.70        52
          15       0.95      0.81      0.88        48
          16       0.74      0.80    