# **DDoS Model Training (SVM)**
---

## **Imports and Configuration**
---

In [1]:
import json
import numpy as np
import os
import pandas as pd
import pickle
import warnings

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
# --- Input data layout -------------------------------------------------------
# Everything for one dataset lives under data/clean/<dataset_name>/
dataset_name = '75-20-05-udplag.syn'
input_path = os.path.join('data/clean', dataset_name)

# Sub-folders of the dataset directory: pickled encoders, normalisation
# statistics and the CSV splits read later in the notebook.
encoders_path, stats_path, data_path = (
    os.path.join(input_path, subfolder)
    for subfolder in ('encoders', 'stats', 'split-sets-balanced-smote')
)

# --- Model output layout -----------------------------------------------------
# Change model_type / model_subtype when training a different model family.
model_type, model_subtype = 'svm', 'svc'
models_path = os.path.join('models', model_type)

## **Load Encoders and Normalizing Stats**
---

In [4]:
def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Encoders for the target column
label_encoder = load_pickle(os.path.join(encoders_path, 'label-encoder.pkl'))
onehot_encoder = load_pickle(os.path.join(encoders_path, 'onehot-encoder.pkl'))

# Scaler for the feature matrix
standard_scaler = load_pickle(os.path.join(stats_path, 'standard-scaler.pkl'))

## **Load Data**
---

In [5]:
def read_split(name):
    """Read `<data_path>/<name>.csv` into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, f'{name}.csv'))


# Features / labels for the train, validation and test splits
X_train, y_train = read_split('X_train'), read_split('y_train')
X_val, y_val = read_split('X_val'), read_split('y_val')
X_test, y_test = read_split('X_test'), read_split('y_test')

In [6]:
# Distinct class names present in the training labels, in order of appearance
unique_labels = pd.unique(y_train['Label'])
print(f'Unique labels = {unique_labels}')

Unique labels = ['Syn/UDPLag' 'DNS/LDAP' 'NetBIOS/Portmap' 'MSSQL' 'SNMP' 'SSDP/UDP' 'NTP'
 'TFTP' 'BENIGN']


## **Normalize and Transform (X, y) Data**
---

In [7]:
# Standard normalization of X with the scaler loaded from 'standard-scaler.pkl'
# (only `transform` is called here; nothing is fitted in this notebook).
# NOTE: each split is overwritten with its transformed version, so re-running
# this cell without reloading the data applies the scaler a second time.
X_train = standard_scaler.transform(X_train)
X_val = standard_scaler.transform(X_val)
X_test = standard_scaler.transform(X_test)

In [None]:
# Optional L2 normalization and quantile transformation of X.
#
# This notebook never loads `l2_normalizer` or `quantile_transformer`, so
# running this cell unconditionally raised a NameError on a fresh kernel.
# The step is now applied only when BOTH objects have been defined beforehand
# (e.g. unpickled in an earlier cell); otherwise it is skipped, which keeps
# the features standard-scaled only.
optional_transformer_names = ('l2_normalizer', 'quantile_transformer')
missing_transformers = [
    name for name in optional_transformer_names if name not in globals()
]

if missing_transformers:
    print(f'Skipping L2/quantile transformation; not loaded: {missing_transformers}')
else:
    # Same order as before: L2 normalization first, then quantile transform.
    X_train = l2_normalizer.transform(X_train)
    X_val = l2_normalizer.transform(X_val)
    X_test = l2_normalizer.transform(X_test)

    X_train = quantile_transformer.transform(X_train)
    X_val = quantile_transformer.transform(X_val)
    X_test = quantile_transformer.transform(X_test)

In [8]:
# One-hot targets are derived first, while y_* still hold the raw label
# frames; the label-encoding step below overwrites y_* afterwards.
y_train_onehot, y_val_onehot, y_test_onehot = (
    onehot_encoder.transform(raw_labels)
    for raw_labels in (y_train, y_val, y_test)
)

# Replace the raw label frames with their label-encoded form
y_train, y_val, y_test = (
    label_encoder.transform(raw_labels)
    for raw_labels in (y_train, y_val, y_test)
)

In [9]:
# Shape report: one group of three lines per split, groups separated by a
# blank line, labels right-aligned to a common width of 20 characters.
shape_groups = [
    [('X_train', X_train), ('y_train', y_train), ('y_train_onehot', y_train_onehot)],
    [('X_val', X_val), ('y_val', y_val), ('y_val_onehot', y_val_onehot)],
    [('X_test', X_test), ('y_test', y_test), ('y_test_onehot', y_test_onehot)],
]

print('\n\n'.join(
    '\n'.join(f'{name + ".shape":>20} = {array.shape}' for name, array in group)
    for group in shape_groups
))

       X_train.shape = (11916819, 78)
       y_train.shape = (11916819,)
y_train_onehot.shape = (11916819, 9)

         X_val.shape = (1770568, 78)
         y_val.shape = (1770568,)
  y_val_onehot.shape = (1770568, 9)

        X_test.shape = (442643, 78)
        y_test.shape = (442643,)
 y_test_onehot.shape = (442643, 9)


## **Model Settings**
---

In [11]:
# Model hyperparameter definition
# TODO: should these be tuned with a grid search instead of being fixed?
model_props = {
    'C': 1.0,
    'kernel': 'rbf',
    'degree': 3,  # only relevant to the 'poly' kernel; ignored by 'rbf'
    'gamma': 'scale',
    # Enables predict_proba; the probability estimates are fitted with an
    # internal, randomly shuffled cross-validation (see 'random_state').
    'probability': True,
    # Hard cap on solver iterations.
    # NOTE(review): the recorded test accuracy is ~0.11 and warnings are
    # silenced in this notebook - check whether the solver is stopping at this
    # cap before converging.
    'max_iter': 1000,
    'decision_function_shape': 'ovr',
    # Fixed seed so the shuffling used for the probability estimates (and
    # therefore the fitted model) is reproducible between runs.
    'random_state': 42,
}

In [12]:
# Build the (still unfitted) classifier from the hyperparameter dict;
# verbose=True turns on the solver's progress output during fitting.
model = SVC(verbose=True, **model_props)
model

<h2>
    Grid Search
    <span style="color: red; font-size: 10px;">Not currently in use</span>
</h2>

---

In [12]:
# Search space for GridSearchCV: currently a single combination
# (one kernel, one C value, one gamma setting).
grid_props = dict(
    kernel=['rbf'],
    C=[5],
    gamma=['scale'],
)

In [13]:
# 5-fold cross-validated search over `grid_props`, scored by accuracy
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid_props,
    scoring='accuracy',
    cv=5,
    verbose=3,
)

In [None]:
# Run the cross-validated search on the full training split.
# NOTE(review): the section heading marks the grid search as unused, yet this
# cell still executes on "Run All". The recorded output for this cell reports
# 270 candidates, while `grid_props` as written defines a single combination,
# so that output appears to come from an earlier version of the grid.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


In [None]:
# Show the best parameter combination found by the search
# (only available once `grid_search.fit` has completed).
grid_search.best_params_

## **Model Training and Persistence**
---

In [13]:
# Model training: fit the SVC directly on the scaled training features and the
# label-encoded targets (the grid-search object is not involved here).
model.fit(X_train, y_train)

[LibSVM]

In [14]:
# Define normalize method name
normalize_name = 'standard-v1'

# Prefix with the model subtype so the file name identifies both the
# estimator and the normalisation scheme, e.g. 'svc-standard-v1'.
normalize_name = f'{model_subtype}-{normalize_name}'

# Make sure the output directory exists; otherwise open() below fails with
# FileNotFoundError and the freshly trained model is not persisted.
os.makedirs(models_path, exist_ok=True)

# Save model
with open(os.path.join(models_path, f'model-svm-{normalize_name}.object.pkl'), 'wb') as file:
    pickle.dump(model, file)

In [None]:
# Save model history
history = {'hyperparameters': model.get_params()}

# SVC does not expose `feature_importances_` (reading it unconditionally
# raised AttributeError), so the entry is only written for estimators that
# actually provide it.
feature_importances = getattr(model, 'feature_importances_', None)
if feature_importances is not None:
    history['feature_importances'] = [float(value) for value in feature_importances]

# Use the same 'model-svm-' prefix as the pickled model so both artefacts of
# this run share a file stem (the previous 'model-rf-' prefix did not match
# the model pickle written by this notebook).
with open(os.path.join(models_path, f'model-svm-{normalize_name}.history.json'), 'w') as file:
    json.dump(history, file)

## **Model Evaluation**
---

In [15]:
# Predict encoded class labels for the held-out test split
y_pred = model.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 table,
# with rows named after the label encoder's classes.
test_accuracy = accuracy_score(y_test, y_pred)
class_names = list(label_encoder.classes_)

print(f'Accuracy = {test_accuracy}\n')
print(classification_report(y_test, y_pred, target_names=class_names))

Accuracy = 0.11195026240107717

                 precision    recall  f1-score   support

         BENIGN       0.07      0.18      0.10      2827
       DNS/LDAP       0.01      0.00      0.00     40987
          MSSQL       0.25      0.00      0.00     48599
            NTP       0.00      0.00      0.00     48900
NetBIOS/Portmap       0.00      0.00      0.00     58680
           SNMP       0.00      0.00      0.00     49756
       SSDP/UDP       0.00      0.00      0.00     56214
     Syn/UDPLag       0.00      0.00      0.00     87766
           TFTP       0.19      1.00      0.32     48914

       accuracy                           0.11    442643
      macro avg       0.06      0.13      0.05    442643
   weighted avg       0.05      0.11      0.04    442643

