In [21]:
import json
import numpy as np
import os
import pandas as pd
import pickle
import tensorflow as tf
import tensorflow.keras.layers as tfl
import warnings

from scipy.stats import mode

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised by the
# estimators fitted further down - consider narrowing it.
warnings.filterwarnings(action='ignore')

In [3]:
# Input locations: everything for this dataset lives under
# data/clean/<dataset_name>, with the pickled encoders and the
# 'split-sets-balanced-smote' split files in their own sub-directories.
dataset_name = '75-20-05-udplag.syn'

input_path = os.path.join('data/clean', dataset_name)
data_path = os.path.join(input_path, 'split-sets-balanced-smote')
encoders_path = os.path.join(input_path, 'encoders')

# Output locations: one directory per layer-1 model type. The list order
# matters because model_paths is indexed by position when models are saved.
model_types = ['logistic-regression', 'ridge-classifier', 'neural-network']
layer_dir = os.path.join('models', 'layer-1')
model_paths = [os.path.join(layer_dir, type_name) for type_name in model_types]

In [4]:
# Read the validation and test splits from data_path. The X_prob_* frames are
# the model inputs and the y_* frames the matching labels.
# NOTE(review): judging by the file names, X_prob_* presumably holds
# probability outputs of earlier models - confirm against the producer.
X_prob_val, y_val, X_prob_test, y_test = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('X_prob_val.csv', 'y_val.csv', 'X_prob_test.csv', 'y_test.csv')
)

In [5]:
# Restore the two encoders stored under encoders_path. They are used with
# transform() only, so they are expected to be already fitted.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so these artifacts must come from a trusted source.
label_encoder_file = os.path.join(encoders_path, 'label-encoder.pkl')
with open(label_encoder_file, 'rb') as stream:
    label_encoder = pickle.load(stream)

onehot_encoder_file = os.path.join(encoders_path, 'onehot-encoder.pkl')
with open(onehot_encoder_file, 'rb') as stream:
    onehot_encoder = pickle.load(stream)

In [6]:
# One-hot targets (used to train the neural network). These must be computed
# first, while y_val / y_test still hold the raw labels read from CSV.
y_val_onehot, y_test_onehot = (
    onehot_encoder.transform(raw_labels) for raw_labels in (y_val, y_test)
)

# Integer class indices (used by the scikit-learn models and the metrics).
# NOTE(review): y_val / y_test are overwritten here, so re-running this cell
# would feed already-encoded labels back into both encoders.
y_val, y_test = (
    label_encoder.transform(raw_labels) for raw_labels in (y_val, y_test)
)

In [7]:
# Layer-1 model 1: logistic regression with scikit-learn's default settings.
model_1 = LogisticRegression()
# Bare expression so the notebook renders the (not yet fitted) estimator.
model_1

In [8]:
# Train on the validation split: X_prob_val as inputs, label-encoded y_val
# as targets. The test split is kept aside for the evaluation cell.
model_1.fit(X_prob_val, y_val)
# Bare expression so the notebook renders the fitted estimator.
model_1

In [9]:
# Predict class indices for the test split; y_pred_1 is reused later by the
# majority-vote ensemble.
y_pred_1 = model_1.predict(X_prob_test)

# Report overall accuracy followed by the per-class breakdown, using the
# original label names stored on the label encoder.
print('Metrics for test:\n')
accuracy_1 = accuracy_score(y_test, y_pred_1)
print(f'Accuracy = {accuracy_1}\n')
class_names = list(label_encoder.classes_)
print(classification_report(y_test, y_pred_1, target_names=class_names))

Metrics for test:

Accuracy = 0.9123876351823027

                 precision    recall  f1-score   support

         BENIGN       0.99      1.00      1.00      2827
       DNS/LDAP       0.83      0.82      0.83     40987
          MSSQL       0.93      0.94      0.93     48599
            NTP       0.99      1.00      1.00     48900
NetBIOS/Portmap       0.92      0.89      0.90     58680
           SNMP       0.75      0.80      0.77     49756
       SSDP/UDP       0.95      0.95      0.95     56214
     Syn/UDPLag       0.93      0.96      0.95     87766
           TFTP       0.98      0.88      0.92     48914

       accuracy                           0.91    442643
      macro avg       0.92      0.92      0.92    442643
   weighted avg       0.91      0.91      0.91    442643



In [10]:
# Define version (tag embedded in the artifact file name)
version = 'v1'

# Make sure the output directory exists: open(..., 'wb') creates the file but
# not missing parent directories, so a fresh checkout would fail here.
os.makedirs(model_paths[0], exist_ok=True)

# Save model: pickle the fitted logistic regression (model_1)
with open(os.path.join(model_paths[0], f'model-lr-{version}.object.pkl'), 'wb') as file:
    pickle.dump(model_1, file)

In [11]:
# Layer-1 model 2: ridge classifier with scikit-learn's default settings.
model_2 = RidgeClassifier()
# Bare expression so the notebook renders the (not yet fitted) estimator.
model_2

In [12]:
# Train on the validation split: X_prob_val as inputs, label-encoded y_val
# as targets. The test split is kept aside for the evaluation cell.
model_2.fit(X_prob_val, y_val)
# Bare expression so the notebook renders the fitted estimator.
model_2

In [13]:
# Predict class indices for the test split; y_pred_2 is reused later by the
# majority-vote ensemble.
y_pred_2 = model_2.predict(X_prob_test)

# Report overall accuracy followed by the per-class breakdown, using the
# original label names stored on the label encoder.
print('Metrics for test:\n')
accuracy_2 = accuracy_score(y_test, y_pred_2)
print(f'Accuracy = {accuracy_2}\n')
class_names = list(label_encoder.classes_)
print(classification_report(y_test, y_pred_2, target_names=class_names))

Metrics for test:

Accuracy = 0.9127219904076197

                 precision    recall  f1-score   support

         BENIGN       0.99      1.00      1.00      2827
       DNS/LDAP       0.83      0.82      0.83     40987
          MSSQL       0.93      0.94      0.93     48599
            NTP       0.99      1.00      1.00     48900
NetBIOS/Portmap       0.93      0.88      0.90     58680
           SNMP       0.75      0.81      0.78     49756
       SSDP/UDP       0.95      0.95      0.95     56214
     Syn/UDPLag       0.93      0.96      0.95     87766
           TFTP       0.98      0.87      0.92     48914

       accuracy                           0.91    442643
      macro avg       0.92      0.92      0.92    442643
   weighted avg       0.91      0.91      0.91    442643



In [14]:
# Define version (tag embedded in the artifact file name)
version = 'v1'

# Make sure the output directory exists: open(..., 'wb') creates the file but
# not missing parent directories, so a fresh checkout would fail here.
os.makedirs(model_paths[1], exist_ok=True)

# Save model: pickle the fitted ridge classifier (model_2)
with open(os.path.join(model_paths[1], f'model-rc-{version}.object.pkl'), 'wb') as file:
    pickle.dump(model_2, file)

In [15]:
# Hyperparameters for the neural-network model.

# Layer sizes come from the data: one input per column of X_prob_val and one
# output per column of the one-hot encoded targets.
input_dim = X_prob_val.shape[-1]
n_classes = y_val_onehot.shape[-1]

# Optimisation settings
learning_rate = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Regularisation and training-loop settings
dropout_rate = 0.3
batch_size = 256
epochs = 20

In [16]:
# Model architecture: a single 64-unit hidden layer with dropout, followed by
# a softmax output with one unit per class.
# The input shape is declared with an explicit Input layer rather than the
# `input_shape=` argument on the first Dense layer, which current Keras
# releases flag as deprecated for Sequential models.
model_3 = tf.keras.Sequential([
    tfl.Input(shape=(input_dim,)),
    tfl.Dense(64, activation='leaky_relu'),
    tfl.Dropout(dropout_rate),
    tfl.Dense(n_classes, activation='softmax')
])

# Show architecture summary
model_3.summary()

In [17]:
# Compile the model.
# A fresh Adam instance is built here instead of reusing the shared
# `optimizer` object: a Keras optimizer becomes tied to the variables of the
# first model it trains, so reusing it after model_3 has been rebuilt can
# fail when the architecture and compile cells are re-run.
model_3.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Model training on the validation split with the one-hot targets.
# toarray() yields a plain ndarray, whereas todense() returns the legacy
# np.matrix type; the values are the same.
history = model_3.fit(
    X_prob_val,
    y_val_onehot.toarray(),
    batch_size=batch_size,
    epochs=epochs
)

Epoch 1/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.8473 - loss: 0.6941
Epoch 2/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9117 - loss: 0.2740
Epoch 3/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9126 - loss: 0.2669
Epoch 4/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9127 - loss: 0.2648
Epoch 5/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9130 - loss: 0.2620
Epoch 6/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9126 - loss: 0.2619
Epoch 7/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9135 - loss: 0.2604
Epoch 8/20
[1m6917/6917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9128 - loss: 0.2602
Epoch 9/20
[1m6

In [18]:
# Softmax outputs of the network for the test split.
class_probs_3 = model_3.predict(X_prob_test, batch_size=batch_size)
# Keep the index of the highest-scoring class per row so the result is
# comparable with the label-encoded y_test; y_pred_3 is reused by the ensemble.
y_pred_3 = class_probs_3.argmax(axis=1)

# Report overall accuracy followed by the per-class breakdown, using the
# original label names stored on the label encoder.
print('Metrics for test:\n')
accuracy_3 = accuracy_score(y_test, y_pred_3)
print(f'Accuracy = {accuracy_3}\n')
class_names = list(label_encoder.classes_)
print(classification_report(y_test, y_pred_3, target_names=class_names))

[1m1730/1730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Metrics for test:

Accuracy = 0.9132099683040283

                 precision    recall  f1-score   support

         BENIGN       0.99      1.00      1.00      2827
       DNS/LDAP       0.83      0.82      0.83     40987
          MSSQL       0.93      0.94      0.93     48599
            NTP       0.99      1.00      1.00     48900
NetBIOS/Portmap       0.93      0.88      0.90     58680
           SNMP       0.75      0.81      0.78     49756
       SSDP/UDP       0.95      0.95      0.95     56214
     Syn/UDPLag       0.93      0.97      0.95     87766
           TFTP       0.99      0.87      0.93     48914

       accuracy                           0.91    442643
      macro avg       0.92      0.92      0.92    442643
   weighted avg       0.92      0.91      0.91    442643



In [22]:
# Define version (tag embedded in the artifact file names)
version = 'v1'

# Make sure the output directory exists: neither open(..., 'w') nor
# save_weights is relied on to create missing parent directories, so a fresh
# checkout would otherwise fail on the first write below.
os.makedirs(model_paths[2], exist_ok=True)

# Save model architecture (JSON description of the layers, no weights)
with open(os.path.join(model_paths[2], f'model-nn-{version}.architecture.json'), 'w') as file:
    file.write(model_3.to_json())

# Save model weights
model_3.save_weights(os.path.join(model_paths[2], f'model-nn-{version}.weights.h5'))

# Save model history (the per-epoch metrics recorded by fit)
with open(os.path.join(model_paths[2], f'model-nn-{version}.history.json'), 'w') as file:
    json.dump(history.history, file)

In [23]:
# Hard-voting ensemble: stack the three prediction vectors as rows, one row
# of class indices per model.
y_preds = np.vstack((y_pred_1, y_pred_2, y_pred_3))

# The majority vote for each sample is the column-wise mode; the vote counts
# are not needed. The result is flattened to a 1-D label vector.
vote = mode(y_preds, axis=0)
y_pred = vote.mode.flatten()

# Report overall accuracy followed by the per-class breakdown, using the
# original label names stored on the label encoder.
print('Metrics for test in ensemble:\n')
ensemble_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy = {ensemble_accuracy}\n')
class_names = list(label_encoder.classes_)
print(classification_report(y_test, y_pred, target_names=class_names))

Metrics for test in ensemble:

Accuracy = 0.9126135508750844

                 precision    recall  f1-score   support

         BENIGN       0.99      1.00      1.00      2827
       DNS/LDAP       0.83      0.82      0.83     40987
          MSSQL       0.93      0.94      0.93     48599
            NTP       0.99      1.00      1.00     48900
NetBIOS/Portmap       0.93      0.88      0.90     58680
           SNMP       0.75      0.81      0.78     49756
       SSDP/UDP       0.95      0.95      0.95     56214
     Syn/UDPLag       0.93      0.96      0.95     87766
           TFTP       0.98      0.88      0.92     48914

       accuracy                           0.91    442643
      macro avg       0.92      0.92      0.92    442643
   weighted avg       0.91      0.91      0.91    442643

