# **DDoS Validation Dataset Generation**
---

## **Imports and Configuration**
---

In [22]:
import numpy as np
import os
import pandas as pd
import pickle
import tensorflow as tf
import warnings

from itertools import product

from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

from tensorflow.keras.models import model_from_json

In [2]:
# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
# Name of the cleaned dataset variant this notebook reads from
dataset_name = '75-20-05-udplag.syn'

# Root folder of that dataset; the three sub-folders below hang off it
input_path = os.path.join('data/clean', dataset_name)
data_path = os.path.join(input_path, 'split-sets-balanced-smote')
stats_path = os.path.join(input_path, 'stats')
encoders_path = os.path.join(input_path, 'encoders')

# One directory per model family; later cells index model_paths by position
model_types = ['neural-network', 'random-forest', 'decision-tree', 'knn', 'gradient-boost']
model_paths = [os.path.join('models', family) for family in model_types]

## **Load Validation and Test Datasets**
---

In [4]:
# Read the validation and test splits (features and labels) from data_path.
# The generator is consumed in order, so the files are read in the same
# sequence as four separate read_csv calls would.
X_val, y_val, X_test, y_test = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('X_val.csv', 'y_val.csv', 'X_test.csv', 'y_test.csv')
)

## **Load Encoders and Normalizing Stats**
---

In [5]:
# Restore the pickled standard scaler stored under the stats directory
scaler_pickle_path = os.path.join(stats_path, 'standard-scaler.pkl')
with open(scaler_pickle_path, mode='rb') as scaler_handle:
    standard_scaler = pickle.load(scaler_handle)

## **Normalize and Transform Validation and Test Data**
---

In [6]:
# Standard normalization of X.
# The splits arrive from pd.read_csv as DataFrames, and by default (unless the
# scaler was configured with set_output) transform() returns NumPy arrays.
# The isinstance guards therefore make this cell idempotent: re-running it
# does not apply the scaler a second time to data that is already normalized.
if isinstance(X_val, pd.DataFrame):
    X_val = standard_scaler.transform(X_val)
if isinstance(X_test, pd.DataFrame):
    X_test = standard_scaler.transform(X_test)

In [7]:
# Report the shape of every split, one per line
print(
    f' X_val.shape = {X_val.shape}',
    f' y_val.shape = {y_val.shape}',
    f'X_test.shape = {X_test.shape}',
    f'y_test.shape = {y_test.shape}',
    sep='\n',
)

 X_val.shape = (1770568, 78)
 y_val.shape = (1770568, 1)
X_test.shape = (442643, 78)
y_test.shape = (442643, 1)


## **Load Models**
---

### ***Neural Network***

In [8]:
# Tag embedded in the neural-network artifact file names
normalize_name = 'standard-v4'

# Rebuild the network from its serialized JSON architecture
architecture_path = os.path.join(model_paths[0], f'model-nn-{normalize_name}.architecture.json')
with open(architecture_path, mode='r') as architecture_file:
    model_nn = model_from_json(architecture_file.read())

# Restore the stored weights into the rebuilt network
weights_path = os.path.join(model_paths[0], f'model-nn-{normalize_name}.weights.h5')
model_nn.load_weights(weights_path)

# Print the model summary
model_nn.summary()

### ***Random Forest***

In [9]:
# Tag embedded in the random-forest artifact file name
normalize_name = 'standard-v2'

# Unpickle the stored random-forest model object
rf_pickle_path = os.path.join(model_paths[1], f'model-rf-{normalize_name}.object.pkl')
with open(rf_pickle_path, mode='rb') as rf_handle:
    model_rf = pickle.load(rf_handle)

# Display the loaded estimator
model_rf

### ***Decision Tree***

In [10]:
# Tag embedded in the decision-tree artifact file name
normalize_name = 'standard-v2'

# Unpickle the stored decision-tree model object
dt_pickle_path = os.path.join(model_paths[2], f'model-dt-{normalize_name}.object.pkl')
with open(dt_pickle_path, mode='rb') as dt_handle:
    model_dt = pickle.load(dt_handle)

# Display the loaded estimator
model_dt

### ***KNN***

In [11]:
# Tag embedded in the KNN artifact file name
normalize_name = 'standard-v2'

# Unpickle the stored KNN model object
kn_pickle_path = os.path.join(model_paths[3], f'model-kn-{normalize_name}.object.pkl')
with open(kn_pickle_path, mode='rb') as kn_handle:
    model_kn = pickle.load(kn_handle)

# Display the loaded estimator
model_kn

### ***Gradient Boosting***

In [12]:
# Tag embedded in the gradient-boosting artifact file name
normalize_name = 'standard-v2'

# Unpickle the stored gradient-boosting model object
gb_pickle_path = os.path.join(model_paths[4], f'model-gb-{normalize_name}.object.pkl')
with open(gb_pickle_path, mode='rb') as gb_handle:
    model_gb = pickle.load(gb_handle)

# Display the loaded estimator
model_gb

## **Predict Probabilities**
---

### ***Neural Network***

In [14]:
# Neural-network predictions for the validation split, then the test split
y_prob_nn_val = model_nn.predict(X_val)
y_prob_nn_test = model_nn.predict(X_test)

# Report both output shapes, one per line
print(y_prob_nn_val.shape, y_prob_nn_test.shape, sep='\n')

55331/55331 ━━━━━━━━━━━━━━━━━━━━ 90s 2ms/step
13833/13833 ━━━━━━━━━━━━━━━━━━━━ 23s 2ms/step
(1770568, 9)
(442643, 9)


### ***Random Forest***

In [15]:
# Random-forest class probabilities for the validation split, then the test split
y_prob_rf_val = model_rf.predict_proba(X_val)
y_prob_rf_test = model_rf.predict_proba(X_test)

# Report both output shapes, one per line
print(y_prob_rf_val.shape, y_prob_rf_test.shape, sep='\n')

[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done  71 out of 100 | elapsed:   10.4s remaining:    4.2s
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:   12.9s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done  71 out of 100 | elapsed:    2.4s remaining:    0.9s


(1770568, 9)
(442643, 9)


[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    3.0s finished


### ***Decision Tree***

In [16]:
# Decision-tree class probabilities for the validation split, then the test split
y_prob_dt_val = model_dt.predict_proba(X_val)
y_prob_dt_test = model_dt.predict_proba(X_test)

# Report both output shapes, one per line
print(y_prob_dt_val.shape, y_prob_dt_test.shape, sep='\n')

(1770568, 9)
(442643, 9)


### ***KNN***

In [18]:
# KNN class probabilities for the validation split, then the test split
y_prob_kn_val = model_kn.predict_proba(X_val)
y_prob_kn_test = model_kn.predict_proba(X_test)

# Report both output shapes, one per line
print(y_prob_kn_val.shape, y_prob_kn_test.shape, sep='\n')

(1770568, 9)
(442643, 9)


### ***Gradient Boosting***

In [17]:
# Gradient-boosting class probabilities for the validation split, then the test split
y_prob_gb_val = model_gb.predict_proba(X_val)
y_prob_gb_test = model_gb.predict_proba(X_test)

# Report both output shapes, one per line
print(y_prob_gb_val.shape, y_prob_gb_test.shape, sep='\n')

(1770568, 9)
(442643, 9)


## **Generate Datasets**
---

In [29]:
# Restore the pickled label encoder from the encoders directory
with open(os.path.join(encoders_path, 'label-encoder.pkl'), mode='rb') as encoder_handle:
    label_encoder = pickle.load(encoder_handle)

# Class labels, in the encoder's own order
labels = list(label_encoder.classes_)

# Short identifier for each model
model_ids = ['NN', 'RF', 'DT', 'KN', 'GB']

# One column per (model, label) pair, model-major and label-minor.
# NOTE(review): assumes each model emits its probability columns in the
# encoder's class order — confirm against the training notebooks.
column_names = [
    f'{label}_{model_id}_PROB'
    for model_id in model_ids
    for label in labels
]
column_names

['BENIGN_NN_PROB',
 'DNS/LDAP_NN_PROB',
 'MSSQL_NN_PROB',
 'NTP_NN_PROB',
 'NetBIOS/Portmap_NN_PROB',
 'SNMP_NN_PROB',
 'SSDP/UDP_NN_PROB',
 'Syn/UDPLag_NN_PROB',
 'TFTP_NN_PROB',
 'BENIGN_RF_PROB',
 'DNS/LDAP_RF_PROB',
 'MSSQL_RF_PROB',
 'NTP_RF_PROB',
 'NetBIOS/Portmap_RF_PROB',
 'SNMP_RF_PROB',
 'SSDP/UDP_RF_PROB',
 'Syn/UDPLag_RF_PROB',
 'TFTP_RF_PROB',
 'BENIGN_DT_PROB',
 'DNS/LDAP_DT_PROB',
 'MSSQL_DT_PROB',
 'NTP_DT_PROB',
 'NetBIOS/Portmap_DT_PROB',
 'SNMP_DT_PROB',
 'SSDP/UDP_DT_PROB',
 'Syn/UDPLag_DT_PROB',
 'TFTP_DT_PROB',
 'BENIGN_KN_PROB',
 'DNS/LDAP_KN_PROB',
 'MSSQL_KN_PROB',
 'NTP_KN_PROB',
 'NetBIOS/Portmap_KN_PROB',
 'SNMP_KN_PROB',
 'SSDP/UDP_KN_PROB',
 'Syn/UDPLag_KN_PROB',
 'TFTP_KN_PROB',
 'BENIGN_GB_PROB',
 'DNS/LDAP_GB_PROB',
 'MSSQL_GB_PROB',
 'NTP_GB_PROB',
 'NetBIOS/Portmap_GB_PROB',
 'SNMP_GB_PROB',
 'SSDP/UDP_GB_PROB',
 'Syn/UDPLag_GB_PROB',
 'TFTP_GB_PROB']

In [26]:
# Stack the per-model probability matrices side by side (column-wise), in the
# order NN, RF, DT, KN, GB.
# np.concatenate is used instead of np.concat: the latter only exists as an
# alias in NumPy >= 2.0 and raises AttributeError on older releases, while
# np.concatenate behaves identically on every NumPy version.
prob_values_val = np.concatenate([
    y_prob_nn_val,
    y_prob_rf_val,
    y_prob_dt_val,
    y_prob_kn_val,
    y_prob_gb_val], axis=1)

prob_values_test = np.concatenate([
    y_prob_nn_test,
    y_prob_rf_test,
    y_prob_dt_test,
    y_prob_kn_test,
    y_prob_gb_test], axis=1)

# Show the resulting shapes of both stacked matrices
print(prob_values_val.shape)
print(prob_values_test.shape)

(1770568, 45)
(442643, 45)


In [39]:
# Wrap the stacked validation probabilities in a labelled DataFrame
df_prob_val = pd.DataFrame(data=prob_values_val, columns=column_names)
df_prob_val

Unnamed: 0,BENIGN_NN_PROB,DNS/LDAP_NN_PROB,MSSQL_NN_PROB,NTP_NN_PROB,NetBIOS/Portmap_NN_PROB,SNMP_NN_PROB,SSDP/UDP_NN_PROB,Syn/UDPLag_NN_PROB,TFTP_NN_PROB,BENIGN_RF_PROB,...,TFTP_KN_PROB,BENIGN_GB_PROB,DNS/LDAP_GB_PROB,MSSQL_GB_PROB,NTP_GB_PROB,NetBIOS/Portmap_GB_PROB,SNMP_GB_PROB,SSDP/UDP_GB_PROB,Syn/UDPLag_GB_PROB,TFTP_GB_PROB
0,4.450096e-09,3.652971e-04,3.679143e-07,7.043646e-04,8.040428e-01,1.948857e-01,1.238563e-07,2.991003e-07,1.016034e-06,0.0,...,0.0,0.000369,0.000510,0.000422,0.000407,0.900966,0.095980,0.000444,0.000527,0.000375
1,1.336685e-07,8.529316e-01,3.455677e-02,1.302071e-07,8.653852e-04,1.105747e-01,1.017169e-03,2.886152e-05,2.520559e-05,0.0,...,0.0,0.000455,0.868378,0.034801,0.000451,0.000783,0.092503,0.001661,0.000485,0.000482
2,2.543411e-06,3.236395e-03,8.933601e-06,5.479277e-04,6.906306e-01,3.055602e-01,1.199190e-06,4.318598e-06,7.828927e-06,0.0,...,0.0,0.000501,0.001204,0.000572,0.000552,0.816140,0.179055,0.000576,0.000892,0.000508
3,1.524835e-31,5.826189e-16,1.764758e-08,0.000000e+00,1.497550e-10,2.213200e-11,1.314478e-11,1.191886e-16,1.000000e+00,0.0,...,1.0,0.000013,0.000024,0.000022,0.000013,0.000020,0.000022,0.000030,0.000020,0.999835
4,0.000000e+00,6.052246e-32,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.0,...,0.0,0.000031,0.000030,0.000034,0.000027,0.000041,0.000031,0.000061,0.999716,0.000029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1770563,7.810503e-11,1.753871e-06,1.043076e-06,1.506603e-06,3.905487e-07,1.365711e-06,9.744582e-01,2.553581e-02,4.881410e-10,0.0,...,0.0,0.000103,0.000148,0.000118,0.000113,0.000132,0.000127,0.995189,0.003959,0.000110
1770564,1.813594e-22,2.191192e-17,6.742382e-22,1.000000e+00,2.715196e-21,5.752569e-20,1.378928e-18,1.969982e-23,8.017250e-32,0.0,...,0.0,0.000062,0.000081,0.000019,0.999714,0.000031,0.000017,0.000038,0.000022,0.000016
1770565,3.646619e-07,2.559514e-01,2.096698e-02,5.038382e-10,3.899527e-04,6.992705e-01,2.218714e-02,1.158676e-03,7.500845e-05,0.0,...,0.0,0.000622,0.278175,0.021110,0.000617,0.000957,0.669414,0.027783,0.000664,0.000659
1770566,2.716757e-07,1.418529e-08,2.566639e-09,2.396295e-12,3.184271e-08,5.334827e-08,5.123499e-09,9.999995e-01,3.127927e-12,0.0,...,0.0,0.000023,0.000027,0.000025,0.000024,0.000036,0.000029,0.000026,0.999782,0.000029


In [40]:
# Wrap the stacked test probabilities in a labelled DataFrame
df_prob_test = pd.DataFrame(data=prob_values_test, columns=column_names)
df_prob_test

Unnamed: 0,BENIGN_NN_PROB,DNS/LDAP_NN_PROB,MSSQL_NN_PROB,NTP_NN_PROB,NetBIOS/Portmap_NN_PROB,SNMP_NN_PROB,SSDP/UDP_NN_PROB,Syn/UDPLag_NN_PROB,TFTP_NN_PROB,BENIGN_RF_PROB,...,TFTP_KN_PROB,BENIGN_GB_PROB,DNS/LDAP_GB_PROB,MSSQL_GB_PROB,NTP_GB_PROB,NetBIOS/Portmap_GB_PROB,SNMP_GB_PROB,SSDP/UDP_GB_PROB,Syn/UDPLag_GB_PROB,TFTP_GB_PROB
0,3.679741e-11,3.339829e-07,3.366743e-08,2.544459e-07,1.329002e-08,2.816180e-07,9.710048e-01,2.899430e-02,2.459354e-13,0.0,...,0.0,0.000105,0.000151,0.000120,0.000115,0.000130,0.000129,0.994772,0.004365,0.000112
1,1.423489e-30,2.512006e-14,5.062687e-07,0.000000e+00,1.103167e-09,6.254433e-10,1.057567e-10,4.248405e-16,9.999995e-01,0.0,...,1.0,0.000013,0.000024,0.000069,0.000013,0.000024,0.000021,0.000024,0.000018,0.999793
2,4.969105e-08,4.625040e-02,8.878650e-01,7.262385e-07,2.928071e-02,2.985514e-04,3.424244e-02,2.062084e-03,6.865003e-23,0.0,...,0.0,0.000546,0.052376,0.830425,0.000618,0.069450,0.000951,0.039971,0.005108,0.000554
3,2.765367e-09,2.701445e-03,9.388187e-01,3.017278e-08,3.916147e-02,4.223057e-05,1.894573e-02,3.304186e-04,5.376159e-20,0.0,...,0.0,0.000267,0.009137,0.957553,0.000302,0.021517,0.000439,0.010105,0.000410,0.000271
4,1.421904e-30,2.516691e-14,5.039753e-07,0.000000e+00,1.100246e-09,6.209194e-10,1.052235e-10,4.245198e-16,9.999995e-01,0.0,...,1.0,0.000013,0.000024,0.000083,0.000013,0.000024,0.000021,0.000027,0.000018,0.999776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442638,9.927040e-10,5.305906e-04,1.253430e-05,3.581478e-04,8.004267e-01,1.986681e-01,7.237868e-07,1.777865e-07,3.027253e-06,0.0,...,0.0,0.000223,0.000379,0.000254,0.000245,0.972620,0.025397,0.000256,0.000399,0.000226
442639,1.336688e-07,8.529317e-01,3.455677e-02,1.302070e-07,8.653848e-04,1.105747e-01,1.017169e-03,2.886150e-05,2.520557e-05,0.0,...,0.0,0.000433,0.781604,0.028924,0.000429,0.000672,0.185858,0.001157,0.000462,0.000459
442640,1.504887e-31,5.734418e-16,1.737349e-08,0.000000e+00,1.481379e-10,2.193089e-11,1.299625e-11,1.175343e-16,1.000000e+00,0.0,...,1.0,0.000013,0.000024,0.000022,0.000013,0.000020,0.000022,0.000029,0.000018,0.999839
442641,1.555428e-05,8.257524e-06,1.411813e-05,2.469464e-08,9.556248e-06,3.819151e-05,3.529673e-04,5.451797e-01,4.543816e-01,0.0,...,0.6,0.000571,0.000671,0.000633,0.000605,0.000644,0.000642,0.001097,0.498168,0.496969


## **Save Probabilities Datasets**
---

In [41]:
# Write the validation probabilities into data_path as CSV, without the index column
val_output_path = os.path.join(data_path, 'X_prob_val.csv')
df_prob_val.to_csv(val_output_path, index=False)

In [42]:
# Write the test probabilities into data_path as CSV, without the index column
test_output_path = os.path.join(data_path, 'X_prob_test.csv')
df_prob_test.to_csv(test_output_path, index=False)