In [1]:
import pandas as pd
import numpy as np

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Reshape, LayerNormalization, MultiHeadAttention, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, CSVLogger

In [3]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,  f1_score, accuracy_score

In [4]:
# Feature columns kept from the raw CSVs (the 'label' column is appended
# separately when the frames are sliced).
selected_features = [
    'destination_port',
    'init_win_bytes_backward',
    'init_win_bytes_forward',
    'bwd_packets_s',
    'fwd_iat_min',
    'min_seg_size_forward',
    'flow_iat_min',
    'flow_duration',
    'total_length_of_fwd_packets',
    'total_backward_packets',
    'bwd_iat_min',
    'bwd_packet_length_std',
    'fwd_iat_total',
    'fwd_packet_length_mean',
    'fwd_packet_length_max',
    'flow_iat_std',
    'fwd_packets_s',
    'down_up_ratio',
    'total_fwd_packets',
    'bwd_packet_length_min',
    'flow_bytes_s',
    'bwd_header_length',
    'packet_length_mean',
    'total_length_of_bwd_packets',
]

In [5]:
def change_datatype(df):
    """Downcast 64-bit numeric columns of ``df`` to a narrower dtype.

    * ``int64`` columns become ``int8``, ``int16`` or ``int32`` -- the first
      of these whose range contains every value of the column.
    * ``float64`` columns become ``float16`` only when the column's range
      fits *and* every value survives the conversion exactly; otherwise they
      become ``float32`` when the range fits.
    * Every other column (object, bool, already-narrow numerics, ...) is left
      untouched and its values are never inspected, so string or mixed-type
      columns cannot make the function fail.

    Columns whose min/max is NaN (empty or all-NaN) or infinite fail every
    range check and therefore keep their original dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    int_targets = (np.int8, np.int16, np.int32)
    half_limits = np.finfo(np.float16)
    single_limits = np.finfo(np.float32)

    for col in df.columns:
        col_dtype = df[col].dtype

        if col_dtype == np.int64:
            min_val, max_val = df[col].min(), df[col].max()
            # Try the candidates from narrowest to widest; stop at the first fit.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= min_val and max_val <= limits.max:
                    df[col] = df[col].astype(target)
                    break

        elif col_dtype == np.float64:
            min_val, max_val = df[col].min(), df[col].max()

            if half_limits.min <= min_val and max_val <= half_limits.max:
                # float16 keeps only ~3 significant decimal digits, so a range
                # check alone is not enough: accept it only if it is lossless.
                as_half = df[col].astype(np.float16)
                lossless = np.array_equal(
                    as_half.to_numpy(dtype=np.float64),
                    df[col].to_numpy(),
                    equal_nan=True,
                )
                if lossless:
                    df[col] = as_half
                    continue

            if single_limits.min <= min_val and max_val <= single_limits.max:
                df[col] = df[col].astype(np.float32)

    return df

# Without oversampling

In [6]:
# Read each raw CSV, downcast its numeric columns, then keep only the
# selected feature columns plus the 'label' column.
df_train = pd.read_csv("./raw/raw_cutoff_trainset.csv")
df_train = change_datatype(df_train)[selected_features + ['label']]

df_test = pd.read_csv("./raw/raw_testset.csv")
df_test = change_datatype(df_test)[selected_features + ['label']]

In [7]:
df_train.shape

(1251919, 25)

In [8]:
df_test.shape

(756690, 25)

In [9]:
# Separate the target column from the feature columns for both splits.
y_train = df_train['label']
X_train = df_train.drop(columns = 'label')

y_test = df_test['label']
X_test = df_test.drop(columns = 'label')

In [10]:
# Quantile-based feature scaler with a uniform output distribution and a
# fixed random_state.
scaler = QuantileTransformer(
    output_distribution = "uniform",
    n_quantiles = 10000,
    random_state = 6969,
)

In [12]:
# Learn the quantiles from the training features only, then apply the same
# fitted transform to both splits.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
X_train.shape

(1251919, 24)

In [16]:
# Append a trailing axis of length 1: (rows, features) -> (rows, features, 1),
# matching the Input(shape = (features, 1)) layer of the model.
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_train.shape

(1251919, 24, 1)

In [18]:
# Append a trailing axis of length 1: (rows, features) -> (rows, features, 1),
# matching the Input(shape = (features, 1)) layer of the model.
X_test = X_test.reshape(*X_test.shape[:2], 1)
X_test.shape

(756690, 24, 1)

In [23]:
y_train.unique().shape[0]

10

## CNN Model

In [26]:
# 1D CNN classifier: three Conv1D -> MaxPooling1D -> Dropout stages (the first
# one layer-normalised), followed by a dense head with one softmax unit per
# distinct value found in y_train.
model = Sequential([
    Input(shape = (X_train.shape[1], 1)),

    # Stage 1
    Conv1D(filters = 512, kernel_size = 3, padding = "same", activation = "relu"),
    LayerNormalization(),
    MaxPooling1D(pool_size = 2),
    Dropout(rate = 0.5),

    # Stage 2
    Conv1D(filters = 128, kernel_size = 3, padding = "same", activation = "relu"),
    MaxPooling1D(pool_size = 2),
    Dropout(rate = 0.5),

    # Stage 3
    Conv1D(filters = 64, kernel_size = 4, padding = "same", activation = "relu"),
    MaxPooling1D(pool_size = 2),
    Dropout(rate = 0.5),

    # Classification head
    Flatten(),
    Dense(units = 128, activation = "relu"),
    Dropout(rate = 0.3),
    Dense(units = len(y_train.unique()), activation = "softmax"),
])

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 24, 512)           2048      
                                                                 
 layer_normalization_1 (Lay  (None, 24, 512)           1024      
 erNormalization)                                                
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 12, 512)           0         
 g1D)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 12, 512)           0         
                                                                 
 conv1d_3 (Conv1D)           (None, 12, 128)           196736    
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 6, 128)            0

In [28]:
# Sparse categorical cross-entropy loss (labels are passed as y_train, not
# one-hot encoded here), Adam at learning rate 1e-3, accuracy as the metric.
model.compile(
    optimizer = Adam(learning_rate = 0.001),
    loss = SparseCategoricalCrossentropy(),
    metrics = ['accuracy'],
)

In [30]:
# Save to disk only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath = "./checkpoint/model_weights.h5",
    monitor = 'val_loss',
    mode = 'min',
    save_best_only = True,
    verbose = 1,
)

In [33]:
# Multiply the learning rate by 0.2 once val_loss has failed to improve for
# 5 epochs, with a 1-epoch cooldown and a floor of 1e-5.
# The keyword must be spelled `patience`: a misspelling such as `npatience`
# is not applied as the patience value, leaving the callback on its default.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 5,
                              min_lr = 1e-5, cooldown = 1, mode = "min", verbose = 1)

In [34]:
# Train the CNN; both callbacks monitor val_loss computed on validation_data.
# NOTE(review): validation_data is the test split, so checkpoint selection and
# LR scheduling see the test set -- metrics later reported on it are optimistic.
# NOTE(review): the stored cell output reads "Epoch N/150" while this call asks
# for 15 epochs; the output looks stale -- confirm the intended epoch count.
hist = model.fit(
    x = X_train,
    y = y_train,
    batch_size = 512,
    epochs = 15,
    validation_data = (X_test, y_test),
    callbacks = [checkpoint, reduce_lr],
)

Epoch 1/150
Epoch 1: val_loss improved from inf to 0.01966, saving model to ./checkpoint\model_weights.h5
Epoch 2/150


  saving_api.save_model(


Epoch 2: val_loss improved from 0.01966 to 0.01625, saving model to ./checkpoint\model_weights.h5
Epoch 3/150
Epoch 3: val_loss improved from 0.01625 to 0.01444, saving model to ./checkpoint\model_weights.h5
Epoch 4/150
Epoch 4: val_loss improved from 0.01444 to 0.01353, saving model to ./checkpoint\model_weights.h5
Epoch 5/150
Epoch 5: val_loss improved from 0.01353 to 0.01301, saving model to ./checkpoint\model_weights.h5
Epoch 6/150
Epoch 6: val_loss improved from 0.01301 to 0.01288, saving model to ./checkpoint\model_weights.h5
Epoch 7/150
Epoch 7: val_loss improved from 0.01288 to 0.00986, saving model to ./checkpoint\model_weights.h5
Epoch 8/150
Epoch 8: val_loss did not improve from 0.00986
Epoch 9/150
Epoch 9: val_loss improved from 0.00986 to 0.00975, saving model to ./checkpoint\model_weights.h5
Epoch 10/150
Epoch 10: val_loss improved from 0.00975 to 0.00969, saving model to ./checkpoint\model_weights.h5
Epoch 11/150
Epoch 11: val_loss improved from 0.00969 to 0.00902, savin