In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'limit_output': 20})
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
Nor_path = "./Dataset/Normal_mixed.csv"
col_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
                "land", "wrong_fragment", "urgent", "count", "srv_count", "serror_rate",
                "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
                "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
                "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
                "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
                "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]
Nor_df = pd.read_csv(Nor_path, header=None,names= col_names, nrows= 600000)

In [3]:
Abnor_path = "./Dataset/Abnormal.csv"
Abnor_df = pd.read_csv(Abnor_path, header=None,names= col_names, nrows= 500000)

In [4]:
Kdd_path = "./Dataset/kdd99_extracted.csv"
kdd99_df = pd.read_csv(Kdd_path, header=None,names= col_names, nrows= 400000)

kdd99_nor = kdd99_df[kdd99_df['label'] == 'Normal']
kdd99_abnor = kdd99_df[kdd99_df['label'] != 'Normal']

In [5]:
Train_nor_kdd99, Test_nor_kdd99 = train_test_split(kdd99_nor, test_size=0.3, random_state=1)
Train_abnor_kdd99, Test_abnor_kdd99 = train_test_split(kdd99_abnor, test_size=0.3, random_state=1) 

Train_kdd99 = pd.concat([Train_nor_kdd99, Train_abnor_kdd99], ignore_index=True)
Test_kdd99 = pd.concat([Test_nor_kdd99, Test_abnor_kdd99], ignore_index=True)

In [6]:
#Test ver 1: 4 classes attacks
Abnor_df_v1 = Abnor_df[Abnor_df['label'].isin(['FoT', 'DoS', 'DoS_Gas','BP'])]

#Test Ver 2:3 classes attacks
Abnor_df_v2 = Abnor_df[Abnor_df['label'].isin(['BP', 'DoS', 'DoS_Gas'])]

#Test ver 3: 2 classes attacks
Abnor_df_v3 = Abnor_df[Abnor_df['label'].isin(['BP', 'DoS'])]

#Test ver 4: 1 class attack
Abnor_df_v4 = Abnor_df[Abnor_df['label'].isin(['DoS'])]


In [7]:
Train_nor, Test_nor = train_test_split(Nor_df, test_size=0.2, random_state=1)
Train_abnor, Test_abnor = train_test_split(Abnor_df, test_size=0.2, random_state=1) 

Train = pd.concat([Train_nor, Train_abnor], ignore_index=True)
Test = pd.concat([Test_nor, Test_abnor], ignore_index=True)

In [8]:
Train_4classes = Train_abnor[Train_abnor['label'].isin(['FoT', 'DoS', 'DoS_Gas','BP'])]

Train_3classes = Train_abnor[Train_abnor['label'].isin(['DoS', 'DoS_Gas','BP'])]

Train_2classes = Train_abnor[Train_abnor['label'].isin(['DoS','BP'])]

Train_1class = Train_abnor[Train_abnor['label'].isin(['DoS'])]

In [9]:
def one_hot_encode(data):
    unique_labels = [
    ["tcp", "udp", "icmp"],
    ["other", "private", "ecr_i", "urp_i", "urh_i", "red_i", "eco_i", "tim_i", "oth_i", "domain_u", "tftp_u", "ntp_u", "IRC", 
                "X11", "Z39_50", "aol", "auth", "bgp", "courier", "csnet_ns", "ctf", "daytime", "discard", "domain", "echo", "efs", "exec", 
                "finger", "ftp", "ftp_data", "gopher", "harvest", "hostnames", "http", "http_2784", "http_443", "http_8001", "icmp", "imap4",
                "iso_tsap", "klogin", "kshell", "ldap", "link", "login", "mtp", "name", "netbios_dgm", "netbios_ns", "netbios_ssn", "netstat",
                "nnsp", "nntp", "pm_dump", "pop_2", "pop_3", "printer", "remote_job", "rje", "shell", "smtp", "sql_net", "ssh", "sunrpc", 
                "supdup", "systat", "telnet", "time", "uucp", "uucp_path", "vmnet", "whois"],
    ["SF", "S0", "S1", "S2", "S3", "REJ", "RSTOS0", "RSTO", "RSTR", "SH", "RSTRH", "SHR", "OTH"],
    ["Normal","OaU","DoS","DoS_Gas","FoT","BP"]
    ]
    encoded_data = []
    # Thực hiện mã hóa one-hot
    for row in data:
        encoding = []
        for i, column_value in enumerate(row):
            unique_column_values = unique_labels[i]
            encoding.extend([1 if column_value == unique else 0 for unique in unique_column_values])
        encoded_data.append(encoding)
    return np.array(encoded_data)

In [10]:
encoder = OneHotEncoder( handle_unknown='ignore')
scaler = MinMaxScaler()

In [11]:
def preprocess_test(df, is_fit=True):
    # chuyển normal thành 1 và các lớp khác thành 0
    label = df['label'].map(lambda x: 'Abnormal' if x != 'Normal' else x)

    # loại bỏ cột dữ liệu không cần thiết
    #df = df.drop(["label"], axis=1)
    df = df.drop(["land", "wrong_fragment",  "urgent", "rerror_rate",  "srv_rerror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"], axis=1)

    # chia dữ liệu ra số, chữ để tiện xử lý
    numerical_data = df.select_dtypes(exclude='object').values
    categorical_data = df.select_dtypes(include='object').values

    categorical_data = one_hot_encode(categorical_data)

    # nối dữ liệu số và onehot lại
    data = np.concatenate([numerical_data, categorical_data], axis=1)

    # chỉ fit với dữ liệu train
    if is_fit:
        scaler.fit(data)

    # dữ liệu chuẩn hóa về dạng [0, 1]
    data = scaler.transform(data)

    return dict(data=data, label=label)

In [12]:
def preprocess(df, is_fit=True):
    # chuyển normal thành 1 và các lớp khác thành 0
    label = df['label'].map(lambda x: 'Abnormal' if x != 'Normal' else x)

    # loại bỏ cột dữ liệu không cần thiết
    df = df.drop(["land", "wrong_fragment",  "urgent", "rerror_rate",  "srv_rerror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"], axis=1)

    # chia dữ liệu ra số, chữ để tiện xử lý
    numerical_data = df.select_dtypes(exclude='object').values
    categorical_data = df.select_dtypes(include='object').values

    # chỉ fit với dữ liệu train
    if is_fit:
        encoder.fit(categorical_data)

    # chuyển từ dữ liệu chữ sang onehot
    categorical_data = encoder.transform(categorical_data).toarray()

    # nối dữ liệu số và onehot lại
    data = np.concatenate([numerical_data, categorical_data], axis=1)

    # chỉ fit với dữ liệu train
    if is_fit:
        scaler.fit(data)

    # dữ liệu chuẩn hóa về dạng [0, 1]
    data = scaler.transform(data)

    return dict(data=data, label=label)

In [13]:
#Dùng hàm preprocess_test
Train_nor = preprocess_test(Train_nor, True)
Train_abnor = preprocess_test(Train_abnor, False)

In [14]:
Train_nor['data'].shape , Train_abnor['data'].shape

((480000, 106), (293936, 106))

In [15]:
test = preprocess_test(Test, False)
Test_nor = test['data'][test['label'] == 'Normal']
Test_abnor = test['data'][test['label'] == 'Abnormal']

In [16]:
Test_nor.shape , Test_abnor.shape

((120000, 106), (73484, 106))

In [17]:
# Train_nor = preprocess(Train_nor, True)
# Train_abnor = preprocess(Train_abnor, False)

In [18]:
Train_abnor_4clss = preprocess_test(Train_4classes, False)

Train_abnor_3clss = preprocess_test(Train_3classes, False)

Train_abnor_2clss = preprocess_test(Train_2classes, False)

Train_abnor_1clss = preprocess_test(Train_1class, False)

In [19]:
Train_abnor_2clss['data'].shape

(100415, 106)

In [20]:
Test_4class =  Test[Test['label'].isin(['FoT','DoS_Gas','DoS','BP','Normal'])]

Test_3class =  Test[Test['label'].isin(['DoS_Gas','DoS','BP','Normal'])]

Test_2class =  Test[Test['label'].isin(['DoS','BP','Normal'])]

Test_1class =  Test[Test['label'].isin(['DoS','Normal'])]

In [21]:
Test_3class['label'].unique()

array(['Normal', 'BP', 'DoS_Gas', 'DoS'], dtype=object)

In [22]:
# #Trường hợp 4 classes
# test = preprocess_test(Test_4class, False)
# Test_nor = test['data'][test['label'] == 'Normal']
# Test_abnor = test['data'][test['label'] == 'Abnormal']

In [23]:
# #Trường hợp 3 classes
# test = preprocess_test(Test_3class, False)
# Test_nor = test['data'][test['label'] == 'Normal']
# Test_abnor = test['data'][test['label'] == 'Abnormal']

In [24]:
# #Trường hợp 2 classes
# test = preprocess_test(Test_2class, False)
# Test_nor = test['data'][test['label'] == 'Normal']
# Test_abnor = test['data'][test['label'] == 'Abnormal']

In [25]:
# #Trường hợp 1 class
# test = preprocess_test(Test_1class, False)
# Test_nor = test['data'][test['label'] == 'Normal']
# Test_abnor = test['data'][test['label'] == 'Abnormal']

In [26]:
class Autoencoder(keras.Model):
  def __init__(self, input_dim):
    super(Autoencoder, self).__init__()
    self.encoder = keras.Sequential([
      keras.layers.Dense(input_dim, activation='tanh'),
      keras.layers.Dense(52, activation='tanh'),
      keras.layers.Dense(26, activation='tanh'),
      keras.layers.Dense(13, activation='tanh')
    ])
    self.decoder = keras.Sequential([
      #keras.layers.Dense(13, activation='tanh'),
      keras.layers.Dense(26, activation='tanh'),
      keras.layers.Dense(52, activation='tanh'),
      keras.layers.Dense(input_dim, activation='sigmoid'),
    ])

  def call(self, x):
    code = self.encoder(x)
    r = self.decoder(code)
    return r

  def get_reconstruction_error(self, x, batch_size=10000):
    r = self.predict(x, batch_size)
    return keras.metrics.mean_squared_error(x, r)

  def predict_class(self, x, threshold, batch_size=10000):
    reconstruction_error = self.get_reconstruction_error(x, batch_size)
    return np.where(reconstruction_error <= threshold, 'Normal', 'Abnormal')

In [27]:
model = Autoencoder(Train_nor['data'].shape[1])
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
loss_fn = keras.losses.MeanSquaredError()
model.compile(optimizer, loss_fn)

In [28]:
# model.fit(Train_nor['data'], Train_nor['data'], batch_size=64, epochs=200, shuffle=True)

In [29]:
#model.save('Model/AE_fullclass_200_new.model', save_format='tf')  # The file needs to end with the .keras extension

model_path = 'Model/AE_fullclass_200_new.model'
model.load_weights(model_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2a8640c3f40>

In [30]:
train_normal_re = model.get_reconstruction_error(Train_nor['data'], batch_size=10000)
train_abnormal_re = model.get_reconstruction_error(Train_abnor['data'], batch_size=10000)

test_normal_re = model.get_reconstruction_error(Test_nor, batch_size=10000)
test_abnormal_re = model.get_reconstruction_error(Test_abnor, batch_size=10000)



In [31]:
sigma = 5
alpha = 0.5

In [39]:
sigma_threshold = np.percentile(train_normal_re, 100 - sigma)
sigma_threshold

8.027085959838585e-06

In [38]:
alpha = 0.5
threshold = np.concatenate([train_normal_re,train_abnormal_re]).mean() * alpha
print('Ngưỡng vừa tìm được:', threshold)

Ngưỡng vừa tìm được: 0.00028194053447805345


In [34]:
Train_nor['data']
train = np.concatenate((Train_nor['data'], Train_abnor['data']), axis=0)
label = np.concatenate((Train_nor['label'], Train_abnor['label']), axis=0)

In [36]:
sigma_threshold = 9.500000000000026e-06

In [40]:
# label_predict = model.predict_class(Test_nor, threshold)
# print('Độ chính xác tập khi tái tạo normal tập huấn luyện', end=': ')
# print(accuracy_score(np.full([2000], 'Normal'), label_predict))

label_predict1 = model.predict_class(train, sigma_threshold)
print('Độ chính xác tập huấn luyện với full class', end=': ')
print(accuracy_score(label, label_predict1))

label_predict = model.predict_class(test['data'], sigma_threshold)
print('Độ chính xác tập test với full class', end=': ')
print(accuracy_score(test['label'], label_predict))

# labels = ['Normal', 'Abnormal']

# matrix2 = confusion_matrix(label, label_predict1)
# disp1 = ConfusionMatrixDisplay(
#     confusion_matrix=matrix2, display_labels=labels)

# disp1.plot(cmap=plt.cm.Blues)
# print(classification_report(label,label_predict1,target_names=labels))
# print(classification_report(test['label'],label_predict,target_names=labels))

Độ chính xác tập huấn luyện với full class: 0.7303950197432346
Độ chính xác tập test với full class: 0.7301792396270492


In [48]:
threshold = 8e-06               # Initial threshold
best_threshold = threshold      # Initial best threshold
step = 1e-7                     # Initial step
decay = 0.5                     # Decay rate
num_decay = 5                   # Number of decay times
pre = 0                         # Previous accuracy
cur = 1e-9                      # Current accuracy
best_acc = 1                    # Initial best accuracy
occ = 10                        # Occurence of the previous accuracy better than the current one
count = 0                       # Counter

for d_i in range (num_decay):
    for i in range (1000):
        pre = cur
        pred = model.predict_class(train, threshold, 10000)
        acc  = accuracy_score(label, pred)
        threshold = threshold + step
        cur = acc
        print("Accuracy:", acc, "\nThreshold:", threshold)

        # If the previous accuracy is better than the current one. Plus the counter and store the best threshold and accuracy
        if (pre > cur):
            count = count + 1
            cur = pre
            best_threshold = threshold - count*step
            best_acc = pre
        else:
            count = 0
            best_threshold = threshold
            continue

        if count == occ + 1:
            step = step * decay
            threshold = best_threshold
            count = 0
            cur = best_acc
            print("--------------------------------------------------")
            print("Best accuracy:", best_acc, "\nFinal threshold:", best_threshold)
            break    

Accuracy: 0.730325246532013 
Threshold: 8.1e-06
Accuracy: 0.7306017551838911 
Threshold: 8.200000000000001e-06
Accuracy: 0.7308071985280437 
Threshold: 8.300000000000002e-06
Accuracy: 0.7309454528539827 
Threshold: 8.400000000000003e-06
Accuracy: 0.7310643257323604 
Threshold: 8.500000000000003e-06
Accuracy: 0.7311922432862665 
Threshold: 8.600000000000004e-06
Accuracy: 0.7313524632527755 
Threshold: 8.700000000000005e-06
Accuracy: 0.7315062227367638 
Threshold: 8.800000000000006e-06
Accuracy: 0.7315915011060347 
Threshold: 8.900000000000006e-06
Accuracy: 0.7316690268962809 
Threshold: 9.000000000000007e-06
Accuracy: 0.7317478447830311 
Threshold: 9.100000000000008e-06
Accuracy: 0.7318421678278307 
Threshold: 9.200000000000008e-06
Accuracy: 0.7319339066796221 
Threshold: 9.30000000000001e-06
Accuracy: 0.7319403671621426 
Threshold: 9.40000000000001e-06
Accuracy: 0.731983006346778 
Threshold: 9.50000000000001e-06
Accuracy: 0.731883514915962 
Threshold: 9.600000000000011e-06
Accuracy: 0.