In [1]:
import tensorflow.compat.v1 as tf
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from dagmm.dagmm import DAGMM
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score, precision_score, recall_score, accuracy_score

In [2]:
# 手动添加特征，训练集42，测试集41

feather = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 
           'num_failed_logins', 'logged_in', 'lnum_compromised', 'lroot_shell', 'lsu_attempted', 'lnum_root', 'lnum_file_creations', 
           'lnum_shells', 'lnum_access_files', 'lnum_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 
           'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 
           'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 
           'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label']

In [3]:
# TRAIN_DATA_PATH = 'C:\\Users\\Qin\\Desktop\\UNSW-NB 15\\UNSW_NB15_training-set.csv'
TRAIN_DATA_PATH = 'C:\\Users\\Qin\\Desktop\\kddcup\\train_10_percent_kddcup.csv'
TEST_DATA_PATH = 'C:\\Users\\Qin\\Desktop\\kddcup\\test_10_percent_kddcup.csv'

In [4]:
train_df = pd.read_csv(TRAIN_DATA_PATH, header = None, names = feather)
test_df = pd.read_csv(TEST_DATA_PATH, header = None, names = feather)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
train_df = train_df[train_df['label'] == 0]

In [6]:
y_label = test_df.iloc[ : , 41 : 42]

In [7]:
# 删除所有分类列
train_df.drop(['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login', 'label'], axis = 1, inplace = True)
# 删除无标准偏差的列
train_df.drop(['wrong_fragment', 'urgent', 'num_failed_logins', 'lsu_attempted', 'lnum_file_creations', 'lnum_outbound_cmds'], axis = 1, inplace = True)

# 删除所有分类列
test_df.drop(['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login', 'label'], axis = 1, inplace = True)
# 删除无标准偏差的列
test_df.drop(['wrong_fragment', 'urgent', 'num_failed_logins', 'lsu_attempted', 'lnum_file_creations', 'lnum_outbound_cmds'], axis = 1, inplace = True)

In [8]:
model = DAGMM(
    comp_hiddens=[60, 30, 10, 1], comp_activation=tf.nn.tanh,
    est_hiddens=[10, 4], est_dropout_ratio=0.5, est_activation=tf.nn.tanh,
    learning_rate=0.001, epoch_size=50, minibatch_size=1024, random_seed=1111
)

In [9]:
start = time.clock()
model.fit(train_df)
end = time.clock()
print('Time : ', (end - start))

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


  """Entry point for launching an IPython kernel.


Instructions for updating:
Use keras.layers.dropout instead.
 epoch 100/200 : loss = 15.464
 epoch 200/200 : loss = 14.596
Time :  278.92360049999996


  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
start = time.clock()
y_pred = model.predict(test_df)
end = time.clock()
print('Time : ', (end - start))

  """Entry point for launching an IPython kernel.


Time :  1.0884927000000175


  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# Energy thleshold to detect anomaly = 80% percentile of energies
anomaly_energy_threshold = np.percentile(y_pred, 60)
print(f"Energy thleshold to detect anomaly : {anomaly_energy_threshold:.3f}")

Energy thleshold to detect anomaly : 7.673


In [12]:
# Detect anomalies from test data
y_pred_flag = np.where(y_pred >= anomaly_energy_threshold, 1, 0)

In [13]:
y_labels = pd.Series([0 if i == 0 else 1 for i in y_label['label']])

In [15]:
y_labels

0         0
1         0
2         0
3         1
4         1
         ..
311024    0
311025    0
311026    0
311027    0
311028    0
Length: 311029, dtype: int64

In [16]:
y_pred_flag

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
prec, recall, fscore, _ = precision_recall_fscore_support(y_labels, y_pred_flag, average="binary")
auc = roc_auc_score(y_labels, y_pred_flag)
print(f" Precision = {prec:.5f}")
print(f" Recall    = {recall:.5f}")
print(f" F1-Score  = {fscore:.5f}")
print(f" AUC  = {auc:.5f}")

 Precision = 0.99524
 Recall    = 0.49764
 F1-Score  = 0.66351
 AUC  = 0.74390
