In [1]:
import tensorflow.compat.v1 as tf
import numpy as np
import pandas as pd
import time
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from dagmm.dagmm import DAGMM
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score, precision_score, recall_score, accuracy_score

In [2]:
first = time.clock()

  """Entry point for launching an IPython kernel.


In [3]:
# 手动添加特征，训练集42，测试集41

feather = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 
           'num_failed_logins', 'logged_in', 'lnum_compromised', 'lroot_shell', 'lsu_attempted', 'lnum_root', 'lnum_file_creations', 
           'lnum_shells', 'lnum_access_files', 'lnum_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 
           'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 
           'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 
           'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label']

In [4]:
# TRAIN_DATA_PATH = 'C:\\Users\\Qin\\Desktop\\UNSW-NB 15\\UNSW_NB15_training-set.csv'
TRAIN_DATA_PATH = 'C:\\Users\\Qin\\Desktop\\kddcup\\train_10_percent_kddcup.csv'
TEST_DATA_PATH = 'C:\\Users\\Qin\\Desktop\\kddcup\\test_10_percent_kddcup.csv'

In [5]:
train_df = pd.read_csv(TRAIN_DATA_PATH, header = None, names = feather)
test_df = pd.read_csv(TEST_DATA_PATH, header = None, names = feather)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
train_df = train_df[train_df['label'] == 0]

In [7]:
y_label = test_df.iloc[ : , 41 : 42]

In [8]:
# 删除所有分类列
train_df.drop(['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login', 'label'], axis = 1, inplace = True)
# 删除无标准偏差的列
train_df.drop(['wrong_fragment', 'urgent', 'num_failed_logins', 'lsu_attempted', 'lnum_file_creations', 'lnum_outbound_cmds'], axis = 1, inplace = True)

# 删除所有分类列
test_df.drop(['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login', 'label'], axis = 1, inplace = True)
# 删除无标准偏差的列
test_df.drop(['wrong_fragment', 'urgent', 'num_failed_logins', 'lsu_attempted', 'lnum_file_creations', 'lnum_outbound_cmds'], axis = 1, inplace = True)

In [9]:
# train
# 采用主成分分析法对数据进行降维，覆盖原始数据80%的方差
# ｎ＿components：指定希望PCA降维后的特征维度数目
# 最常用的做法是直接指定降维到的维度数目，此时n_components是一个大于等于1的整数。
# 当然，我们也可以指定主成分的方差和所占的最小比例阈值，让PCA类自己去根据样本特征方差来决定降维到的维度数，
# 此时n_components是一个（0，1]之间的数。
pca = PCA(n_components = 0.8)
pca.fit(train_df)

pca_cols = ['PCA_' + str(i) for i in range(pca.n_components_)]
train_df = pd.DataFrame(pca.transform(train_df), columns = pca_cols)
test_df = pd.DataFrame(pca.transform(test_df), columns = pca_cols)

In [10]:
model = DAGMM(
    comp_hiddens=[60, 30, 10, 1], comp_activation=tf.nn.tanh,
    est_hiddens=[10, 4], est_dropout_ratio=0.5, est_activation=tf.nn.tanh,
    learning_rate=0.001, epoch_size=50, minibatch_size=64, random_seed=1111
)

In [11]:
model = DAGMM(
    comp_hiddens=[60, 30, 10, 1], comp_activation=tf.nn.tanh,
    est_hiddens=[10, 4], est_dropout_ratio=0.5, est_activation=tf.nn.tanh,
    learning_rate=0.0001, epoch_size=200, minibatch_size=1024, random_seed=1111
)

In [12]:
start = time.clock()
model.fit(train_df)
end = time.clock()
print('Time : ', (end - start))

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use keras.layers.dropout instead.


  """Entry point for launching an IPython kernel.


 epoch 100/200 : loss = 0.895
 epoch 200/200 : loss = 0.713
Time :  215.9270139


  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
start = time.clock()
y_pred = model.predict(test_df)
end = time.clock()
print('Time : ', (end - start))

  """Entry point for launching an IPython kernel.


Time :  0.6798515000000123


  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Energy thleshold to detect anomaly = 80% percentile of energies
anomaly_energy_threshold = np.percentile(y_pred, 60)
print(f"Energy thleshold to detect anomaly : {anomaly_energy_threshold:.3f}")

Energy thleshold to detect anomaly : -5.188


In [15]:
# Detect anomalies from test data
y_pred_flag = np.where(y_pred >= anomaly_energy_threshold, 1, 0)

In [16]:
y_labels = pd.Series([0 if i == 0 else 1 for i in y_label['label']])

In [17]:
y_labels

0         0
1         0
2         0
3         1
4         1
         ..
311024    0
311025    0
311026    0
311027    0
311028    0
Length: 311029, dtype: int64

In [18]:
y_pred_flag

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
prec, recall, fscore, _ = precision_recall_fscore_support(y_labels, y_pred_flag, average="binary")
auc = roc_auc_score(y_labels, y_pred_flag)
print(f" Precision = {prec:.5f}")
print(f" Recall    = {recall:.5f}")
print(f" F1-Score  = {fscore:.5f}")
print(f" AUC  = {auc:.5f}")

 Precision = 0.92776
 Recall    = 0.46457
 F1-Score  = 0.61912
 AUC  = 0.65753


In [20]:
confusion_matrix(y_labels, y_pred_flag)

array([[ 51534,   9059],
       [134091, 116345]], dtype=int64)

In [21]:
(51534 + 116345) / len(y_labels)

0.5397535278060889

In [22]:
second = time.clock()

  """Entry point for launching an IPython kernel.


In [23]:
print('time : ', (second - first))

time :  220.1142396
