In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model

import seaborn as sns
import re
import datetime
import os

%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSVs are read from there.
from google.colab import drive
import glob
drive.mount('/content/drive')

Mounted at /content/drive


# 데이터 준비 및 전처리


In [None]:
# --- Load ---------------------------------------------------------------
# Collect the CICIDS2017 CSV files from Drive. Sorting makes the concat order
# (and therefore frames[0]) independent of the filesystem listing order.
files = sorted(glob.glob("/content/drive/MyDrive/CICIDS2017**/*.csv", recursive=True))
if not files:
    raise FileNotFoundError(
        "No CSV files matched /content/drive/MyDrive/CICIDS2017**/*.csv - is Drive mounted?"
    )
for f in files:
    print(f)
frames = [pd.read_csv(f) for f in files]
print(frames[0].shape)

# --- Merge into one DataFrame and drop duplicates -------------------------
# NOTE(review): keep=False discards *every* copy of a duplicated row, not only
# the extra copies. Left unchanged to preserve the existing sample counts;
# use the default keep="first" if one copy should be retained.
dataset = (
    pd.concat(frames, ignore_index=True)
    .drop_duplicates(keep=False)
    .reset_index(drop=True)
)
del frames  # the per-file frames are no longer needed

# Remove every space from the column names.
col_names = [col.replace(' ', '') for col in dataset.columns]
dataset.columns = col_names

# Drop an unneeded column (presumably a duplicate of FwdHeaderLength - confirm).
dataset = dataset.drop(columns=["FwdHeaderLength.1"])

# --- Tidy the label values ------------------------------------------------
# Keep only ASCII letters and spaces, then turn each run of whitespace into a
# single underscore. NaN labels are skipped here (re.sub cannot handle them)
# and removed by the dropna below.
raw_labels = dataset['Label'].dropna().unique()
label_names = [
    re.sub(r"\s+", "_", re.sub(r"[^a-zA-Z ]+", "", raw))
    for raw in raw_labels
]
# One vectorised lookup instead of one full-column replace per label.
dataset['Label'] = dataset['Label'].map(dict(zip(raw_labels, label_names)))

# --- Remove missing values ------------------------------------------------
dataset = dataset.dropna()

# --- Remove rows containing +/-inf ----------------------------------------
# Cast the feature columns to float64, turn infinities into NaN, re-attach the
# label column (aligned on the index) and drop the affected rows.
numeric = (
    dataset.drop(columns='Label')
    .astype('float64')
    .replace([np.inf, -np.inf], np.nan)
)
dataset = numeric.assign(Label=dataset['Label']).dropna()
del numeric

# --- Scale the features ---------------------------------------------------
labels = dataset['Label']
features = dataset.drop(columns='Label')

# MinMaxScaler was picked as an experiment (other scalers are imported as well).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set min/max values leak into preprocessing. Fitting on the
# training rows only would require moving this step after the split.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

/content/drive/MyDrive/CICIDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
/content/drive/MyDrive/CICIDS2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
/content/drive/MyDrive/CICIDS2017/Friday-WorkingHours-Morning.pcap_ISCX.csv
/content/drive/MyDrive/CICIDS2017/Monday-WorkingHours.pcap_ISCX.csv
/content/drive/MyDrive/CICIDS2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
/content/drive/MyDrive/CICIDS2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
/content/drive/MyDrive/CICIDS2017/Tuesday-WorkingHours.pcap_ISCX.csv
/content/drive/MyDrive/CICIDS2017/Wednesday-workingHours.pcap_ISCX.csv
(225745, 79)


In [None]:
# Summarise the cleaned frame (columns, dtypes, non-null counts).
dataset.info()

14개 공격 유형으로 분류하면 일부 공격의 비율이 너무 낮아서 DoS 계열만 예측되었던 것 같다.
또한 CICIDS 2017과 2018의 공격 분류 체계가 서로 달라서, 비교 표에 있는 기준에 따라 8가지 클래스로 합쳤다.

In [None]:
# Maps the cleaned CICIDS2017 label names onto coarser attack categories.
# Labels that are not listed here (BENIGN, DDoS, Infiltration) keep their name.
attack_class = {
    # Denial of service
    'DoS_GoldenEye': 'DoS',
    'DoS_Hulk': 'DoS',
    'DoS_Slowhttptest': 'DoS',
    'DoS_slowloris': 'DoS',
    # Credential brute forcing. SSH-Patator was previously filed under
    # 'Web Attack', inconsistent with its sibling FTP-Patator.
    'FTPPatator': 'Brute Force',
    'SSHPatator': 'Brute Force',
    # NOTE(review): this one is a web-application attack by name; confirm that
    # grouping it with the Patator brute-force traffic is intended.
    'Web_Attack_Brute_Force': 'Brute Force',
    # Web attacks
    'Web_Attack_Sql_Injection': 'Web Attack',
    'Web_Attack_XSS': 'Web Attack',
    # NOTE(review): Heartbleed is a TLS exploit rather than a web-application
    # attack - confirm this grouping against the comparison table.
    'Heartbleed': 'Web Attack',
    # Others
    'Bot': 'Botnet',
    'PortScan': 'Port Scan',
}

In [None]:
# Collapse the fine-grained labels into the coarse categories of attack_class,
# then list the distinct classes that remain.
merged_label = dataset['Label'].replace(attack_class)
dataset = dataset.assign(Label=merged_label)
np.unique(dataset['Label'])

array(['BENIGN', 'Botnet', 'Brute Force', 'DDoS', 'DoS', 'Infiltration',
       'Port Scan', 'Web Attack'], dtype=object)

In [None]:
# Re-read the label column from the current dataset frame.
labels = dataset['Label']

categorical(text) 데이터를 numerical 데이터로 변환
https://azanewta.tistory.com/46

- Label encoding : 알파벳 오더 순으로 숫자 할당
- One-hot Encoding ( -> sklearn.preprocessing.OneHotEncoder) : 범주마다 열을 하나씩 만들고, 해당 범주의 열만 1, 나머지는 0으로 표시
( 범주가 4가지면 1000, 0100, 0010, 0001 이런 식 )


In [None]:
# Label encoding: turn each class name into an integer id.
LE = LabelEncoder().fit(labels)
labels = LE.transform(labels)

# A one-hot step (OneHotEncoder on labels.reshape(-1, 1)) was tried here and is
# disabled; the integer ids are used as they are.

np.unique(labels)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [None]:
# Train/test split (80/20).
# random_state pins the shuffle so a re-run reproduces the same partition.
# stratify keeps the class proportions equal in both parts, so the rare
# classes are represented in the test set.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=.2,
    random_state=42,
    stratify=labels,
)
features_train.shape, features_test.shape, labels_train.shape, labels_test.shape

((1940581, 77), (485146, 77), (1940581,), (485146,))

# 모델 학습 : 77개 특징 + softmax

In [None]:
# Release the large intermediates by rebinding their names to None.
# Plain assignment (unlike `del`) is also safe for names that do not exist yet.
dataset = finite = labl = d = features = labels = None

In [None]:
# Feed-forward classifier: 77 inputs -> 67 ReLU units -> dropout(0.2) -> softmax.
model = tf.keras.Sequential()
model.add(layers.Flatten(input_shape=(77,)))
model.add(layers.Dense(67, activation='relu'))
model.add(layers.Dropout(0.2))
# Softmax output layer for classification: one unit per label class (8 classes).
model.add(layers.Dense(8, activation='softmax'))

In [None]:
# Adam optimiser with sparse categorical cross-entropy; accuracy is tracked.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)

# TensorBoard logs go to a timestamped folder under train_logs/.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join("train_logs", run_stamp)
tensorboard_callback = tf.keras.callbacks.TensorBoard(histogram_freq=1, log_dir=log_dir)

# Early stopping watches the *training* loss (patience 10) and restores the
# best weights. The name is spelled "eary" because the fit cell refers to it
# under exactly this name.
eary_stop_callback = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='loss',
)

In [None]:
%tensorflow_version 2.x
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Train for up to 20 epochs with TensorBoard logging and early stopping.
training_callbacks = [tensorboard_callback, eary_stop_callback]
model.fit(
    x=features_train,
    y=labels_train,
    epochs=20,
    callbacks=training_callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fdb328b4c10>

In [None]:
# Loss and accuracy on the held-out test split (one summary line of output).
model.evaluate(
    x=features_test,
    y=labels_test,
    verbose=2,
)

15161/15161 - 34s - loss: 0.0284 - accuracy: 0.9896 - 34s/epoch - 2ms/step


[0.028355594724416733, 0.9895639419555664]

In [None]:
# Model output for every test sample (the final layer is an 8-unit softmax).
predictions = model.predict(features_test)

In [None]:
# Predicted class id = position of the largest value in each prediction row.
idx = pd.DataFrame({'Label': predictions.argmax(axis=1)})

In [None]:
# Display the predicted class ids.
idx

In [None]:
# Predicted class ids as a Series, for use with the sklearn metric functions.
predicted_label = idx['Label']

In [None]:
# Which class ids does the model ever predict?
np.unique(predicted_label)

array([0, 1, 2, 3, 4, 6, 7])

In [None]:
# Which class ids are actually present in the test labels?
np.unique(labels_test)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [None]:
# Aggregate metrics on the test set.
# The four numbers used to come out identical: for single-label multiclass
# data, average='micro' makes precision, recall and F1 all equal to accuracy.
# Macro averaging weights every class equally, so classes the model handles
# badly show up in the score. zero_division=0 scores a class without any
# predicted (or true) samples as 0 instead of emitting a warning.

from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_auc_score, roc_curve

accuracy = accuracy_score(labels_test, predicted_label)
precision = precision_score(labels_test, predicted_label, average='macro', zero_division=0)
recall = recall_score(labels_test, predicted_label, average='macro', zero_division=0)
f1 = f1_score(labels_test, predicted_label, average='macro', zero_division=0)
accuracy, precision, recall, f1

(0.9895639663111723,
 0.9895639663111723,
 0.9895639663111723,
 0.9895639663111723)

In [None]:
# Saving the model to Google Drive.
model_path = os.path.join("/content/drive/MyDrive/", "AEwithLabel_sy.h5") # the model trained on the 8 merged labels is saved under the "sy" name
model.save(model_path)