## NIST Standard 모델 테스트

* Malicious 데이터를 4가지로 분류하기

### 0. 필요한 라이브러리 import 및 Data 불러오기

#### 라이브러리 임포트

In [11]:
import classification_util
import pandas as pd

#### 데이터 불러오기

In [12]:
files={
    'CICI':'/home/irteam/dcloud-global-dir/MLAC/new_data/CICI.csv',
    'UNSW':'/home/irteam/dcloud-global-dir/MLAC/new_data/UNSW.csv'
}

In [13]:
CICI_data = pd.read_csv(files['CICI'])
UNSW_data = pd.read_csv(files['UNSW'])

총 3개의 lable : attack_category, label, nist_category

##### 데이터 살펴보기

In [14]:
CICI_data.head(1)

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,attack_category,label,nist_category
0,0.385385,0.969051,0.946613,0.878378,0.980855,0.898382,0.902052,0.0,0.953632,0.921887,...,0.98494,0.978687,0.910856,0.951187,0.994026,0.952075,0.923429,0,0.0,0


In [15]:
CICI_data['label'].value_counts()

0.0    5759318
1.0     704929
Name: label, dtype: int64

In [16]:
CICI_data['nist_category'].value_counts()

0    5759318
3     403181
1     176324
2     105629
4      19795
Name: nist_category, dtype: int64

In [17]:
UNSW_data.head(1)

Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,sload,dload,spkts,...,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl,attack_category,label,nist_category
0,0.104605,0.517017,0.0,0.435936,0.0,0.0,0.0,0.980584,0.0,0.586086,...,0,0,0,0,0,0,0,0,0.0,0


In [18]:
UNSW_data['label'].value_counts()

0.0    4157619
1.0     773973
Name: label, dtype: int64

In [19]:
UNSW_data['nist_category'].value_counts()

0    4157619
3     299798
4     218158
1     174968
2      81049
Name: nist_category, dtype: int64

#### 모델 불러오기

In [20]:
models = classification_util.getModels()

### 1. NIST Standard 모델 훈련 및 결과보기

#### 함수 수정하기 (미반영)

In [21]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [22]:
"""
Devide Data into Train & Test set
Input : type(binary VS multiclass)
Ouput : X_train, X_test, y_train, y_test set
Bring Preprocessed Data
"""
def TrainTestSplit(type, data):
    print('Dataset Split')
    if type == 'B':
        print('Binary Classification Dataset Split')
        target = data['label']
    elif type == 'N':
        # Remove Benign Data
        benign = data[data['label'] == 0].index
        data.drop(benign, inplace=True)
        target = data['nist_category']

    del data['attack_category']
    del data['label']
    del data['nist_category']
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, shuffle=True, stratify=target, random_state=34)
    return X_train, X_test, y_train, y_test

In [25]:
"""
Plot Confusion Matrix
"""
# confusion matrix plot
def plot_confusion_matrix(con_mat,labels,title:str,cmap=plt.cm.get_cmap('Blues'),normalize=False, confusion_path=None):
    plt.imshow(con_mat,interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()
    marks=np.arange(len(labels))
    nlabels=[]
    for k in range(len(con_mat)):
        n=sum(con_mat[k])
        nlabel='{0}(n={1})'.format(labels[k],n)
        nlabels.append(nlabel)

    plt.xticks(marks,labels,rotation=45)
    plt.yticks(marks,nlabels)

    thresh=con_mat.max()/2.
    if normalize:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, '{0}%'.format(con_mat[i, j] * 100 / n), horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    else:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, con_mat[i, j], horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    #plt.show()
    #이미지 저장
    plt.savefig(confusion_path+'/'+title+'.png',facecolor='#eeeeee',edgecolor='blue',pad_inches=0.5)
    plt.clf()

In [36]:
"""
Multiclass Classification
Input : file name, model list(from getModels function), train&test set(from getData function)
Output : Evaluation result as csv file, Confusion Matrix of each Model test
"""
def MultiClassification(dtype, type, file, models, X_train, X_test, y_train, y_test):
    accuracy = pd.DataFrame(columns=['Model','Acc','F1_mi','Recall_mi','Precision_mi','F1_ma','Recall_ma','Precision_ma','F1_we','Recall_we','Precision_we','Execution'])
    print('Model\tAcc\tF1_mi\tRecall_mi\tPrecision_mi\tF1_ma\tRecall_ma\tPrecision_ma\tF1_we\tRecall_we\tPrecision_we\tExecution')
    cnt = 0
    for name, model in models:
        start_time = time.time()
        # 모델 훈련 및 예측
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        # 지표 추출
        delta = time.time() - start_time
        acc = accuracy_score(y_test, prediction)
        f1_mi = f1_score(y_test, prediction,average='micro')
        f1_ma = f1_score(y_test, prediction,average='macro')
        f1_we = f1_score(y_test, prediction,average='weighted')
        recall_mi = recall_score(y_test, prediction, average='micro')
        recall_ma = recall_score(y_test, prediction, average='macro')
        recall_we = recall_score(y_test, prediction, average='weighted')
        precision_mi = precision_score(y_test, prediction, average='micro')
        precision_ma = precision_score(y_test, prediction, average='macro')
        precision_we = precision_score(y_test, prediction, average='weighted')
        confusion = metrics.confusion_matrix(y_test, prediction)
        # 저장
        accuracy.loc[cnt] = [name, acc, f1_mi, f1_ma, f1_we, recall_mi, recall_ma, recall_we, precision_mi, precision_ma, precision_we, delta]
        print('{}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f} secs'.format(name, acc, f1_mi, f1_ma, f1_we, recall_mi, recall_ma, recall_we, precision_mi, precision_ma, precision_we, delta))
        cnt += 1
        marks = np.arange(len(y_train.value_counts()))
        if type == 'N':
            if dtype == 'C':
                os.makedirs(os.getcwd()+'/Matrix/NIST/CICI',exist_ok=True)
                plot_confusion_matrix(confusion,labels=marks,title=name,confusion_path=os.getcwd()+'/Matrix/NIST/CICI')
            elif dtype == 'U':
                os.makedirs(os.getcwd()+'/Matrix/NIST/UNSW',exist_ok=True)
                plot_confusion_matrix(confusion,labels=marks,title=name,confusion_path=os.getcwd()+'/Matrix/NIST/UNSW')     
    accuracy = accuracy.round(3)
    accuracy.to_csv(os.getcwd()+'/Score/'+file+'.csv',index=False)

#### CICI

In [23]:
CICI_nist_X_train, CICI_nist_X_test, CICI_nist_y_train, CICI_nist_y_test =TrainTestSplit('N', CICI_data)

Dataset Split


In [35]:
MultiClassification(type='N', file='CICI_nist', models=models, X_train= CICI_nist_X_train, X_test= CICI_nist_X_test, y_train= CICI_nist_y_train, y_test= CICI_nist_y_test)

Model	Acc	F1_mi	Recall_mi	Precision_mi	F1_ma	Recall_ma	Precision_ma	F1_we	Recall_we	Precision_we	Execution
RF	0.933	0.933	0.880	0.935	0.933	0.869	0.933	0.933	0.922	0.949	1.50 secs
DT	0.936	0.936	0.887	0.939	0.936	0.877	0.936	0.936	0.925	0.950	6.37 secs
NB	0.677	0.677	0.565	0.732	0.677	0.692	0.677	0.677	0.587	0.841	0.42 secs
LDA	0.932	0.932	0.876	0.936	0.932	0.871	0.932	0.932	0.905	0.949	2.62 secs
QDA	0.857	0.857	0.687	0.883	0.857	0.804	0.857	0.857	0.760	0.957	0.75 secs
LR	0.939	0.939	0.886	0.941	0.939	0.878	0.939	0.939	0.919	0.953	34.15 secs
ABoost	0.904	0.904	0.858	0.908	0.904	0.862	0.904	0.904	0.882	0.926	81.10 secs
k-NN	0.941	0.941	0.898	0.943	0.941	0.890	0.941	0.941	0.922	0.950	616.73 secs
MLP	0.945	0.945	0.899	0.947	0.945	0.894	0.945	0.945	0.927	0.958	142.01 secs
SVM	0.938	0.938	0.884	0.941	0.938	0.876	0.938	0.938	0.918	0.952	165.38 secs


<Figure size 432x288 with 0 Axes>

#### UNSW

In [24]:
UNSW_nist_X_train, UNSW_nist_X_test, UNSW_nist_y_train, UNSW_nist_y_test =TrainTestSplit('N', UNSW_data)

Dataset Split


In [37]:
MultiClassification(dtype='U', type='N', file='UNSW_nist', models=models, X_train= UNSW_nist_X_train, X_test= UNSW_nist_X_test, y_train= UNSW_nist_y_train, y_test= UNSW_nist_y_test)

Model	Acc	F1_mi	Recall_mi	Precision_mi	F1_ma	Recall_ma	Precision_ma	F1_we	Recall_we	Precision_we	Execution
RF	0.905	0.905	0.858	0.902	0.905	0.846	0.905	0.905	0.880	0.906	1.14 secs
DT	0.952	0.952	0.934	0.955	0.952	0.958	0.952	0.952	0.923	0.964	4.46 secs
NB	0.897	0.897	0.825	0.887	0.897	0.815	0.897	0.897	0.859	0.888	1.46 secs
LDA	0.945	0.945	0.922	0.949	0.945	0.946	0.945	0.945	0.912	0.960	10.07 secs
QDA	0.938	0.938	0.915	0.943	0.938	0.939	0.938	0.938	0.906	0.956	3.45 secs
LR	0.947	0.947	0.925	0.951	0.947	0.947	0.947	0.947	0.914	0.960	66.53 secs
ABoost	0.954	0.954	0.936	0.956	0.954	0.955	0.954	0.954	0.923	0.961	70.34 secs
k-NN	0.961	0.961	0.943	0.962	0.961	0.954	0.961	0.961	0.935	0.964	4630.42 secs
MLP	0.963	0.963	0.947	0.965	0.963	0.965	0.963	0.963	0.935	0.970	316.59 secs
SVM	0.947	0.947	0.925	0.951	0.947	0.948	0.947	0.947	0.914	0.961	105.31 secs


FileNotFoundError: [Errno 2] No such file or directory: '/home/irteam/wendyunji-dcloud-dir/MLAC/Workspace/230602/Score/UNSW_nist.csv'

<Figure size 432x288 with 0 Axes>