In [75]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from load import create_pipeline
import os



In [76]:

from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [77]:
# 전처리
# Get Dataset
files={
    'CICI':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/CICI.csv',
    'UNSW':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/UNSW.csv'
}

data = pd.read_csv(files['UNSW'])
data=data[np.isfinite(data).all(1)]




In [78]:
#target
binary_t=data['label']
multi1_t=data['nist_category']
multi2_t=data['attack_category'] # 최종
# 마지막 3-class classifier
class_1_data=data[data['nist_category']==1] # 여기서 각각 attack_category예측
class_2_data=data[data['nist_category']==2]
class_3_data=data[data['nist_category']==3]
class_4_data=data[data['nist_category']==4]

for class_data in [class_1_data,class_2_data,class_3_data,class_4_data]:
    class_data.drop(labels=['nist_category','label'],axis=1,inplace=True)

data=data.drop(labels=['label','attack_category','nist_category'],axis=1)

In [79]:
# Define Models
models = []
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3)))    
models.append(('CART', DecisionTreeClassifier(max_depth=5)))
models.append(('NB', GaussianNB()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=200)))
models.append(('ABoost', AdaBoostClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('MLP', MLPClassifier()))

In [80]:
df=pd.DataFrame(columns=['name','b_acc','b_f1','b_rc','b_pc','m_acc','m_f1','m_rc','m_pc']+\
                 ['c1_acc','c1_f1','c1_rc','c1_pc']+\
                    ['c2_acc','c2_f1','c2_rc','c2_pc','c3_acc','c3_f1','c3_rc','c3_pc','c4_acc','c4_f1','c4_rc','c4_pc']+\
                     ['final_acc','final_f1','final_rc','final_pc'])
eval_path='/home/irteam/junghye-dcloud-dir/MLAC/evaluation'
confusion_path='/home/irteam/junghye-dcloud-dir/MLAC/confusion_matrix/hierarchical/UNSW'
cnt=0


In [81]:
X_train,X_test,y_train,y_test=train_test_split(data,binary_t,test_size=0.3, shuffle=True, stratify=binary_t, random_state=34)

In [82]:
multi1_train=multi1_t.loc[y_train.index]

multi1_test=multi1_t.loc[y_test.index]
multi2_test=multi2_t.loc[y_test.index]


class_1_X_train=class_1_data.loc[class_1_data.index.isin(y_train.index)] # y값 기준
class_1_y_train=class_1_X_train['attack_category']
class_1_y_test=class_1_data.loc[class_1_data.index.isin(y_test.index),'attack_category']

class_2_X_train=class_2_data.loc[class_2_data.index.isin(y_train.index)]
class_2_y_train=class_2_X_train['attack_category']
class_2_y_test=class_2_data.loc[class_2_data.index.isin(y_test.index),'attack_category']

class_3_X_train=class_3_data.loc[class_3_data.index.isin(y_train.index)]
class_3_y_train=class_3_X_train['attack_category']
class_3_y_test=class_3_data.loc[class_3_data.index.isin(y_test.index),'attack_category']

class_4_X_train=class_4_data.loc[class_4_data.index.isin(y_train.index)]
class_4_y_train=class_4_X_train['attack_category']
class_4_y_test=class_4_data.loc[class_4_data.index.isin(y_test.index),'attack_category']

# train data에서 label drop시키기
for class_train in [class_1_X_train,class_2_X_train, class_3_X_train,class_4_X_train]:
    class_train.drop(['attack_category'],axis=1,inplace=True)
# 각 train이랑 test로 나눔 
# 여기서 attack_category가 0인 행들은 빼버리기 


1. 모든 classifier을 동시에 다 train (있는 데이터 다 데리고)
    그리고 test할 떄만 계층적으로 내려옴 (test 데이터를 쪼개고쪼개고..??)

2. 한 classifier을 train -> train set으로 예측 진행 1로 분류된 데이터 -> 

In [83]:
def plot_confusion_matrix(con_mat,labels,title:str,cmap=plt.cm.get_cmap('Blues'),normalize=False):
    plt.figure(figsize=(20,15))
    plt.imshow(con_mat,interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()
    marks=np.arange(len(labels))
    nlabels=[]
    for k in range(len(con_mat)):
        n=sum(con_mat[k])
        nlabel='{0}(n={1})'.format(labels[k],n)
        nlabels.append(nlabel)

    plt.xticks(marks,labels,rotation=45)
    plt.yticks(marks,nlabels)

    thresh=con_mat.max()/2.
    if normalize:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, '{0}%'.format(con_mat[i, j] * 100 / n), horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    else:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, con_mat[i, j], horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    #plt.show()
    #이미지 저장
    plt.savefig(confusion_path+'/'+title+'.png',facecolor='#eeeeee')
    plt.clf()

In [84]:
def test_result(model:str,test,pred) ->list:

    acc=accuracy_score(test,pred)
    f1=f1_score(test,pred,average='weighted')
    recall=recall_score(test,pred,average='weighted')
    precision=precision_score(test,pred,average='weighted')
    #confusion=metrics.confusion_matrix(test,pred)
    #plot_confusion_matrix(confusion,labels=list(set(target)),title=model)
    print(f'{model} result , acc:{acc}, f1:{f1},recall:{recall},precision:{precision}')
    return([acc,f1,recall,precision])



In [85]:
attack_23={0:'Benign',1: 'Analysis', 2: 'Backdoor', 3: 'Bot', 4: 'DDoS', 5: 'DoS', 6: 'DoS GoldenEye', 7: 'DoS Slowhttptest', 8: 'DoS slowloris', 9: 'Dos Hulk', 10: 'Exploits', 11: 'FTP-Patator', 12: 'Fuzzers', 13: 'Generic', 14: 'Heartbleed', 15: 'Infiltration', 16: 'PortScan', 17: 'Reconnaissance', 18: 'SSH-Patator', 19: 'Shellcode', 20: 'Web Attack – Brute Force', 21: 'Web Attack – Sql Injection', 22: 'Web Attack – XSS', 23: 'Worms'}

In [None]:
for name, model in models:
    # all models training (다 같은 trainset으로)
    model_eval=[]
    model_eval.append(name)
    print('layer 1 (binary classifier) train & test')
    # binary classification
    binary_model=model
    
    binary_model.fit(X_train,y_train)

        
    binary_pred=binary_model.predict(X_test) #  
    binary_result=test_result(name,y_test,binary_pred)
    model_eval.extend(binary_result)
    

    print('layer 2(4-classes classifier) train & test')
    #2-step training
    # 주의 : label이 0인 것 빼고 training
    multi1_model=create_pipeline(model)
    multi1_model.fit(X_train[multi1_train!=0],multi1_train[multi1_train!=0])
    malicious_indices=np.where(binary_pred==1)[0]
    if malicious_indices.any():
        multi1_X_test=X_test.iloc[malicious_indices] # 1로 분류된것만 이게 빈 걸로 반환됨 
        multi1_pred=multi1_model.predict(multi1_X_test)
        # 리스트에서 1인 곳 의 값 반환 -> list 
        multi1_test_selected = multi1_test.iloc[np.where(binary_pred==1)[0]]# label중에서도 1로 분류된 것들만 (1,2,3,4)
        multi2_test_selected=multi2_test.iloc[np.where(binary_pred==1)[0]]
        multi1_result=test_result(name,multi1_test_selected,multi1_pred)
        model_eval.extend(multi1_result)
    else:
        print('no malicious predicted')
        import sys
        sys.exit()
        
    # last-step training
    
    print('Reconnaissance train & test')
    class_1_model=model
    class_1_model.fit(class_1_X_train,class_1_y_train) # 66개 feature
  
    indices1 = np.where(multi1_pred == 1)[0] # binary pred=1 -> multi1_pred=1 
    if indices1.any():
        class_1_X_test_selected = multi1_X_test.iloc[indices1] # 1로 예측된 X_test 


        class_1_pred=class_1_model.predict(class_1_X_test_selected) # predict
        # class_1_pred와 
        class_1_y_test_selected=multi2_test_selected.iloc[indices1]
        class_1_result=test_result(name,class_1_y_test_selected,class_1_pred)
        model_eval.extend(class_1_result)
    else:
        model_eval.extend([0,0,0,0])

    print('Access train & test ')
    class_2_model=model
    class_2_model.fit(class_2_X_train,class_2_y_train)
    
    indices2 = np.where(multi1_pred == 2)[0]
    if indices2.any():
        class_2_X_test_selected=multi1_X_test.iloc[indices2]


        class_2_pred=class_2_model.predict(class_2_X_test_selected)
        class_2_y_test_selected=multi2_test_selected.iloc[indices2]
        class_2_result=test_result(name,class_2_y_test_selected,class_2_pred)
        model_eval.extend(class_2_result)
    else:
        model_eval.extend([0,0,0,0])
    

    print('Dos train & test')
    class_3_model=model
    class_3_model.fit(class_3_X_train,class_3_y_train)
    indices3 = np.where(multi1_pred == 3)[0]
    if indices3.any():
        class_3_X_test_selected=multi1_X_test.iloc[indices3]


        class_3_pred=class_3_model.predict(class_3_X_test_selected)
        class_3_y_test_selected=multi2_test_selected.iloc[indices3]
        class_3_result=test_result(name,class_3_y_test_selected,class_3_pred)

        model_eval.extend(class_3_result)
    else:
        model_eval.extend([0,0,0,0])

    print('Malware train & test')
    class_4_model=model
    class_4_model.fit(class_4_X_train,class_4_y_train)
    indices4 = np.where(multi1_pred == 4)[0]
    if indices4.any():
        class_4_X_test_selected=multi1_X_test.iloc[indices4]


        class_4_pred=class_4_model.predict(class_4_X_test_selected)
        class_4_y_test_selected=multi2_test_selected.iloc[indices4]
        class_4_result=test_result(name,class_4_y_test_selected,class_4_pred)

        model_eval.extend(class_4_result)
    else:
        model_eval.extend([0,0,0,0])
   
   
    final_y_pred=[]
    for class_pred in [class_1_pred,class_2_pred,class_3_pred,class_4_pred]:
        final_y_pred.extend(class_pred)

    final_y_test=[]
    for class_test in [class_1_y_test_selected,class_2_y_test_selected,class_3_y_test_selected,class_4_y_test_selected]:
        final_y_test.extend(class_test)
    
    
    final_result=test_result(name,final_y_test,final_y_pred)
    model_eval.extend(final_result)
    
    # plot confusion matrix
    #target list
    encoded_list=[]
    encoded_list.extend(final_y_pred)
    encoded_list.extend(final_y_test)
    encoded_list=list(set(encoded_list))
    target_list=[]
    for encoded_label in encoded_list:
        target_list.append(attack_23[encoded_label])
   
    confusion=metrics.confusion_matrix(final_y_test,final_y_pred)
    plot_confusion_matrix(confusion,labels=target_list,title=name)
    
    df.loc[cnt]=model_eval
    cnt=cnt+1
    

df.to_csv(os.path.join(eval_path,'UNSW_hierarchical.csv'),index=False)
    

layer 1 (binary classifier) train & test
RF result , acc:0.8949460552978821, f1:0.871844746176852,recall:0.8949460552978821,precision:0.9057916101757256
layer 2(4-classes classifier) train & test
RF result , acc:0.9887779106089237, f1:0.98598882416715,recall:0.9887779106089237,precision:0.9835918793871729
Reconnaissance train & test
RF result , acc:0.5, f1:0.3378378378378379,recall:0.5,precision:0.25510204081632654
Access train & test 
Dos train & test
RF result , acc:0.9849139045452195, f1:0.9798486912880007,recall:0.9849139045452195,precision:0.9753451057614118
Malware train & test
RF result , acc:0.04909560723514212, f1:0.004595155356983746,recall:0.04909560723514212,precision:0.0024103786497873393
RF result , acc:0.9148683334917768, f1:0.9108296562265669,recall:0.9148683334917768,precision:0.9265701928904521
layer 1 (binary classifier) train & test
CART result , acc:0.9901897831532473, f1:0.9902394755571581,recall:0.9901897831532473,precision:0.9903459761082976
layer 2(4-classes cl

In [None]:
#len(binary_pred)
len(X_test)

In [None]:
if malicious_indices.all():
    print('있으')