In [1]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from pipeline import create_pipeline
import os



In [2]:

from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [3]:
# 전처리
# Get Dataset
files={
    'CICI':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/CICI.csv',
    'UNSW':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/UNSW.csv'
}

data = pd.read_csv(files['CICI'])
data=data[np.isfinite(data).all(1)]




In [4]:
data_backup=data.copy()

In [5]:
#target
binary_t=data['label']
multi1_t=data['nist_category']
multi2_t=data['attack_category'] # 최종
# 마지막 3-class classifier
class_1_data=data[data['nist_category']==1] # 여기서 각각 attack_category예측
class_2_data=data[data['nist_category']==2]
class_3_data=data[data['nist_category']==3]
class_4_data=data[data['nist_category']==4]

for class_data in [class_1_data,class_2_data,class_3_data,class_4_data]:
    class_data.drop(labels=['nist_category','label'],axis=1,inplace=True)

data=data.drop(labels=['label','attack_category','nist_category'],axis=1)

In [6]:
# Define Models
models = []
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3)))    
models.append(('CART', DecisionTreeClassifier(max_depth=5)))
models.append(('NB', GaussianNB()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=200)))
models.append(('ABoost', AdaBoostClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('MLP', MLPClassifier()))

In [7]:
df=pd.DataFrame(columns=['name','b_acc','b_f1','b_rc','b_pc','m_acc','m_f1','m_rc','m_pc']+\
                 ['c1_acc','c1_f1','c1_rc','c1_pc']+\
                    ['c2_acc','c2_f1','c2_rc','c2_pc','c3_acc','c3_f1','c3_rc','c3_pc','c4_acc','c4_f1','c4_rc','c4_pc'])
eval_path='/home/irteam/junghye-dcloud-dir/MLAC/evaluation'
confusion_path='/home/irteam/junghye-dcloud-dir/MLAC/confusion_matrix/hierarchical'
cnt=0


In [8]:
X_train,X_test,y_train,y_test=train_test_split(data,binary_t,test_size=0.3, shuffle=True, stratify=binary_t, random_state=34)

In [9]:
multi1_train=multi1_t.loc[y_train.index]

multi1_test=multi1_t.loc[y_test.index]



class_1_train=class_1_data.loc[class_1_data.index.isin(y_train.index)] # y값 기준
class_1_train_lbl=class_1_train['attack_category']
class_1_test_lbl=class_1_data.loc[class_1_data.index.isin(y_test.index),'attack_category']

class_2_train=class_2_data.loc[class_2_data.index.isin(y_train.index)]
class_2_train_lbl=class_2_train['attack_category']
class_2_test_lbl=class_2_data.loc[class_2_data.index.isin(y_test.index),'attack_category']

class_3_train=class_3_data.loc[class_3_data.index.isin(y_train.index)]
class_3_train_lbl=class_3_train['attack_category']
class_3_test_lbl=class_3_data.loc[class_3_data.index.isin(y_test.index),'attack_category']

class_4_train=class_4_data.loc[class_4_data.index.isin(y_train.index)]
class_4_train_lbl=class_4_train['attack_category']
class_4_test_lbl=class_4_data.loc[class_4_data.index.isin(y_test.index),'attack_category']

# train data에서 label drop시키기
for class_train in [class_1_train,class_2_train, class_3_train,class_4_train]:
    class_train.drop(['attack_category'],axis=1,inplace=True)
# 각 train이랑 test로 나눔 
# 여기서 attack_category가 0인 행들은 빼버리기 


1. 모든 classifier을 동시에 다 train (있는 데이터 다 데리고)
    그리고 test할 떄만 계층적으로 내려옴 (test 데이터를 쪼개고쪼개고..??)

2. 한 classifier을 train -> train set으로 예측 진행 1로 분류된 데이터 -> 

In [10]:
def plot_confusion_matrix(con_mat,labels,title:str,cmap=plt.cm.get_cmap('Blues'),normalize=False):
    plt.figure(figsize=(20,15))
    plt.imshow(con_mat,interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()
    marks=np.arange(len(labels))
    nlabels=[]
    for k in range(len(con_mat)):
        n=sum(con_mat[k])
        nlabel='{0}(n={1})'.format(labels[k],n)
        nlabels.append(nlabel)

    plt.xticks(marks,labels,rotation=45)
    plt.yticks(marks,nlabels)

    thresh=con_mat.max()/2.
    if normalize:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, '{0}%'.format(con_mat[i, j] * 100 / n), horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    else:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, con_mat[i, j], horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    #plt.show()
    #이미지 저장
    plt.savefig(confusion_path+'/'+title+'.png',facecolor='#eeeeee')
    plt.clf()

In [11]:
def test_result(model:str,test,pred,target:list) ->list:

    acc=accuracy_score(test,pred)
    f1=f1_score(test,pred,average='weighted')
    recall=recall_score(test,pred,average='weighted')
    precision=precision_score(test,pred,average='weighted')
    #confusion=metrics.confusion_matrix(test,pred)
    #plot_confusion_matrix(confusion,labels=list(set(target)),title=model)
    print(f'{model} result , acc:{acc}, f1:{f1},recall:{recall},precision:{precision}')
    return([acc,f1,recall,precision])
    
    


In [12]:
for name, model in models:
    # all models training (다 같은 trainset으로)
    model_eval=[]
    model_eval.append(name)
    print('binary train starting...')
    # binary classification
    binary_model=model
    
    binary_model.fit(X_train,y_train)

        # 순차적으로 test 진행
    print('binary test starting...')
    binary_pred=binary_model.predict(X_test) #  
    binary_result=test_result(name,y_test,binary_pred,binary_t)
    model_eval.extend(binary_result)
    

    print('multi1 train starting...')
    #2-step training
    # 주의 : label이 0인 것 빼고 training
    multi1_model=create_pipeline(model)
    multi1_model.fit(X_train[multi1_train!=0],multi1_train[multi1_train!=0])
    multi1_X_test=X_test.iloc[np.where(binary_pred==1)[0]] # 1로 분류된것만 이게 빈 걸로 반환됨 
    
    print('multi1 test starting...')
    multi1_pred=multi1_model.predict(multi1_X_test)
    # 리스트에서 1인 곳 의 값 반환 -> list 
    multi1_test_selected = multi1_test.iloc[np.where(binary_pred==1)[0]]# label중에서도 1로 분류된 것들만 (1,2,3,4)
    multi1_result=test_result(name,multi1_test_selected,multi1_pred,multi1_t)
    model_eval.extend(multi1_result)
    # last-step training
    # all 4 models
    
    print('class1 train starting...')
    class_1_model=model
    class_1_model.fit(class_1_train,class_1_train_lbl) # 66개 feature
    train_pred=class_1_model.predict(class_1_train)
    
    print('train 결과:',accuracy_score(class_1_train_lbl,train_pred))
    # class_1_test 셋 조정
    indices = np.where(multi1_pred == 1)[0]
    class_1_test_selected = multi1_X_test.iloc[indices] # 1로 예측된 X_test 
    
     # 이 부분에서 column이 3개 날아가는듯 
    print('class_1 test starting..')
    class_1_pred=class_1_model.predict(class_1_test_selected) # predict
    
    class_1_result=test_result(name,multi2_t.iloc[indices],class_1_pred,class_1_test_lbl)
    model_eval.extend(class_1_result)
    

    print('class2 train starting...')
    class_2_model=model
    class_2_model.fit(class_2_train,class_2_train_lbl)
    train_pred=class_2_model.predict(class_2_train)
    
    print('train 결과:',accuracy_score(class_2_train_lbl,train_pred))
    indices2 = np.where(multi1_pred == 2)[0]
    class_2_test_selected=multi1_X_test.iloc[indices2]
    
    print('class_2 test starting..')
    class_2_pred=class_2_model.predict(class_2_test_selected)
    
    class_2_result=test_result(class_2_model,multi2_t.iloc[indices2],class_2_pred,class_2_test_lbl)
    model_eval.extend(class_2_result)
    

    print('class3 train starting...')
    class_3_model=model
    class_3_model.fit(class_3_train,class_3_train_lbl)
    indices3 = np.where(multi1_pred == 3)[0]
    class_3_test_selected=multi1_X_test.iloc[indices3]

    print('class_3 test starting..')
    class_3_pred=class_3_model.predict(class_3_test_selected)
    class_3_result=test_result(class_3_model,multi2_t.iloc[indices3],class_3_pred,class_3_test_lbl)

    model_eval.extend(class_3_result)


    print('class4 train starting...')
    class_4_model=model
    class_4_model.fit(class_4_train,class_4_train_lbl)
    indices4 = np.where(multi1_pred == 4)[0]
    class_4_test_selected=multi1_X_test.iloc[indices4]

    print('class_4 test starting..')
    class_4_pred=class_4_model.predict(class_4_test_selected)
    class_4_result=test_result(class_4_model,multi2_t.iloc[indices4],class_4_pred,class_4_test_lbl)

    model_eval.extend(class_4_result)
   
   
   
    


    df.loc[cnt]=model_eval
    cnt=cnt+1
    
    

df.to_csv(os.path.join(eval_path,'hierarchical.csv'),index=False)
    

binary train starting...
binary test starting...
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.9695143803741089, f1:0.9675102441919022,recall:0.9695143803741089,precision:0.9699638505870618
multi1 train starting...
multi1 test starting...
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('model',
                 RandomForestClassifier(max_depth=5, max_features=3,
                                        n_estimators=5))]) result , acc:0.9338648052435171, f1:0.9313830611534732,recall:0.9338648052435171,precision:0.9376436727110034
class1 train starting...
train 결과: 0.9961868832020466
class_1 test starting..
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.0, f1:0.0,recall:0.0,precision:0.0
class2 train starting...
train 결과: 0.7737989263845054
class_2 test starting..
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.0, f1:0.0,recall:0.0,precisio