In [1]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from pipeline import create_pipeline
import os



In [2]:

from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [3]:
# 전처리
# Get Dataset
files={
    'CICI':'/home/irteam/junghye-dcloud-dir/MLAC/data/encoded_ConcatedCICI.csv',
    'UNSW': '/home/irteam/junghye-dcloud-dir/MLAC/data/encoded_ConcatedUNSW.csv'
}

data = pd.read_csv(files['CICI'])
data=data[np.isfinite(data).all(1)]




In [4]:
data.drop(['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1'],axis=1,inplace=True)

In [5]:
data.head()

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,attack_category,label,nist_category,new_category
0,0.389389,0.975383,0.952661,0.872873,0.977796,0.891739,0.890641,0.0,0.952564,0.920861,...,0.978227,0.911498,0.954011,0.992035,0.949882,0.923901,0.0,0.0,0.0,0.0
1,0.0,0.999351,0.978378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.9958,0.909919,0.960598,0.995313,0.967231,0.917421,0.0,0.0,0.0,0.0
2,0.389389,0.923154,0.736236,0.713714,0.937297,0.787805,0.9585,0.0,0.965808,0.966347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.355843,0.47998,0.368368,0.618619,0.657157,0.618118,0.944444,0.784284,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.586652,0.307307,0.139139,0.496997,0.427928,0.605606,0.933433,0.766767,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#target
binary_t=data['label']
multi1_t=data['nist_category']
multi2_t=data['attack_category'] # 최종
# 마지막 3-class classifier
class_1_data=data[data['nist_category']==1] # 여기서 각각 attack_category예측
class_2_data=data[data['nist_category']==2]
class_3_data=data[data['nist_category']==3]
data=data.drop(labels=['label','attack_category','nist_category','new_category'],axis=1)

In [7]:
binary_t

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
2157002    0.0
2157003    0.0
2157004    0.0
2157005    0.0
2157006    0.0
Name: label, Length: 2157007, dtype: float64

In [8]:
# Define Models
models = []
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3)))    
models.append(('CART', DecisionTreeClassifier(max_depth=5)))
models.append(('NB', GaussianNB()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=200)))
models.append(('ABoost', AdaBoostClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('MLP', MLPClassifier()))

In [9]:
df=pd.DataFrame(columns=['name','b_acc','b_f1','b_recall','b_precision','m_acc','m_f1','m_recall','m_precision']+\
                 ['c1_acc','c1_f1','c1_recall','c1_precision']+\
                    ['c2_acc','c2_f1','c2_recall','c2_precision','c3_acc','c3_f1','c3_recall','c3_precision'])
eval_path='/home/irteam/junghye-dcloud-dir/MLAC/evaluation'
confusion_path='/home/irteam/junghye-dcloud-dir/MLAC/confusion_matrix/hierarchical'
cnt=0


In [10]:
X_train,X_test,y_train,y_test=train_test_split(data,binary_t,test_size=0.3, shuffle=True, stratify=binary_t, random_state=34)

In [11]:
multi1_train=multi1_t.loc[y_train.index]

multi1_test=multi1_t.loc[y_test.index]

class_1_train=class_1_data.loc[class_1_data.index.isin(y_train.index)]
class_1_test=class_1_data.loc[class_1_data.index.isin(y_test.index)]

class_2_train=class_2_data.loc[class_2_data.index.isin(y_train.index)]
class_2_test=class_2_data.loc[class_2_data.index.isin(y_test.index)]

class_3_train=class_3_data.loc[class_3_data.index.isin(y_train.index)]
class_3_test=class_3_data.loc[class_3_data.index.isin(y_test.index)]

# 각 train이랑 test로 나눔 



1. 모든 classifier을 동시에 다 train (있는 데이터 다 데리고)
    그리고 test할 떄만 계층적으로 내려옴 (test 데이터를 쪼개고쪼개고..??)

2. 한 classifier을 train -> train set으로 예측 진행 1로 분류된 데이터 -> 

In [12]:
def plot_confusion_matrix(con_mat,labels,title:str,cmap=plt.cm.get_cmap('Blues'),normalize=False):
    plt.imshow(con_mat,interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()
    marks=np.arange(len(labels))
    nlabels=[]
    for k in range(len(con_mat)):
        n=sum(con_mat[k])
        nlabel='{0}(n={1})'.format(labels[k],n)
        nlabels.append(nlabel)

    plt.xticks(marks,labels)
    plt.yticks(marks,nlabels)

    thresh=con_mat.max()/2.
    if normalize:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, '{0}%'.format(con_mat[i, j] * 100 / n), horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    else:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, con_mat[i, j], horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    #plt.show()
    #이미지 저장
    plt.savefig(confusion_path+'/'+title+'.png',facecolor='#eeeeee')
    plt.clf()

In [13]:
def test_result(model:str,test,pred,target:list) ->list:

    acc=accuracy_score(test,pred)
    f1=f1_score(test,pred,average='weighted')
    recall=recall_score(test,pred,average='weighted')
    precision=precision_score(test,pred,average='weighted')
    #confusion=metrics.confusion_matrix(test,pred)
    #plot_confusion_matrix(confusion,labels=list(set(target)),title=model)
    print(f'{model} result , acc:{acc}, f1:{f1},recall:{recall},precision:{precision}')
    return([acc,f1,recall,precision])
    
    


In [14]:
for name, model in models:
    # all models training (다 같은 trainset으로)
    model_eval=[]
    print('binary train starting...')
    # binary classification
    binary_model=create_pipeline(model)
    binary_model.fit(X_train,y_train)

    print('multi1 train starting...')
    #2-step training
    multi1_model=create_pipeline(model)
    multi1_model.fit(X_train,multi1_train)

    # last-step training
    # all 4 models
    
    print('class1 train starting...')
    class_1_model=create_pipeline(model)
    class_1_model.fit(X_train.loc[X_train.index.isin(class_1_train.index)],class_1_train['attack_category'])

    print('class2 train starting...')
    class_2_model=create_pipeline(model)
    class_2_model.fit(X_train.loc[X_train.index.isin(class_2_train.index)],class_2_train['attack_category'])

    print('class3 train starting...')
    class_3_model=create_pipeline(model)
    class_3_model.fit(X_train.loc[X_train.index.isin(class_3_train.index)],class_3_train['attack_category'])


    # 순차적으로 test 진행
    print('binary test starting...')
    binary_pred=binary_model.predict(X_test) # 이게 값이 이상함.. 요소 값이 다 7임 
    binary_result=test_result(binary_model,y_test,binary_pred,binary_t)
    model_eval.extend(binary_result)
    
   
    # multi1_test filtering
    
    multi1_X_test=X_test.iloc[np.where(binary_pred==1)[0]] # 1로 분류된것만 이게 빈 걸로 반환됨 
    print('multi1 test starting...')
    multi1_pred=multi1_model.predict(multi1_X_test)
    multi1_test=multi1_test.iloc[np.where(binary_pred==1)[0]].tolist() # label중에서도 1로 분류된 것들만 
    multi1_result=test_result(multi1_model,multi1_test,multi1_pred,multi1_t)
    model_eval.extend(multi1_result)
    

    # class 1,2,3 filtering and test
    # 3개 classifier의 x_test부분
    class_1_test=multi1_X_test.iloc[np.where(multi1_pred==1)[0]]
    class_2_test=multi1_X_test.iloc[np.where(multi1_pred==2)[0]]
    class_3_test=multi1_X_test.iloc[np.where(multi1_pred==3)[0]]

    print('class_1 test starting..')
    class_1_pred=class_1_model.predict(class_1_test)
    
    class_1_result=test_result(class_1_model,class_1_test['attack_category'],class_1_pred,class_1_test['attack_category'])
    model_eval.extend(class_1_result)
    
    print('class_2 test starting..')
    class_2_pred=class_2_model.predict(class_2_test)
    
    class_2_result=test_result(class_2_model,class_2_test['attack_category'],class_2_pred,class_2_test['attack_category'])
    model_eval.extend(class_2_result)
    
    print('class_3 test starting..')
    class_3_pred=class_3_model.predict(class_3_test)
    class_3_result=test_result(class_3_model,class_3_test['attack_category'],class_3_pred,class_3_test['attack_category'])

    model_eval.extend(class_3_result)
    


    df.loc[cnt]=model_eval
    cnt=cnt+1
    
    

df.to_csv(os.path.join(eval_path,'hierarchical.csv'),index=False)
    

binary train starting...
multi1 train starting...
class1 train starting...
class2 train starting...
class3 train starting...
binary test starting...
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('model',
                 RandomForestClassifier(max_depth=5, max_features=3,
                                        n_estimators=5))]) result , acc:0.0, f1:0.0,recall:0.0,precision:0.0
multi1 test starting...


ValueError: Found array with 0 sample(s) (shape=(0, 77)) while a minimum of 1 is required.

In [16]:
X_test.columns

Index(['protocol', 'flow_duration', 'tot_fwd_pkts', 'tot_bwd_pkts',
       'tot_len_fwd_pkts', 'tot_len_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts_s', 'flow_pkts_s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts_s',
       'bwd_pkts_s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_flag_cnt', 'down_up_ratio', 'pkt_size_avg',
       'fwd_seg_size_avg', 'b

In [17]:
multi1_X_test

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,fwd_act_data_pkts,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min


In [23]:
binary_t.value_counts()


0.0    1599397
1.0     557610
Name: label, dtype: int64