In [1]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from load import test_result,create_pipeline,plot_confusion_matrix

import os



In [2]:

from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [3]:
# 전처리
# Get Dataset
files={
    'CICI':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/CICI.csv',
    'UNSW':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/UNSW.csv'
}

data = pd.read_csv(files['CICI'])
data=data[np.isfinite(data).all(1)]




In [4]:
#target
binary_t=data['label']
multi1_t=data['nist_category']
multi2_t=data['attack_category'] # 최종
# 마지막 3-class classifier
class_1_data=data[data['nist_category']==1] # label 안뗸 데이터셋 
class_2_data=data[data['nist_category']==2]
class_3_data=data[data['nist_category']==3]
class_4_data=data[data['nist_category']==4]

for class_data in [class_1_data,class_2_data,class_3_data,class_4_data]:
    class_data.drop(labels=['nist_category','label'],axis=1,inplace=True)

data=data.drop(labels=['label','attack_category','nist_category'],axis=1)

In [5]:
# Define Models
models = []
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3)))    
models.append(('CART', DecisionTreeClassifier(max_depth=5)))
models.append(('NB', GaussianNB()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=200)))
models.append(('ABoost', AdaBoostClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('MLP', MLPClassifier()))

In [6]:
df=pd.DataFrame(columns=['name','b_acc','b_f1','b_rc','b_pc','m_acc','m_f1','m_rc','m_pc']+\
                 ['c1_acc','c1_f1','c1_rc','c1_pc']+\
                    ['c2_acc','c2_f1','c2_rc','c2_pc','c3_acc','c3_f1','c3_rc','c3_pc','c4_acc','c4_f1','c4_rc','c4_pc'])
eval_path='/home/irteam/junghye-dcloud-dir/MLAC/evaluation'
confusion_path='/home/irteam/junghye-dcloud-dir/MLAC/confusion_matrix/hierarchical'
cnt=0


In [7]:
X_train,X_test,y_train,y_test=train_test_split(data,binary_t,test_size=0.3, shuffle=True, stratify=binary_t, random_state=34)

In [8]:
multi1_train=multi1_t.loc[y_train.index]

multi1_test=multi1_t.loc[y_test.index]

# class data, class_1_train, class_y_train, class_y_test 

nist_data={
    1: {'data':class_1_data,'y_train':None,'y_test':None },
    2: {'data':class_2_data,'y_train':None,'y_test':None },
    3: {'data':class_3_data,'y_train':None,'y_test':None },
    4: {'data':class_4_data,'y_train':None,'y_test':None }
}

for cls,data in nist_data.items():
    data['X_train']=data['data'].loc[data['data'].index.isin(y_train.index)] # 전체 데이터 중 위에서 split한 y_train과 상응하는 X_train만
    data['y_train']=data['X_train']['attack_category'] # X_train이 label 떼기 전이니까 attack category 가져옴
    data['y_test']=data['data'].loc[data['data'].index.isin(y_test.index), 'attack_category'] # y_test와 상응하는 attack cateogry만 뽑아서
    data['X_train'].drop('attack_category',axis=1,inplace=True) # X_train에서 라벨 있으면 안되니까 지우기 



1. 모든 classifier을 동시에 다 train (있는 데이터 다 데리고)
    그리고 test할 떄만 계층적으로 내려옴 (test 데이터를 쪼개고쪼개고..??)

2. 한 classifier을 train -> train set으로 예측 진행 1로 분류된 데이터 -> 

In [9]:
model_eval=[]

# layer-3 train & test
def train_and_predict(model, X_train, y_train, multi1_X_test, indices):
    model.fit(X_train,y_train)
    test_selected = multi1_X_test.iloc[indices]
    predictions = model.predict(test_selected)
    result = test_result(model, multi2_t.loc[indices], predictions)
    model_eval.extend(result)


In [10]:
for name, model in models:
    # all models training (다 같은 trainset으로)
    model_eval=[]
    model_eval.append(name)
    print('binary train starting...')
    # binary classification
    binary_model=model
    
    binary_model.fit(X_train,y_train)

    
    print('binary test starting...')
    binary_pred=binary_model.predict(X_test) #  
    binary_result=test_result(binary_model,y_test,binary_pred)
    model_eval.extend(binary_result)
    
    # 4 class classification
    print('multi1 train starting...')
    #2-step training
    # 주의 : label이 0인 것 빼고 training
    
    multi1_model=create_pipeline(model)
    multi1_model.fit(X_train[multi1_train!=0],multi1_train[multi1_train!=0])
    multi1_X_test=X_test[multi1_test!=0].iloc[np.where(binary_pred==1)[0]] # 1로 분류된것만 이게 빈 걸로 반환됨 
    
    print('multi1 test starting...')
    multi1_pred=multi1_model.predict(multi1_X_test)
    # 리스트에서 1인 곳 의 값 반환 -> list 
    multi1_test_selected = multi1_test[multi1_test!=0].iloc[np.where(binary_pred==1)[0]]# label중에서도 1로 분류된 것들만 
    multi1_result=test_result(multi1_model,multi1_test_selected,multi1_pred)
    model_eval.extend(multi1_result)
    # last-step training
    # all 4 models

    # step 2의 결과로 1,2,3,4 로 분류된 것 
    indices1 = np.where(multi1_pred == 1)[0]
    indices2 = np.where(multi1_pred == 2)[0]
    indices3 = np.where(multi1_pred == 3)[0]
    indices4 = np.where(multi1_pred == 4)[0]
    indices=[indices1,indices2,indices3,indices4]
    #model, X_train, y_train, step2_X_test, y_test, indices
    for cls,data in nist_data.items():
        
        train_and_predict(model,data['X_train'],data['y_train'],multi1_X_test,indices[cls-1])

    

   

    df.loc[cnt]=model_eval
    cnt=cnt+1
    
    

df.to_csv(os.path.join(eval_path,'hierarchical.csv'),index=False)
    

binary train starting...
binary test starting...
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.970591071405551, f1:0.9687361558694372,recall:0.970591071405551,precision:0.9710168555131092
multi1 train starting...
multi1 test starting...
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('model',
                 RandomForestClassifier(max_depth=5, max_features=3,
                                        n_estimators=5))]) result , acc:0.926084361946041, f1:0.9244117853783722,recall:0.926084361946041,precision:0.9328612339148686
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.0, f1:0.0,recall:0.0,precision:0.0
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.0, f1:0.0,recall:0.0,precision:0.0
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.0, f1:0.0,recall:0.0,precision:0.0
RandomForestClassifier(max_depth=5, 

KeyboardInterrupt: 

In [14]:
boolean_binary=binary_pred==1 # 결론 : np.where을 쓰는게 맞는 인덱싱임 

In [15]:
boolean_to_idx={i:boolean_binary[i] for i in range(len(boolean_binary))}

In [16]:
for idx,boolean_value in boolean_to_idx.items():
    if boolean_value==True:
        print(idx)

17
48
58
59
74
85
118
129
138
147
157
159
211
221
227
228
256
275
292
310
319
323
362
364
366
370
372
421
445
449
451
456
462
474
481
489
493
496
516
524
525
539
540
545
549
568
571
572
589
590
593
595
633
638
667
689
733
744
752
753
768
773
796
803
856
863
869
873
884
894
895
922
927
955
976
982
984
987
988
1005
1041
1042
1083
1084
1121
1159
1202
1209
1231
1279
1329
1375
1377
1393
1404
1409
1423
1424
1444
1496
1513
1517
1532
1540
1547
1563
1590
1600
1612
1615
1621
1634
1641
1647
1652
1658
1676
1687
1703
1706
1719
1724
1765
1768
1770
1776
1788
1793
1796
1819
1824
1827
1839
1862
1866
1871
1873
1921
1928
1931
1950
1959
1988
1999
2032
2035
2072
2087
2089
2113
2130
2148
2153
2166
2195
2209
2247
2255
2263
2271
2275
2277
2286
2293
2298
2302
2318
2324
2326
2337
2349
2407
2426
2433
2440
2447
2449
2450
2470
2525
2533
2556
2593
2596
2625
2647
2650
2656
2664
2672
2675
2712
2717
2735
2751
2756
2758
2790
2794
2802
2815
2820
2826
2827
2839
2864
2871
2878
2885
2890
2893
2899
2909
2919
2920
2939
2959


In [17]:
X_test.loc[binary_pred==1]

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,init_bwd_win_byts,fwd_act_data_pkts,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
1409533,0.385385,0.578716,0.602603,0.635135,0.398899,0.884885,0.408408,0.000000,0.452452,0.647648,...,0.426927,0.640641,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
321830,0.385385,0.941032,0.689189,0.722222,0.717718,0.869369,0.890886,0.000000,0.830900,0.905586,...,0.447948,0.696697,0.927857,0.000000,0.927895,0.932875,0.980063,0.000000,0.977042,0.978645
386108,0.385385,0.956345,0.689189,0.722222,0.672864,0.869369,0.866867,0.000000,0.806306,0.875919,...,0.447948,0.696697,0.927928,0.000000,0.928721,0.933430,0.990991,0.000000,0.989489,0.989079
1806777,0.385385,0.081294,0.311311,0.136136,0.331331,0.396396,0.338839,0.685686,0.346346,0.000000,...,0.351852,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
402835,0.385385,0.946442,0.704705,0.700200,0.703106,0.869369,0.883884,0.000000,0.774543,0.894907,...,0.447948,0.506507,0.905405,0.000000,0.908408,0.907407,0.984998,0.000000,0.981985,0.983302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1488679,0.385385,0.965253,0.719720,0.635135,0.517518,0.891391,0.408408,0.000000,0.428929,0.631632,...,0.426927,0.792292,0.991310,0.000000,0.987283,0.994991,0.987130,0.000000,0.985152,0.986320
433427,0.385385,0.943098,0.704705,0.700200,0.703106,0.869369,0.880881,0.000000,0.774543,0.892003,...,0.447948,0.640641,0.918195,0.000000,0.919976,0.921928,0.982983,0.000000,0.979334,0.980522
403323,0.385385,0.945043,0.689189,0.665666,0.678679,0.869369,0.869416,0.000000,0.809309,0.888755,...,0.447948,0.696697,0.932939,0.000000,0.933936,0.937969,0.983734,0.000000,0.980522,0.981648
5649363,0.000000,0.958959,0.980981,0.136136,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.981982,0.987348,0.979232,0.982179,0.954955,0.980025,0.954955,0.958959


In [18]:
X_test.iloc[np.where(binary_pred==1)[0]]

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,init_bwd_win_byts,fwd_act_data_pkts,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
1409533,0.385385,0.578716,0.602603,0.635135,0.398899,0.884885,0.408408,0.000000,0.452452,0.647648,...,0.426927,0.640641,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
321830,0.385385,0.941032,0.689189,0.722222,0.717718,0.869369,0.890886,0.000000,0.830900,0.905586,...,0.447948,0.696697,0.927857,0.000000,0.927895,0.932875,0.980063,0.000000,0.977042,0.978645
386108,0.385385,0.956345,0.689189,0.722222,0.672864,0.869369,0.866867,0.000000,0.806306,0.875919,...,0.447948,0.696697,0.927928,0.000000,0.928721,0.933430,0.990991,0.000000,0.989489,0.989079
1806777,0.385385,0.081294,0.311311,0.136136,0.331331,0.396396,0.338839,0.685686,0.346346,0.000000,...,0.351852,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
402835,0.385385,0.946442,0.704705,0.700200,0.703106,0.869369,0.883884,0.000000,0.774543,0.894907,...,0.447948,0.506507,0.905405,0.000000,0.908408,0.907407,0.984998,0.000000,0.981985,0.983302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1488679,0.385385,0.965253,0.719720,0.635135,0.517518,0.891391,0.408408,0.000000,0.428929,0.631632,...,0.426927,0.792292,0.991310,0.000000,0.987283,0.994991,0.987130,0.000000,0.985152,0.986320
433427,0.385385,0.943098,0.704705,0.700200,0.703106,0.869369,0.880881,0.000000,0.774543,0.892003,...,0.447948,0.640641,0.918195,0.000000,0.919976,0.921928,0.982983,0.000000,0.979334,0.980522
403323,0.385385,0.945043,0.689189,0.665666,0.678679,0.869369,0.869416,0.000000,0.809309,0.888755,...,0.447948,0.696697,0.932939,0.000000,0.933936,0.937969,0.983734,0.000000,0.980522,0.981648
5649363,0.000000,0.958959,0.980981,0.136136,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.981982,0.987348,0.979232,0.982179,0.954955,0.980025,0.954955,0.958959


In [19]:
X_train[y_train!=0]

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,init_bwd_win_byts,fwd_act_data_pkts,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
390987,0.385385,0.963021,0.738739,0.665666,0.662419,0.869369,0.866867,0.000000,0.594595,0.837753,...,0.447948,0.506507,0.918017,0.000000,0.919921,0.921279,0.996997,0.000000,0.996997,0.996997
407293,0.385385,0.943983,0.689189,0.722222,0.712212,0.869369,0.886557,0.000000,0.827352,0.903644,...,0.447948,0.696697,0.927953,0.000000,0.929934,0.933944,0.983483,0.000000,0.980063,0.981315
441381,0.385385,0.002503,0.482482,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4857693,0.385385,0.839179,0.792793,0.762262,0.644645,0.489489,0.578579,0.000000,0.526473,0.671482,...,0.811812,0.792292,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
251635,0.385385,0.730807,0.602603,0.722222,0.723533,0.869369,0.901883,0.000000,0.942482,0.942332,...,0.447948,0.506507,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300116,0.385385,0.955126,0.689189,0.700200,0.699199,0.869369,0.881882,0.000000,0.817221,0.901798,...,0.447948,0.506507,0.903904,0.000000,0.906406,0.905906,0.989990,0.000000,0.987988,0.988042
1405772,0.385385,0.840199,0.602603,0.700200,0.398899,0.884885,0.408408,0.000000,0.452452,0.647648,...,0.426927,0.640641,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1455023,0.385385,0.860391,0.646146,0.000000,0.392893,0.000000,0.374875,0.725726,0.398899,0.000000,...,0.000000,0.696697,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
277149,0.385385,0.941726,0.646146,0.665666,0.660778,0.869369,0.864044,0.000000,0.844405,0.903523,...,0.447948,0.506507,0.000000,0.000000,0.000000,0.000000,0.980981,0.000000,0.977666,0.979334


In [None]:
#multi_pred==1의 인덱스와.. np.where(multi1_pred==1) 이 다른지..
multi1_pred==1

array([False, False,  True, ..., False, False,  True])

In [None]:
test_selected=multi1_X_test.loc[multi1_pred==1]

In [None]:
test_selected.index

Int64Index([1730017, 1806777, 1754876, 1705235, 1761427, 1726251, 1807958,
            1635721, 1643410, 1685516,
            ...
            1749991, 1645112, 1748487, 1626617, 1753825, 1653716, 1692272,
            1641434, 1649049, 1648490],
           dtype='int64', length=47410)

In [None]:
test_selected.index==indices1

array([False, False, False, ..., False, False, False])