In [59]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from load import test_result,create_pipeline,plot_confusion_matrix

import os



In [60]:

from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [61]:
# 전처리
# Get Dataset
files={
    'CICI':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/CICI.csv',
    'UNSW':'/home/irteam/junghye-dcloud-dir/MLAC/new_data/UNSW.csv'
}

data = pd.read_csv(files['CICI'])
data=data[np.isfinite(data).all(1)]




In [62]:
#target
binary_t=data['label']
multi1_t=data['nist_category']
multi2_t=data['attack_category'] # 최종
# 마지막 3-class classifier
class_1_data=data[data['nist_category']==1] # 여기서 각각 attack_category예측
class_2_data=data[data['nist_category']==2]
class_3_data=data[data['nist_category']==3]
class_4_data=data[data['nist_category']==4]

for class_data in [class_1_data,class_2_data,class_3_data,class_4_data]:
    class_data.drop(labels=['nist_category','label'],axis=1,inplace=True)

data=data.drop(labels=['label','attack_category','nist_category'],axis=1)

In [63]:
# Define Models
models = []
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3)))    
models.append(('CART', DecisionTreeClassifier(max_depth=5)))
models.append(('NB', GaussianNB()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=200)))
models.append(('ABoost', AdaBoostClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('MLP', MLPClassifier()))

In [64]:
df=pd.DataFrame(columns=['name','b_acc','b_f1','b_rc','b_pc','m_acc','m_f1','m_rc','m_pc']+\
                 ['c1_acc','c1_f1','c1_rc','c1_pc']+\
                    ['c2_acc','c2_f1','c2_rc','c2_pc','c3_acc','c3_f1','c3_rc','c3_pc','c4_acc','c4_f1','c4_rc','c4_pc'])
eval_path='/home/irteam/junghye-dcloud-dir/MLAC/evaluation'
confusion_path='/home/irteam/junghye-dcloud-dir/MLAC/confusion_matrix/hierarchical'
cnt=0


In [65]:
X_train,X_test,y_train,y_test=train_test_split(data,binary_t,test_size=0.3, shuffle=True, stratify=binary_t, random_state=34)

In [66]:
multi1_train=multi1_t.loc[y_train.index]

multi1_test=multi1_t.loc[y_test.index]

# class data, class_1_train, class_y_train, class_y_test 

nist_data={
    1: {'data':class_1_data,'y_train':None,'y_test':None },
    2: {'data':class_2_data,'y_train':None,'y_test':None },
    3: {'data':class_3_data,'y_train':None,'y_test':None },
    4: {'data':class_4_data,'y_train':None,'y_test':None }
}

for cls,data in nist_data.items():
    data['X_train']=data['data'].loc[data['data'].index.isin(y_train.index)]
    data['y_train']=data['X_train']['attack_category']
    data['y_test']=data['data'].loc[data['data'].index.isin(y_test.index), 'attack_category']
    data['X_train'].drop('attack_category',axis=1,inplace=True)



1. 모든 classifier을 동시에 다 train (있는 데이터 다 데리고)
    그리고 test할 떄만 계층적으로 내려옴 (test 데이터를 쪼개고쪼개고..??)

2. 한 classifier을 train -> train set으로 예측 진행 1로 분류된 데이터 -> 

In [67]:
model_eval=[]
    
def train_and_predict(model, X_train, y_train, multi1_X_test, indices):
    model.fit(X_train,y_train)
    test_selected = multi1_X_test.iloc[indices]
    predictions = model.predict(test_selected)
    result = test_result(model, multi2_t.loc[indices], predictions)
    model_eval.extend(result)



In [68]:
for name, model in models:
    # all models training (다 같은 trainset으로)
    model_eval=[]
    model_eval.append(name)
    print('binary train starting...')
    # binary classification
    binary_model=model
    
    binary_model.fit(X_train,y_train)

        # 순차적으로 test 진행
    print('binary test starting...')
    binary_pred=binary_model.predict(X_test) #  
    binary_result=test_result(binary_model,y_test,binary_pred)
    model_eval.extend(binary_result)
    

    print('multi1 train starting...')
    #2-step training
    # 주의 : label이 0인 것 빼고 training
    
    multi1_model=create_pipeline(model)
    multi1_model.fit(X_train[multi1_train!=0],multi1_train[multi1_train!=0])
    multi1_X_test=X_test.iloc[np.where(binary_pred==1)[0]] # 1로 분류된것만 이게 빈 걸로 반환됨 
    
    print('multi1 test starting...')
    multi1_pred=multi1_model.predict(multi1_X_test)
    # 리스트에서 1인 곳 의 값 반환 -> list 
    multi1_test_selected = multi1_test.iloc[np.where(binary_pred==1)[0]]# label중에서도 1로 분류된 것들만 
    multi1_result=test_result(multi1_model,multi1_test_selected,multi1_pred)
    model_eval.extend(multi1_result)
    # last-step training
    # all 4 models

    # step 2의 결과로 1,2,3,4 로 분류된 것 
    indices1 = np.where(multi1_pred == 1)[0]
    indices2 = np.where(multi1_pred == 2)[0]
    indices3 = np.where(multi1_pred == 3)[0]
    indices4 = np.where(multi1_pred == 4)[0]
    indices=[indices1,indices2,indices3,indices3]
    #model, X_train, y_train, step2_X_test, y_test, indices
    for cls,data in nist_data.items():
        print('class'+str(cls)+' train & test starting')
    
        train_and_predict(model,data['X_train'],data['y_train'],multi1_X_test,indices[cls-1])

    

   

    df.loc[cnt]=model_eval
    cnt=cnt+1
    
    

df.to_csv(os.path.join(eval_path,'hierarchical.csv'),index=False)
    

binary train starting...
binary test starting...
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.9715362699978085, f1:0.9698248032018093,recall:0.9715362699978085,precision:0.9718991311794214
multi1 train starting...
multi1 test starting...
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('model',
                 RandomForestClassifier(max_depth=5, max_features=3,
                                        n_estimators=5))]) result , acc:0.9180790259989834, f1:0.9153392304651311,recall:0.9180790259989834,precision:0.928008294868656
class1 train & test starting
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.0, f1:0.0,recall:0.0,precision:0.0
class2 train & test starting
RandomForestClassifier(max_depth=5, max_features=3, n_estimators=5) result , acc:0.0, f1:0.0,recall:0.0,precision:0.0
class3 train & test starting
RandomForestClassifier(max_depth=5, max_features=3, n_estimators

ValueError: cannot set a row with mismatched columns

In [82]:
select_test_X=multi1_X_test.iloc[indices1] # 1(binary_pred)결과 , 1(multi1_pred)로 분류된 애들을 x test셋으로
model=RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3)
y_test=multi2_t.loc[indices1]
model.fit(nist_data[1]['X_train'],nist_data[1]['y_train']) # 그럼 여기서 y_test에는 없는 값을 제외해야할듯 

y_pred=model.predict(select_test_X)


In [88]:
set(y_pred)

{16}

In [87]:
y_test.unique()

array([ 0, 20, 22, 21])

In [90]:
nist_data[1]['y_train'].unique()


array([20, 22, 21, 14, 16, 17])

In [93]:
nist_data[1]['y_train']

12636      20
12642      20
12699      20
15235      20
15237      20
           ..
6442797    17
6443287    17
6443689    17
6443886    17
6444114    17
Name: attack_category, Length: 123521, dtype: int64

In [None]:
class_1_test

In [None]:
df.head()

In [None]:
index_list=[i for i in np.where(multi1_pred==1)[0] if i in class_1_test.index]
#class_1_test.loc[index_list,:]
print(np.where(multi1_pred==1)[0])


In [None]:
set(multi1_pred)

In [None]:
print(len(multi1_test))
print(len(binary_pred))

In [None]:
from collections import Counter


result = Counter(binary_pred)
print(result)

for key in result:
    print(key, result[key])

# 값만 출력도 가능
result = Counter(binary_pred).values()
print(result)

In [None]:
y_test.value_counts()

In [None]:
multi1_X_test

In [None]:
binary_t.value_counts()
