In [31]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from load import create_pipeline
import os



In [32]:
# 전처리
# Get Dataset
files={
    'CICI':'/home/irteam/junghye-dcloud-dir/MLAC/data/encoded_ConcatedCICI.csv',
    'UNSW': '/home/irteam/junghye-dcloud-dir/MLAC/data/encoded_ConcatedUNSW.csv'
}

data = pd.read_csv(files['CICI'])
data=data[np.isfinite(data).all(1)]
#target = data['attack_category']
binary_target=data['label']
multiclass_labels_1=data['nist_category']
multiclass_labels_2=data['attack_category']
class_1_data=data[data['nist_category']==0]
class_2_data=data[data['nist_category']==1]
class_3_data=data[data['nist_category']==2]
class_4_data=data[data['nist_category']==3]
data=data.drop(labels=['label','attack_category','nist_category','Unnamed: 0'],axis=1)


In [33]:
def data_loader():
    files={
    'CICI':'/home/irteam/junghye-dcloud-dir/MLAC/data/encoded_ConcatedCICI.csv',
    'UNSW': '/home/irteam/junghye-dcloud-dir/MLAC/data/encoded_ConcatedUNSW.csv'
}

    data = pd.read_csv(files['CICI'])
    data=data[np.isfinite(data).all(1)]
#tar   get = data['attack_category']
    binary_target=data['label']
    multiclass_labels_1=data['nist_category']
    multiclass_labels_2=data['attack_category']
    class_1_data=data[data['nist_category']==0]
    class_2_data=data[data['nist_category']==1]
    class_3_data=data[data['nist_category']==2]
    class_4_data=data[data['nist_category']==3]
    data=data.drop(labels=['label','attack_category','nist_category','Unnamed: 0'],axis=1)




In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2157007 entries, 0 to 2157006
Data columns (total 77 columns):
 #   Column             Dtype  
---  ------             -----  
 0   protocol           float64
 1   flow_duration      float64
 2   tot_fwd_pkts       float64
 3   tot_bwd_pkts       float64
 4   tot_len_fwd_pkts   float64
 5   tot_len_bwd_pkts   float64
 6   fwd_pkt_len_max    float64
 7   fwd_pkt_len_min    float64
 8   fwd_pkt_len_mean   float64
 9   fwd_pkt_len_std    float64
 10  bwd_pkt_len_max    float64
 11  bwd_pkt_len_min    float64
 12  bwd_pkt_len_mean   float64
 13  bwd_pkt_len_std    float64
 14  flow_byts_s        float64
 15  flow_pkts_s        float64
 16  flow_iat_mean      float64
 17  flow_iat_std       float64
 18  flow_iat_max       float64
 19  flow_iat_min       float64
 20  fwd_iat_tot        float64
 21  fwd_iat_mean       float64
 22  fwd_iat_std        float64
 23  fwd_iat_max        float64
 24  fwd_iat_min        float64
 25  bwd_iat_tot       

In [34]:
# Define Models
models = []
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=5, max_features=3)))    
models.append(('CART', DecisionTreeClassifier(max_depth=5)))
models.append(('NB', GaussianNB()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=200)))
models.append(('ABoost', AdaBoostClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('MLP', MLPClassifier()))

In [35]:
df=pd.DataFrame(columns=['name','binary_acc','binary_recall','multi_1_acc','multi_1_recall']+\
                 ['class_1_acc','class_1_recall','class_2_acc','class_2_recall','class_3_acc','class_3_recall','class_4_acc','class_4_recall'])
outpath='/home/irteam/junghye-dcloud-dir/MLAC/result'
cnt=0

In [36]:
for name, model in models:
    data_loader()
    # binary classification
    binary_model=create_pipeline(model)
   
    X_train,X_test,y_train,y_test=train_test_split(data,binary_target,test_size=0.3, shuffle=True, stratify=binary_target, random_state=34)
    binary_model.fit(X_train,y_train)
    #evaluation
    binary_pred=binary_model.predict(X_test)
    #evaluation result
    model_eval=[]
    model_eval.append(name)
    
    acc = accuracy_score(y_test, binary_pred)
    recall_ma = recall_score(y_test, binary_pred, average='macro')
    model_eval.append(acc)
    model_eval.append(recall_ma)

    print('binary_result','acc',acc,'recall',recall_ma)

    # malicious index 추출
    malicious_indices=np.where(binary_pred==1)[0]
    print('total multiclass labels #', len(multiclass_labels_1))
    print('predicted malicious attack #:',len(malicious_indices))

    # multiclass_labels_1 추출
    try : 
        multiclass_labels_1=[multiclass_labels_1[idx] for idx in malicious_indices]
    except:
        print('out of bound')
    print('extracted multiclass labels #', len(multiclass_labels_1))
    X_test=X_test.iloc[malicious_indices]

    multiclass_model_1=create_pipeline(model)

    multiclass_model_1.fit(X_train.iloc[malicious_indices],multiclass_labels_1)
    #evaluation
    multiclass_pred_1=multiclass_model_1.predict(X_test)
    #evaluation result
    
    print('multi_result','acc',acc,'recall',recall_ma)
    # acc, recall값 구하는게 잘못됨.. 
    acc = accuracy_score(y_test.iloc[malicious_indices], multiclass_pred_1)
    recall_ma = recall_score(y_test.iloc[malicious_indices], multiclass_pred_1, average='macro')
    model_eval.append(acc)
    model_eval.append(recall_ma)
   


    # multiclass_labels_2 추출
    multiclass_labels_2=multiclass_labels_2[malicious_indices]
    
    # 4개 클래스로 데이터를 분류 
    # x_train
    # 각 class로 예측된 거에 한해서 subclass로 분류하도록 코드를 짜야함 
    class_1_data=class_1_data.loc[[idx for idx in class_1_data.index if idx in malicious_indices]]
    class_2_data=class_2_data.loc[[idx for idx in class_2_data.index if idx in malicious_indices]]
    class_3_data=class_3_data.loc[[idx for idx in class_3_data.index if idx in malicious_indices]]
    class_4_data=class_4_data.loc[[idx for idx in class_4_data.index if idx in malicious_indices]]    
    

    # 각 classifier

    for class_data in [class_1_data,class_2_data,class_3_data,class_4_data]:
        X_train,X_test,y_train,y_test=train_test_split(
            class_data.drop(labels=['nist_category','label','attack_category'],axis=1),
            class_data['attack_category'],
            test_size=0.3,
            random_state=42
        )

        classifier=create_pipeline(model)

        classifier.fit(X_train,y_train)

        classifier_pred=classifier.predict(X_test)

        acc = accuracy_score(y_test, classifier_pred)
        recall_ma = recall_score(y_test, classifier_pred, average='macro')
        model_eval.append(acc)
        model_eval.append(recall_ma)
        print('class_result','acc',acc,'recall',recall_ma)

    df.loc[cnt]=model_eval


    

binary_result acc 0.9471861512000408 recall 0.8990315135063198
total multiclass labels # 2157007
predicted malicious attack #: 134321
extracted multiclass labels # 134321
multi_result acc 0.9471861512000408 recall 0.8990315135063198
class_result acc 0.978494623655914 recall 0.9891304347826086
class_result acc 1.0 recall 1.0
class_result acc 0.9959423818218706 recall 0.7925919647397864
class_result acc 1.0 recall 1.0


KeyboardInterrupt: 

In [None]:
df

Unnamed: 0,name,binary_acc,binary_recall,multi_1_acc,binary_recall.1,class_1_acc,class_1_recall,class_2_acc,class_2_recall,class_3_acc,class_3_recall,class_4_acc,class_4_recall
0,RF,0.959969,0.934766,0.0,0.0,0.981481,0.75,0.995717,0.973684,0.994221,0.766846,1.0,1.0


In [46]:
def binary_classification(model,X,y):
    imputer=SimpleImputer(strategy='mean')
    scaler=MinMaxScaler()
    pipeline=Pipeline(steps=[('imputer',imputer),('scaler',scaler),('model',model)])
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, shuffle=True, stratify=y, random_state=34)
    pipeline.fit(X_train,y_train)
    #evaluation
    binary_pred=pipeline.predict(X_test)
    return y_test,binary_pred
    

def multiclass_classification(model,X,y):
    imputer=SimpleImputer(strategy='mean')
    scaler=MinMaxScaler()
    pipeline=Pipeline(steps=[('imputer',imputer),('scaler',scaler),('model',model)])
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, shuffle=True, stratify=y, random_state=34)
    



AttributeError: 'list' object has no attribute 'shape'

In [14]:
# malicious_indices 와 multiclass_labels_1 
X_train.iloc[malicious_indices]

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,fwd_act_data_pkts,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
1270609,1.000000,0.329329,0.479980,0.368368,0.560060,0.479479,0.460961,0.798799,0.605105,0.000000,...,0.512513,0.671171,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
1717603,0.389389,0.050050,0.307307,0.139139,0.000000,0.383383,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.987487,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
1146048,0.389389,0.066567,0.307307,0.139139,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.888388,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
182069,0.389389,0.878067,0.736236,0.713714,0.716049,0.800063,0.784785,0.000000,0.677205,0.809368,...,0.811812,0.671171,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
540200,0.389389,0.889641,0.736236,0.713714,0.717585,0.792207,0.787266,0.000000,0.679450,0.810493,...,0.811812,0.671171,0.966569,0.00000,0.957522,0.979462,0.913807,0.000000,0.90870,0.917916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790791,0.389389,0.808081,0.756757,0.657157,0.848036,0.709030,0.915415,0.000000,0.856703,0.913585,...,0.763263,0.888388,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
172578,0.389389,0.931700,0.938939,0.972306,0.921787,0.984044,0.902903,0.000000,0.558133,0.829334,...,0.984735,0.671171,0.970580,0.98705,0.976422,0.974139,0.940440,0.959783,0.92993,0.943940
1180527,1.000000,0.348348,0.479980,0.368368,0.580581,0.463463,0.506006,0.839339,0.651652,0.000000,...,0.512513,0.671171,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
1597909,0.389389,0.104104,0.307307,0.139139,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.888388,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000


In [17]:
print(len(multiclass_labels_1))

161007


In [69]:
print(X_train.iloc[malicious_indices].shape)
print(len(multiclass_labels_1))



(137413, 77)
27241


In [65]:
type(multiclass_labels_1)
np.array(multiclass_labels_1)[np.where(np.isin(list(range(len(multiclass_labels_1))),malicious_indices)==True)]

array([3., 3., 3., ..., 3., 3., 3.])

In [9]:
class_1_data.loc[[idx for idx in class_1_data.index if idx in malicious_indices]]

Unnamed: 0.1,Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,tot_len_fwd_pkts,tot_len_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,...,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,attack_category,label,nist_category
12711,12711,0.389389,0.879267,0.698699,0.657157,0.891724,0.734644,0.920469,0.0,0.942913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,0.0
18667,18667,0.389389,0.851569,0.599600,0.139139,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,0.0
18786,18786,0.389389,0.915558,0.983884,0.975694,0.998388,0.978425,0.922480,0.0,0.960461,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,0.0
18830,18830,0.389389,0.892476,0.641642,0.625125,0.681181,0.680082,0.868535,0.0,0.864796,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,0.0
18998,18998,0.389389,0.881956,0.599600,0.139139,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91512,91512,0.389389,0.090090,0.307307,0.139139,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1.0,0.0
92475,92475,0.389389,0.874099,0.663664,0.572072,0.819648,0.765043,0.919360,0.0,0.903536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1.0,0.0
92583,92583,0.389389,0.874198,0.682182,0.625125,0.822956,0.784236,0.922405,0.0,0.886063,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1.0,0.0
93235,93235,0.389389,0.087087,0.307307,0.139139,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1.0,0.0


In [73]:
# 악성으로 분류된 것 개수
print(len(malicious_indices))
print(malicious_indices)
print([multiclass_labels_1[[idx]] for idx in malicious_indices])

137413
[     1      3      7 ... 647091 647096 647098]


IndexError: index 27241 is out of bounds for axis 0 with size 27241

In [None]:
df.to_csv(os.path.join(outpath,'evaluation'),header=True)