In [1]:
import os
import sys

import copy
import time
import random

import pandas as pd
import numpy as np

import collections
from tabulate import tabulate

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.decomposition import * 
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import recall_score, f1_score, accuracy_score, confusion_matrix

from imblearn.over_sampling import SMOTE

import xgboost, lightgbm
from mlxtend.classifier import EnsembleVoteClassifier

import arff

In [2]:
#

""" 以样本为单位，确定其类别，按照重复量降序排序 """
def process(dataset):
    print("Origin Dataset:", dataset.shape)
    
    index = np.argsort(dataset.sum(axis=1))[::-1]
    new_dataset = dataset[index]
    return pd.DataFrame(new_dataset)
""" 构建N*10的矩阵，ij表示第i个样本在10类中分别占的个数 """
def stat(dataset, classes=10):
    X, y = dataset
    print(f"X={X.shape}, y={y.shape}")
    
    new_X, index, index_re, count_X = np.unique(X, return_index=True, return_inverse=True, return_counts=True, axis=0)
    print("New X={}, Index_re={}, Count={}, maxRepeat={}".format(new_X.shape, index_re.shape, sum(count_X), max(count_X)))
    
    class_X = np.zeros((new_X.shape[0], classes), dtype=int)
    
    assert y.shape[0] == len(index_re)
    for i in range(y.shape[0]):
        class_X[index_re[i]][y[i]] += 1
    
    # 将重复多的放在前面
    i = np.argsort(count_X, axis=0)[::-1]
    count_X = count_X[i]
    new_X = new_X[i]
    class_X = class_X[i]
    
    assert sum([count_X[i] == sum(class_X[i]) for i in range(len(count_X))]) == new_X.shape[0]
    
    print("return new_X={}, class_X={}".format(new_X.shape, class_X.shape))
    print("End")
    return new_X, class_X

## UNSW-NB15

In [4]:

def get_NBSW_NB15(file):
    data = pd.read_csv(file)
    data = data.drop(['id'], 1)
    print("data:\t{}".format(data.shape))
    return data

def preprocess_for_UNSW_NB15(s_dataset="UNSW NB15"):   
    dataset = s_dataset.drop(['label'], axis=1)
    data_x = dataset.drop(['attack_cat'], axis=1) # 'proto', 'service', 'state'
    #data_x = data_x.apply(lambda x: (x - x.min()) / (x.max() - x.min())).values
    data_y = dataset['attack_cat'].values # ['label']#

    le = LabelEncoder()
    
    vector = list(set(data_y))
    data_y = le.fit_transform(data_y)
    map_label_num = list(zip(vector, le.transform(vector)))

    text_dict = {'proto':{}, 'service':{}, 'state':{}}
    data_x = data_x.values
    c = 1
    for i in text_dict:
        tmp = dataset[i].values
        text = list(set(tmp))
        tmp = le.fit_transform(tmp)
        text_dict[i] = dict(list(zip(le.transform(text), text)))
        #data_x = np.concatenate((data_x, tmp.reshape(-1, 1)), axis=1)
        data_x[:, c] = tmp
        c += 1
    
    print("Feature", data_x.shape, 'Label', data_y.shape)
    
    return data_x, data_y, map_label_num, text_dict

### Trainning Set

In [5]:
"Training Set"
file = 'UNSW-NB15 - CSV Files/a part of training and testing set/UNSW_NB15_testing-set.csv'

In [6]:
df = pd.read_csv(file)#, header=None)
df

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.087490,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,175337,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,13,24,0,0,0,24,24,0,Generic,1
175337,175338,0.505762,tcp,-,FIN,10,8,620,354,33.612649,...,1,2,0,0,0,1,1,0,Shellcode,1
175338,175339,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,3,13,0,0,0,3,12,0,Generic,1
175339,175340,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,14,30,0,0,0,30,30,0,Generic,1


In [7]:
unsw = get_NBSW_NB15(file)
col_name = unsw.columns.values[0:42]
col_name

data:	(175341, 44)


array(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

In [8]:
data_x, data_y, map_label_num, text_dict = preprocess_for_UNSW_NB15(unsw)
print(map_label_num)
data_x

Feature (175341, 42) Label (175341,)
[('DoS', 2), ('Exploits', 3), ('Fuzzers', 4), ('Worms', 9), ('Generic', 5), ('Backdoor', 1), ('Shellcode', 8), ('Reconnaissance', 7), ('Normal', 6), ('Analysis', 0)]


array([[0.121478, 113, 0, ..., 1, 1, 0],
       [0.6499020000000001, 113, 0, ..., 1, 6, 0],
       [1.623129, 113, 0, ..., 2, 6, 0],
       ...,
       [9e-06, 119, 2, ..., 3, 12, 0],
       [9e-06, 119, 2, ..., 30, 30, 0],
       [9e-06, 119, 2, ..., 30, 30, 0]], dtype=object)

In [9]:
map_dict = {i[1]:i[0] for i in map_label_num}
map_dict

{2: 'DoS',
 3: 'Exploits',
 4: 'Fuzzers',
 9: 'Worms',
 5: 'Generic',
 1: 'Backdoor',
 8: 'Shellcode',
 7: 'Reconnaissance',
 6: 'Normal',
 0: 'Analysis'}

In [10]:
counter = collections.Counter(data_y)#source_dataset["attack_cat"]) 
print("CICAndMal2020-dataset ==>", data_y.shape[0])
d = [(map_dict[i[0]], i[1]) for i in counter.most_common()]
print(tabulate(d, headers = ['Type','Occurences']))

CICAndMal2020-dataset ==> 175341
Type              Occurences
--------------  ------------
Normal                 56000
Generic                40000
Exploits               33393
Fuzzers                18184
DoS                    12264
Reconnaissance         10491
Analysis                2000
Backdoor                1746
Shellcode               1133
Worms                    130


In [11]:
X_train_pv, y_train_pv = stat([np.array(data_x, dtype=float), data_y], len(np.unique(data_y)))
map_vec_cat = [map_dict[i] for i in range(len(map_dict))]
c = pd.DataFrame(y_train_pv, dtype=int, columns=map_vec_cat)
c

X=(175341, 42), y=(175341,)
New X=(101040, 42), Index_re=(175341,), Count=175341, maxRepeat=415
return new_X=(101040, 42), class_X=(101040, 10)
End


Unnamed: 0,Analysis,Backdoor,DoS,Exploits,Fuzzers,Generic,Normal,Reconnaissance,Shellcode,Worms
0,0,0,0,0,0,0,0,415,0,0
1,0,0,0,0,0,277,0,0,0,0
2,0,0,0,0,0,238,0,0,0,0
3,10,10,66,76,10,6,0,10,0,0
4,0,0,0,0,0,0,0,181,0,0
...,...,...,...,...,...,...,...,...,...,...
101035,0,0,0,0,0,0,0,1,0,0
101036,0,0,0,0,0,0,0,1,0,0
101037,0,0,0,0,1,0,0,0,0,0
101038,0,0,0,0,0,0,0,1,0,0


In [12]:
X = pd.DataFrame(X_train_pv, index=range(len(X_train_pv)), columns=col_name)
X['proto'] = X['proto'].map(lambda x: text_dict['proto'][int(x)])
X['service'] = X['service'].map(lambda x: text_dict['service'][int(x)])
X['state'] = X['state'].map(lambda x: text_dict['state'][int(x)])
X.insert(0, 'id', range(1, len(X)+1), allow_duplicates=False)

y_train_pv01 = (y_train_pv>0) + 0
Y = pd.DataFrame(y_train_pv01, dtype=int, columns=["cat_"+i for i in map_vec_cat])

X = X.join(Y)
X

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,cat_Analysis,cat_Backdoor,cat_DoS,cat_Exploits,cat_Fuzzers,cat_Generic,cat_Normal,cat_Reconnaissance,cat_Shellcode,cat_Worms
0,1,0.000009,udp,-,INT,2.0,0.0,168.0,0.0,111111.107200,...,0,0,0,0,0,0,0,1,0,0
1,2,0.000009,udp,dns,INT,2.0,0.0,114.0,0.0,111111.107200,...,0,0,0,0,0,1,0,0,0,0
2,3,0.000009,udp,dns,INT,2.0,0.0,114.0,0.0,111111.107200,...,0,0,0,0,0,1,0,0,0,0
3,4,0.000009,unas,-,INT,2.0,0.0,200.0,0.0,111111.107200,...,1,1,1,1,1,1,0,1,0,0
4,5,0.000003,udp,-,INT,2.0,0.0,168.0,0.0,333333.321500,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101035,101036,0.671251,tcp,-,FIN,10.0,8.0,564.0,354.0,25.325847,...,0,0,0,0,0,0,0,1,0,0
101036,101037,0.671264,tcp,-,FIN,10.0,8.0,564.0,354.0,25.325357,...,0,0,0,0,0,0,0,1,0,0
101037,101038,0.671274,tcp,-,FIN,22.0,10.0,17786.0,856.0,46.180844,...,0,0,0,0,1,0,0,0,0,0
101038,101039,0.671291,tcp,-,FIN,10.0,8.0,564.0,354.0,25.324338,...,0,0,0,0,0,0,0,1,0,0


In [13]:
""" 写入CSV文件 """
#将DataFrame存储为csv,index表示是否显示行名，default=True
X.to_csv("/dev-sdb2-data/xj-data/MLD-Model-Publicdata/unsw_training-set_multi-label.csv",index=False)

### Test set

In [17]:
"Test Set"
file = 'UNSW-NB15 - CSV Files/a part of training and testing set/UNSW_NB15_training-set.csv'

In [18]:
df = pd.read_csv(file)#, header=None)
df

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.000011,udp,-,INT,2,0,496,0,90909.090200,...,1,2,0,0,0,1,2,0,Normal,0
1,2,0.000008,udp,-,INT,2,0,1762,0,125000.000300,...,1,2,0,0,0,1,2,0,Normal,0
2,3,0.000005,udp,-,INT,2,0,1068,0,200000.005100,...,1,3,0,0,0,1,3,0,Normal,0
3,4,0.000006,udp,-,INT,2,0,900,0,166666.660800,...,1,3,0,0,0,2,3,0,Normal,0
4,5,0.000010,udp,-,INT,2,0,2126,0,100000.002500,...,1,3,0,0,0,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,82328,0.000005,udp,-,INT,2,0,104,0,200000.005100,...,1,2,0,0,0,2,1,0,Normal,0
82328,82329,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,...,1,1,0,0,0,3,2,0,Normal,0
82329,82330,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0
82330,82331,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0


In [19]:
unsw = get_NBSW_NB15(file)
col_name = unsw.columns.values[0:42]
col_name

data:	(82332, 44)


array(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

In [20]:
data_x, data_y, map_label_num, text_dict = preprocess_for_UNSW_NB15(unsw)
print(map_label_num)
data_x

Feature (82332, 42) Label (82332,)
[('DoS', 2), ('Exploits', 3), ('Fuzzers', 4), ('Worms', 9), ('Generic', 5), ('Backdoor', 1), ('Shellcode', 8), ('Reconnaissance', 7), ('Normal', 6), ('Analysis', 0)]


array([[1.1e-05, 117, 0, ..., 1, 2, 0],
       [8e-06, 117, 0, ..., 1, 2, 0],
       [5e-06, 117, 0, ..., 1, 3, 0],
       ...,
       [0.0, 6, 0, ..., 1, 1, 1],
       [0.0, 6, 0, ..., 1, 1, 1],
       [9e-06, 117, 0, ..., 1, 1, 0]], dtype=object)

In [21]:
map_dict = {i[1]:i[0] for i in map_label_num}
map_dict

{2: 'DoS',
 3: 'Exploits',
 4: 'Fuzzers',
 9: 'Worms',
 5: 'Generic',
 1: 'Backdoor',
 8: 'Shellcode',
 7: 'Reconnaissance',
 6: 'Normal',
 0: 'Analysis'}

In [22]:
counter = collections.Counter(data_y)#source_dataset["attack_cat"]) 
print("CICAndMal2020-dataset ==>", data_y.shape[0])
d = [(map_dict[i[0]], i[1]) for i in counter.most_common()]
print(tabulate(d, headers = ['Type','Occurences']))

CICAndMal2020-dataset ==> 82332
Type              Occurences
--------------  ------------
Normal                 37000
Generic                18871
Exploits               11132
Fuzzers                 6062
DoS                     4089
Reconnaissance          3496
Analysis                 677
Backdoor                 583
Shellcode                378
Worms                     44


In [23]:
X_test_pv, y_test_pv = stat([np.array(data_x, dtype=float), data_y], len(np.unique(data_y)))
map_vec_cat = [map_dict[i] for i in range(len(map_dict))]
c = pd.DataFrame(y_test_pv, dtype=int, columns=map_vec_cat)
c

X=(82332, 42), y=(82332,)
New X=(53946, 42), Index_re=(82332,), Count=82332, maxRepeat=88
return new_X=(53946, 42), class_X=(53946, 10)
End


Unnamed: 0,Analysis,Backdoor,DoS,Exploits,Fuzzers,Generic,Normal,Reconnaissance,Shellcode,Worms
0,0,0,0,0,0,88,0,0,0,0
1,0,0,40,35,5,0,0,5,0,0
2,0,0,0,0,0,84,0,0,0,0
3,0,0,0,0,0,81,0,0,0,0
4,3,0,32,28,4,0,0,4,0,0
...,...,...,...,...,...,...,...,...,...,...
53941,0,0,0,1,0,0,0,0,0,0
53942,0,0,0,0,1,0,0,0,0,0
53943,0,0,0,0,0,0,1,0,0,0
53944,0,0,0,0,0,0,1,0,0,0


In [24]:
X = pd.DataFrame(X_test_pv, index=range(len(X_test_pv)), columns=col_name)
X['proto'] = X['proto'].map(lambda x: text_dict['proto'][int(x)])
X['service'] = X['service'].map(lambda x: text_dict['service'][int(x)])
X['state'] = X['state'].map(lambda x: text_dict['state'][int(x)])
X.insert(0, 'id', range(1, len(X)+1), allow_duplicates=False)

y_test_pv01 = (y_test_pv>0) + 0
Y = pd.DataFrame(y_test_pv01, dtype=int, columns=["cat_"+i for i in map_vec_cat])

X = X.join(Y)
X

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,cat_Analysis,cat_Backdoor,cat_DoS,cat_Exploits,cat_Fuzzers,cat_Generic,cat_Normal,cat_Reconnaissance,cat_Shellcode,cat_Worms
0,1,0.000004,udp,dns,INT,2.0,0.0,114.0,0.0,250000.000600,...,0,0,0,0,0,1,0,0,0,0
1,2,0.000008,unas,-,INT,2.0,0.0,200.0,0.0,125000.000300,...,0,0,1,1,1,0,0,1,0,0
2,3,0.000010,udp,dns,INT,2.0,0.0,114.0,0.0,100000.002500,...,0,0,0,0,0,1,0,0,0,0
3,4,0.000011,udp,dns,INT,2.0,0.0,114.0,0.0,90909.090200,...,0,0,0,0,0,1,0,0,0,0
4,5,0.000008,unas,-,INT,2.0,0.0,200.0,0.0,125000.000300,...,1,0,1,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53941,53942,0.594707,tcp,-,FIN,8.0,6.0,1910.0,268.0,21.859503,...,0,0,0,1,0,0,0,0,0,0
53942,53943,0.594696,tcp,ftp,FIN,12.0,12.0,1592.0,682.0,38.675223,...,0,0,0,0,1,0,0,0,0,0
53943,53944,0.594696,tcp,-,FIN,10.0,8.0,534.0,354.0,28.586035,...,0,0,0,0,0,0,1,0,0,0
53944,53945,0.594686,tcp,-,FIN,10.0,6.0,534.0,268.0,25.223396,...,0,0,0,0,0,0,1,0,0,0


In [25]:
""" 写入CSV文件 """
#将DataFrame存储为csv,index表示是否显示行名，default=True
X.to_csv("/dev-sdb2-data/xj-data/MLD-Model-Publicdata/unsw_test-set_multi-label.csv",index=False)

In [28]:
# clean cache
X = Y = X_train_pv=y_test_pv=y_test_pv01=y_train_pv=y_train_pv01=data_x=data_y=X_test_pv=X_train_pv=c=df=unsw=0

## CIC-AndMal-2020

In [3]:

def get_CICAndMal2020(path, benign_path):
    combined_data = pd.read_csv(benign_path, header=None, low_memory=False) # train 和 test 颠倒了
    combined_data[0] = "Benign"
    print("[{}] from {}".format(combined_data.shape, benign_path))
    for i in os.listdir(path):
        data = pd.read_csv(path+"/"+i, header=None, low_memory=False) # train 和 test 颠倒了
        data[0] = i.split('.')[0]
        print("[{}] from {} to {}".format(data.shape, i, combined_data.shape))
        if combined_data.shape[0] == 0:
            combined_data = data
        else:
            combined_data = pd.concat([combined_data, data]) #.drop(['id'], 1)
        #return combined_data
    
    print("combined_data:\t{}".format(combined_data.shape))
    return combined_data

def preprocess_for_CICAndMal2020(dataset=""):
    le = LabelEncoder()
    vector = list(set(dataset[0]))

    dataset[0] = le.fit_transform(dataset[0])
    map_label_num = list(zip(vector, le.transform(vector)))
    # label是binary attack_act是multi-class
    data_x = dataset.drop([0], axis=1)
    data_y = dataset[0] # ['label']#
    print("Feature", data_x.shape, 'Label', data_y.shape)

    return data_x, data_y, map_label_num


In [4]:
%%time
source_data = get_CICAndMal2020("/dev-sdb2-data/xj-data/CICAndMal2020/Malicious-CSVs/", "/dev-sdb2-data/xj-data/CICAndMal2020/BenignCSVs/Ben0.csv")
col_name = source_data.columns.values[1:]
col_name

[(32084, 9504)] from /dev-sdb2-data/xj-data/CICAndMal2020/BenignCSVs/Ben0.csv
[(669, 9504)] from FileInfector.csv to (32084, 9504)
[(13559, 9504)] from Trojan.csv to (32753, 9504)
[(887, 9504)] from Banker.csv to (46312, 9504)
[(3540, 9504)] from Spy.csv to (47199, 9504)
[(2302, 9504)] from Dropper.csv to (50739, 9504)
[(3125, 9504)] from SMS.csv to (53041, 9504)
[(6202, 9504)] from Ransomware.csv to (56166, 9504)
[(2296, 9504)] from NoCategory.csv to (62368, 9504)
[(1538, 9504)] from Backdoor.csv to (64664, 9504)
[(13340, 9504)] from Zeroday.csv to (66202, 9504)
[(97349, 9504)] from Riskware.csv to (79542, 9504)
[(47210, 9504)] from Adware.csv to (176891, 9504)
[(2051, 9504)] from PUA.csv to (224101, 9504)
[(1556, 9504)] from Scareware.csv to (226152, 9504)
combined_data:	(227708, 9504)
CPU times: user 4min 10s, sys: 1min 3s, total: 5min 14s
Wall time: 5min 18s


array([   1,    2,    3, ..., 9501, 9502, 9503])

In [5]:
source_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9494,9495,9496,9497,9498,9499,9500,9501,9502,9503
0,Benign,52,0,0,44,3,1,0,2,1,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,Benign,71,0,0,24,8,0,1,11,1,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
2,Benign,688,0,0,76,7,3,4,13,3,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
3,Benign,295,0,0,134,7,3,6,14,3,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
4,Benign,319,0,0,87,4,6,6,10,1,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,Scareware,238,0,0,19,0,0,0,1,1,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1552,Scareware,202,0,0,17,0,0,0,1,1,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1553,Scareware,91,0,1,26,15,11,1,21,6,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1554,Scareware,92,0,0,30,6,4,0,5,4,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [6]:
data_x, data_y, map_label_num = preprocess_for_CICAndMal2020(source_data)
map_dict = {i[1]:i[0] for i in map_label_num}
print(map_label_num)
del source_data

Feature (227708, 9503) Label (227708,)
[('Zeroday', 14), ('Riskware', 9), ('Spy', 12), ('FileInfector', 5), ('Dropper', 4), ('Benign', 3), ('Ransomware', 8), ('NoCategory', 6), ('Scareware', 11), ('Backdoor', 1), ('SMS', 10), ('Trojan', 13), ('Adware', 0), ('PUA', 7), ('Banker', 2)]


In [7]:
data_x

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,9494,9495,9496,9497,9498,9499,9500,9501,9502,9503
0,52,0,0,44,3,1,0,2,1,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,71,0,0,24,8,0,1,11,1,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
2,688,0,0,76,7,3,4,13,3,2,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
3,295,0,0,134,7,3,6,14,3,3,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
4,319,0,0,87,4,6,6,10,1,1,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,238,0,0,19,0,0,0,1,1,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1552,202,0,0,17,0,0,0,1,1,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1553,91,0,1,26,15,11,1,21,6,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1554,92,0,0,30,6,4,0,5,4,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [8]:
data_x = data_x.values
data_y = data_y.values

In [9]:
''' 去掉这些值 '''
index = np.where(np.isnan(data_x))[0]
print("极小值", len(index))
data_x = np.delete(data_x, index, axis=0)
data_y = np.delete(data_y, index, axis=0)

index = np.where(np.isinf(data_x))[0]
print("极大值", len(index))
data_x = np.delete(data_x, index, axis=0)
data_y = np.delete(data_y, index, axis=0)

data_x.shape, data_y.shape

极小值 165
极大值 0


((227666, 9503), (227666,))

In [10]:
counter = collections.Counter(data_y)#source_dataset["attack_cat"]) 
print("dataset ==>", data_y.shape[0])
d = [(map_dict[i[0]], i[1]) for i in counter.most_common()]
print(tabulate(d, headers = ['Type','Occurences']))

dataset ==> 227666
Type            Occurences
------------  ------------
Riskware             97349
Adware               47198
Benign               32084
Trojan               13542
Zeroday              13327
Ransomware            6202
Spy                   3540
SMS                   3125
Dropper               2302
NoCategory            2296
PUA                   2051
Scareware             1556
Backdoor              1538
Banker                 887
FileInfector           669


In [11]:
X_pv, y_pv = stat([data_x, data_y], len(map_dict))
map_vec_cat = [map_dict[i] for i in range(len(map_dict))]
c = pd.DataFrame(y_pv, dtype=int, columns=map_vec_cat)
c

X=(227666, 9503), y=(227666,)
New X=(59440, 9503), Index_re=(227666,), Count=227666, maxRepeat=15741
return new_X=(59440, 9503), class_X=(59440, 15)
End


Unnamed: 0,Adware,Backdoor,Banker,Benign,Dropper,FileInfector,NoCategory,PUA,Ransomware,Riskware,SMS,Scareware,Spy,Trojan,Zeroday
0,0,0,0,0,0,0,0,0,0,15741,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,15588,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,6114,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,6107,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,3612,2363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59435,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
59436,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
59437,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
59438,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [12]:
del c

In [13]:
X = pd.DataFrame(X_pv, index=range(len(X_pv)), columns=col_name)
X.insert(0, 'id', range(1, len(X)+1), allow_duplicates=False)

y_pv01 = (y_pv>0) + 0
Y = pd.DataFrame(y_pv01, dtype=int, columns=["cat_"+i for i in map_vec_cat])

X = X.join(Y)
X

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,cat_FileInfector,cat_NoCategory,cat_PUA,cat_Ransomware,cat_Riskware,cat_SMS,cat_Scareware,cat_Spy,cat_Trojan,cat_Zeroday
0,1,84.0,0.0,0.0,14.0,22.0,11.0,2.0,28.0,12.0,...,0,0,0,0,1,0,0,0,0,0
1,2,84.0,0.0,0.0,15.0,22.0,13.0,3.0,31.0,13.0,...,0,0,0,0,1,0,0,0,0,0
2,3,2.0,0.0,0.0,1.0,8.0,4.0,8.0,35.0,23.0,...,0,0,0,0,1,0,0,0,0,0
3,4,309.0,0.0,0.0,4.0,13.0,10.0,13.0,37.0,18.0,...,0,0,0,0,1,0,0,0,0,0
4,5,23.0,0.0,0.0,12.0,5.0,2.0,1.0,30.0,5.0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59435,59436,333.0,0.0,0.0,5.0,6.0,2.0,0.0,4.0,1.0,...,0,0,0,0,0,0,0,0,0,0
59436,59437,333.0,0.0,0.0,5.0,3.0,3.0,3.0,14.0,1.0,...,0,0,0,0,0,0,0,0,0,0
59437,59438,333.0,0.0,0.0,5.0,1.0,2.0,0.0,4.0,1.0,...,0,0,0,0,0,0,0,0,0,0
59438,59439,333.0,0.0,0.0,4.0,14.0,5.0,4.0,10.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
""" 写入CSV文件 """
#将DataFrame存储为csv,index表示是否显示行名，default=True
X.to_csv("/dev-sdb2-data/xj-data/MLD-Model-Publicdata/CIC_AndMal2020_data_multi-label.csv",index=False)

In [19]:
X[:1000].to_csv("/dev-sdb2-data/xj-data/MLD-Model-Publicdata/partial_CIC_AndMal2020_data_multi-label.csv",index=False)