In [54]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import chi2, mutual_info_classif, RFE, SelectFromModel, SelectKBest, VarianceThreshold
from sklearn.linear_model import LogisticRegression
import warnings

In [2]:
# Notebook-wide display and runtime configuration.
# Render every column when a DataFrame is displayed.
pd.options.display.max_columns = None
# Make pandas treat +/-inf as missing values in its NA checks.
# NOTE(review): this option is deprecated in recent pandas releases — confirm the
# installed version still supports it.
pd.set_option('mode.use_inf_as_na', True)
# Silence all warnings from this point on.
warnings.filterwarnings(action='ignore')

In [3]:
# Raw string keeps the Windows backslashes literal: "\I", "\d" and "\T" are not valid
# escape sequences in a normal string literal.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
DATA_PATH = r'G:\IDS2018\datasets\Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv'
data = pd.read_csv(DATA_PATH)

# 数据预处理

In [4]:
# Split the columns by dtype: every column that is not object-typed counts as numerical,
# all remaining columns count as categorical.
numerical_fea = data.select_dtypes(exclude=['object']).columns.tolist()
category_fea = [col for col in data.columns if col not in numerical_fea]

In [5]:
def get_numerical_serial_fea(data, feas, threshold=10):
    """Split feature names into continuous-like and discrete-like groups.

    A feature is put in the discrete-like group when its column has at most
    ``threshold`` distinct values (as counted by ``nunique``), otherwise in the
    continuous-like group. The input order of ``feas`` is preserved in both groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``feas``.
    feas : iterable of str
        Column names to classify.
    threshold : int, default 10
        Largest number of distinct values for which a feature is still
        considered discrete-like.

    Returns
    -------
    tuple of (list, list)
        ``(numerical_serial_fea, numerical_noserial_fea)``: the continuous-like
        names first, the discrete-like names second.
    """
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        distinct_values = data[fea].nunique()
        bucket = numerical_noserial_fea if distinct_values <= threshold else numerical_serial_fea
        bucket.append(fea)
    return numerical_serial_fea, numerical_noserial_fea
# Partition the numerical columns into continuous-like and discrete-like feature lists.
numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(
    data=data,
    feas=numerical_fea,
)

In [36]:
# Count missing values per column, returned as a {column: count} dict.
# Presumably +/-inf entries are counted too, because the notebook enables
# `use_inf_as_na` in its configuration cell — TODO confirm.
data.isnull().sum().to_dict()

{'Dst Port': 0,
 'Protocol': 0,
 'Timestamp': 0,
 'Flow Duration': 0,
 'Tot Fwd Pkts': 0,
 'Tot Bwd Pkts': 0,
 'TotLen Fwd Pkts': 0,
 'TotLen Bwd Pkts': 0,
 'Fwd Pkt Len Max': 0,
 'Fwd Pkt Len Min': 0,
 'Fwd Pkt Len Mean': 0,
 'Fwd Pkt Len Std': 0,
 'Bwd Pkt Len Max': 0,
 'Bwd Pkt Len Min': 0,
 'Bwd Pkt Len Mean': 0,
 'Bwd Pkt Len Std': 0,
 'Flow Byts/s': 8027,
 'Flow Pkts/s': 8027,
 'Flow IAT Mean': 0,
 'Flow IAT Std': 0,
 'Flow IAT Max': 0,
 'Flow IAT Min': 0,
 'Fwd IAT Tot': 0,
 'Fwd IAT Mean': 0,
 'Fwd IAT Std': 0,
 'Fwd IAT Max': 0,
 'Fwd IAT Min': 0,
 'Bwd IAT Tot': 0,
 'Bwd IAT Mean': 0,
 'Bwd IAT Std': 0,
 'Bwd IAT Max': 0,
 'Bwd IAT Min': 0,
 'Fwd PSH Flags': 0,
 'Bwd PSH Flags': 0,
 'Fwd URG Flags': 0,
 'Bwd URG Flags': 0,
 'Fwd Header Len': 0,
 'Bwd Header Len': 0,
 'Fwd Pkts/s': 0,
 'Bwd Pkts/s': 0,
 'Pkt Len Min': 0,
 'Pkt Len Max': 0,
 'Pkt Len Mean': 0,
 'Pkt Len Std': 0,
 'Pkt Len Var': 0,
 'FIN Flag Cnt': 0,
 'SYN Flag Cnt': 0,
 'RST Flag Cnt': 0,
 'PSH Flag Cnt': 0,
 

In [6]:
# Impute the missing entries of the continuous-like features with each column's own mean.
data[numerical_serial_fea] = data[numerical_serial_fea].fillna(
    value=data[numerical_serial_fea].mean()
)

In [9]:
# Recount missing values per column as a {column: count} dict, to check the
# effect of the mean imputation.
data.isnull().sum().to_dict()

{'Dst Port': 0,
 'Protocol': 0,
 'Timestamp': 0,
 'Flow Duration': 0,
 'Tot Fwd Pkts': 0,
 'Tot Bwd Pkts': 0,
 'TotLen Fwd Pkts': 0,
 'TotLen Bwd Pkts': 0,
 'Fwd Pkt Len Max': 0,
 'Fwd Pkt Len Min': 0,
 'Fwd Pkt Len Mean': 0,
 'Fwd Pkt Len Std': 0,
 'Bwd Pkt Len Max': 0,
 'Bwd Pkt Len Min': 0,
 'Bwd Pkt Len Mean': 0,
 'Bwd Pkt Len Std': 0,
 'Flow Byts/s': 0,
 'Flow Pkts/s': 0,
 'Flow IAT Mean': 0,
 'Flow IAT Std': 0,
 'Flow IAT Max': 0,
 'Flow IAT Min': 0,
 'Fwd IAT Tot': 0,
 'Fwd IAT Mean': 0,
 'Fwd IAT Std': 0,
 'Fwd IAT Max': 0,
 'Fwd IAT Min': 0,
 'Bwd IAT Tot': 0,
 'Bwd IAT Mean': 0,
 'Bwd IAT Std': 0,
 'Bwd IAT Max': 0,
 'Bwd IAT Min': 0,
 'Fwd PSH Flags': 0,
 'Bwd PSH Flags': 0,
 'Fwd URG Flags': 0,
 'Bwd URG Flags': 0,
 'Fwd Header Len': 0,
 'Bwd Header Len': 0,
 'Fwd Pkts/s': 0,
 'Bwd Pkts/s': 0,
 'Pkt Len Min': 0,
 'Pkt Len Max': 0,
 'Pkt Len Mean': 0,
 'Pkt Len Std': 0,
 'Pkt Len Var': 0,
 'FIN Flag Cnt': 0,
 'SYN Flag Cnt': 0,
 'RST Flag Cnt': 0,
 'PSH Flag Cnt': 0,
 'ACK F

In [7]:
# Parse the textual timestamps (day/month/year hour:minute:second) into datetime values.
data['Timestamp'] = pd.to_datetime(
    data['Timestamp'],
    format='%d/%m/%Y %H:%M:%S',
)

In [8]:
# Encode the target as binary: benign traffic -> 0, either DoS attack family -> 1.
# The result is assigned back to the column instead of calling
# `data['Label'].replace(..., inplace=True)`: an in-place call on a selected column
# can act on a temporary object and leave `data` unchanged (e.g. under pandas
# Copy-on-Write).
label_mapping = {
    'Benign': 0,
    'DoS attacks-GoldenEye': 1,
    'DoS attacks-Slowloris': 1,
}
# `infer_objects` gives the column an integer dtype once every label has been
# replaced; labels that are not in the mapping are left as they are.
data['Label'] = data['Label'].replace(label_mapping).infer_objects()

# 异常值处理

In [10]:
def find_outliers_by_3segama(data, fea, n_sigma=3):
    """Flag the values of one column that lie outside mean +/- ``n_sigma`` standard deviations.

    A column named ``fea + '_outliers'`` is written into ``data`` (the frame is
    modified in place). An entry is ``'异常值'`` (outlier) when the value is strictly
    above ``mean + n_sigma * std`` or strictly below ``mean - n_sigma * std``, and
    ``'正常值'`` (normal) otherwise. Values for which neither comparison holds, such
    as NaN, are therefore labelled ``'正常值'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column ``fea``.
    fea : str
        Name of the column to inspect.
    n_sigma : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the flag column added.
    """
    values = data[fea]
    data_std = np.std(values)
    data_mean = np.mean(values)
    outliers_cut_off = n_sigma * data_std
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # Vectorised comparison over the whole column instead of a per-row Python lambda.
    is_outlier = (values > upper_rule) | (values < lower_rule)
    data[fea + '_outliers'] = np.where(is_outlier, '异常值', '正常值')
    return data

In [27]:
# For each continuous-like feature: add its outlier flag column, then print how many
# rows fall into each flag value and the sum of 'Label' per flag value.
for fea in numerical_serial_fea:
    data = find_outliers_by_3segama(data, fea)
    print(data[fea + '_outliers'].value_counts())
    print(
        data.groupby(fea + '_outliers')['Label'].sum()
    )
    print('*' * 10)

正常值    1039789
异常值       8786
Name: Dst Port_outliers, dtype: int64
Dst Port_outliers
异常值        0
正常值    52498
Name: Label, dtype: int64
**********
正常值    991764
异常值     56811
Name: Flow Duration_outliers, dtype: int64
Flow Duration_outliers
异常值      280
正常值    52218
Name: Label, dtype: int64
**********
正常值    1046745
异常值       1830
Name: Tot Fwd Pkts_outliers, dtype: int64
Tot Fwd Pkts_outliers
异常值        0
正常值    52498
Name: Label, dtype: int64
**********
正常值    1046952
异常值       1623
Name: Tot Bwd Pkts_outliers, dtype: int64
Tot Bwd Pkts_outliers
异常值        0
正常值    52498
Name: Label, dtype: int64
**********
正常值    1048549
异常值         26
Name: TotLen Fwd Pkts_outliers, dtype: int64
TotLen Fwd Pkts_outliers
异常值        0
正常值    52498
Name: Label, dtype: int64
**********
正常值    1047070
异常值       1505
Name: TotLen Bwd Pkts_outliers, dtype: int64
TotLen Bwd Pkts_outliers
异常值        0
正常值    52498
Name: Label, dtype: int64
**********
正常值    1038021
异常值      10554
Name: Fwd Pkt Len Max_ou

正常值    1034351
异常值      14224
Name: Active Std_outliers, dtype: int64
Active Std_outliers
异常值     4935
正常值    47563
Name: Label, dtype: int64
**********
正常值    1032734
异常值      15841
Name: Active Max_outliers, dtype: int64
Active Max_outliers
异常值     5557
正常值    46941
Name: Label, dtype: int64
**********
正常值    1035242
异常值      13333
Name: Active Min_outliers, dtype: int64
Active Min_outliers
异常值     5463
正常值    47035
Name: Label, dtype: int64
**********
正常值    996157
异常值     52418
Name: Idle Mean_outliers, dtype: int64
Idle Mean_outliers
异常值     4229
正常值    48269
Name: Label, dtype: int64
**********
正常值    1037445
异常值      11130
Name: Idle Std_outliers, dtype: int64
Idle Std_outliers
异常值     4788
正常值    47710
Name: Label, dtype: int64
**********
正常值    988121
异常值     60454
Name: Idle Max_outliers, dtype: int64
Idle Max_outliers
异常值     8686
正常值    43812
Name: Label, dtype: int64
**********
正常值    996490
异常值     52085
Name: Idle Min_outliers, dtype: int64
Idle Min_outliers
异常值     4277

# 特征选择

In [47]:
# Variance filter over the continuous-like features, with the threshold set to 3.
selector = VarianceThreshold(threshold=3).fit(data[numerical_serial_fea])
features_mask = selector.get_support(indices=True)
selected_features = np.array(numerical_serial_fea)[features_mask]
print('Selected:', selected_features)
# The boolean support mask marks kept features; everything else was removed.
print(
    'Deleted: ',
    [fea for fea, kept in zip(numerical_serial_fea, selector.get_support()) if not kept],
)

Selected: ['Dst Port' 'Flow Duration' 'Tot Fwd Pkts' 'Tot Bwd Pkts'
 'TotLen Fwd Pkts' 'TotLen Bwd Pkts' 'Fwd Pkt Len Max' 'Fwd Pkt Len Min'
 'Fwd Pkt Len Mean' 'Fwd Pkt Len Std' 'Bwd Pkt Len Max' 'Bwd Pkt Len Min'
 'Bwd Pkt Len Mean' 'Bwd Pkt Len Std' 'Flow Byts/s' 'Flow Pkts/s'
 'Flow IAT Mean' 'Flow IAT Std' 'Flow IAT Max' 'Flow IAT Min'
 'Fwd IAT Tot' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min'
 'Bwd IAT Tot' 'Bwd IAT Mean' 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min'
 'Fwd Header Len' 'Bwd Header Len' 'Fwd Pkts/s' 'Bwd Pkts/s' 'Pkt Len Min'
 'Pkt Len Max' 'Pkt Len Mean' 'Pkt Len Std' 'Pkt Len Var' 'Pkt Size Avg'
 'Fwd Seg Size Avg' 'Bwd Seg Size Avg' 'Subflow Fwd Pkts'
 'Subflow Fwd Byts' 'Subflow Bwd Pkts' 'Subflow Bwd Byts'
 'Init Fwd Win Byts' 'Init Bwd Win Byts' 'Fwd Act Data Pkts'
 'Fwd Seg Size Min' 'Active Mean' 'Active Std' 'Active Max' 'Active Min'
 'Idle Mean' 'Idle Std' 'Idle Max' 'Idle Min']
Deleted:  ['Down/Up Ratio']


In [117]:
# Pearson correlation of every continuous-like feature with the label; each entry is
# (feature name, pearsonr result), where the result's first item is the coefficient.
pearsonr_result = [
    (fea, pearsonr(data[fea], data['Label'])) for fea in numerical_serial_fea
]
# Rank by the absolute coefficient: a strongly negative correlation is as informative
# as a strongly positive one, and ranking by the signed value would drop it from the
# top 10.
sorted(pearsonr_result, key=lambda item: abs(item[1][0]), reverse=True)[:10]

[('Fwd Seg Size Min', (0.4827216248545034, 0.0)),
 ('Bwd IAT Mean', (0.30192251129739955, 0.0)),
 ('Init Fwd Win Byts', (0.2684559690822708, 0.0)),
 ('Bwd IAT Min', (0.2500177147030524, 0.0)),
 ('Flow IAT Std', (0.21858340497356774, 0.0)),
 ('Bwd IAT Max', (0.19606408350148533, 0.0)),
 ('Bwd IAT Std', (0.18719441138249784, 0.0)),
 ('Idle Max', (0.18100280839192964, 0.0)),
 ('Fwd Pkt Len Std', (0.16886516326958462, 0.0)),
 ('Idle Std', (0.15481787043716266, 0.0))]

In [21]:
# Run chi2 on each continuous-like feature separately to find the features it cannot
# be applied to. 'Init Fwd Win Byts' and 'Init Bwd Win Byts' are skipped
# (presumably because chi2 fails on them — TODO confirm the reason).
for fea in numerical_serial_fea:
    if fea in ('Init Fwd Win Byts', 'Init Bwd Win Byts'):
        continue
    print(fea)
    print(
        chi2(
            np.array(data[fea]).reshape(-1, 1),
            np.array(data['Label']).reshape(-1, 1),
        )
    )

Dst Port
(array([3.99751145e+08]), array([0.]))
Flow Duration
(array([3.59172003e+11]), array([0.]))
Tot Fwd Pkts
(array([3778.26711379]), array([0.]))
Tot Bwd Pkts
(array([147047.64832782]), array([0.]))
TotLen Fwd Pkts
(array([634718.39574114]), array([0.]))
TotLen Bwd Pkts
(array([2.24665013e+08]), array([0.]))
Fwd Pkt Len Max
(array([3126399.65719183]), array([0.]))
Fwd Pkt Len Min
(array([672290.30465466]), array([0.]))
Fwd Pkt Len Mean
(array([1008178.14621648]), array([0.]))
Fwd Pkt Len Std
(array([4910010.25013736]), array([0.]))
Bwd Pkt Len Max
(array([2426276.39917122]), array([0.]))
Bwd Pkt Len Min
(array([1859688.16916162]), array([0.]))
Bwd Pkt Len Mean
(array([14682.06813255]), array([0.]))
Bwd Pkt Len Std
(array([6324182.09067248]), array([0.]))
Flow Byts/s
(array([1.5768512e+10]), array([0.]))
Flow Pkts/s
(array([1.00582909e+09]), array([0.]))
Flow IAT Mean
(array([2.96727475e+11]), array([0.]))
Flow IAT Std
(array([9.98169939e+11]), array([0.]))
Flow IAT Max
(array([1.

In [49]:
# Keep the 5 features with the highest chi2 score. The two window-size features are
# left out of the candidate list.
chi2_test_fea = [
    fea for fea in numerical_serial_fea
    if fea != 'Init Fwd Win Byts' and fea != 'Init Bwd Win Byts'
]
selector = SelectKBest(chi2, k=5).fit(data[chi2_test_fea], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(chi2_test_fea)[features_mask]
print('Selected:', selected_features)

Selected: ['Bwd IAT Mean' 'Bwd IAT Max' 'Bwd IAT Min' 'Idle Mean' 'Idle Max']


In [50]:
def seeded_mutual_info_classif(X, y):
    """Score features with ``mutual_info_classif`` under a fixed random seed.

    ``mutual_info_classif`` uses random numbers internally, so an unseeded call can
    rank features differently from one run to the next.
    """
    return mutual_info_classif(X, y, random_state=0)


# Keep the 5 continuous-like features with the highest mutual information with the label.
selector = SelectKBest(seeded_mutual_info_classif, k=5)
selector = selector.fit(data[numerical_serial_fea], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(numerical_serial_fea)[features_mask]
print('Selected:', selected_features)

Selected: ['Flow IAT Max' 'Fwd Header Len' 'Pkt Len Max' 'Init Fwd Win Byts'
 'Fwd Seg Size Min']


In [56]:
# Candidate features: every numeric column except the target. Selecting by dtype keeps
# string-valued helper columns (such as the '<feature>_outliers' flags written by
# find_outliers_by_3segama) away from the estimator, which needs numeric input.
features = [
    fea for fea in data.select_dtypes(include='number').columns
    if fea not in ['Timestamp', 'Label']
]
# Recursive feature elimination down to 5 features, removing one feature per step.
# The tree is seeded so that repeated runs select the same features.
selector = RFE(DecisionTreeClassifier(random_state=0), n_features_to_select=5, step=1)
selector = selector.fit(data[features], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(features)[features_mask]
print('Selected:', selected_features)

Selected: ['Dst Port' 'Fwd Pkts/s' 'Down/Up Ratio' 'Init Fwd Win Byts'
 'Fwd Seg Size Min']


In [52]:
# Embedded selection driven by an L2-penalised logistic regression (C=10) fitted on
# `features`.
# NOTE(review): no feature scaling is visible before this fit, and coefficient
# magnitudes depend on feature scale — confirm this is intended.
selector = SelectFromModel(LogisticRegression(penalty='l2', C=10)).fit(
    data[features],
    data['Label'],
)
features_mask = selector.get_support(indices=True)
selected_features = np.array(features)[features_mask]
print('Selected:', selected_features)

Selected: ['Dst Port' 'Flow Duration' 'Flow Byts/s' 'Flow Pkts/s' 'Flow IAT Max'
 'Fwd IAT Tot' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min'
 'Bwd IAT Tot' 'Bwd IAT Mean' 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min'
 'Fwd Pkts/s' 'Pkt Len Var' 'Idle Mean' 'Idle Std' 'Idle Max' 'Idle Min']


In [55]:
# Embedded selection driven by the feature importances of a decision tree fitted on
# `features`. The tree is seeded so that repeated runs select the same features.
selector = SelectFromModel(DecisionTreeClassifier(random_state=0))
selector = selector.fit(data[features], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(features)[features_mask]
print('Selected:', selected_features)

Selected: ['Dst Port' 'Fwd Seg Size Min']
