In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from IPython.display import display
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# 匯入資料 「防火牆判斷是否要讓網路封包通過」
df = pd.read_csv("internet-firewall-dataset.csv")
df.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [3]:
# 檢查資料
print ("空值：", df.isnull().values.sum())
print ("資料筆數：", df.shape)
print ("資料欄位名稱：\n", df.keys())
print ("第一筆資料：\n", df.iloc[0,::])
print ("資料型態：\n", df.dtypes)

空值： 0
資料筆數： (65532, 12)
資料欄位名稱：
 Index(['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received'],
      dtype='object')
第一筆資料：
 Source Port             57222
Destination Port           53
NAT Source Port         54587
NAT Destination Port       53
Action                  allow
Bytes                     177
Bytes Sent                 94
Bytes Received             83
Packets                     2
Elapsed Time (sec)         30
pkts_sent                   1
pkts_received               1
Name: 0, dtype: object
資料型態：
 Source Port              int64
Destination Port         int64
NAT Source Port          int64
NAT Destination Port     int64
Action                  object
Bytes                    int64
Bytes Sent               int64
Bytes Received           int64
Packets                  int64
Elapsed Time (sec)       int64
pkts_sent       

In [4]:
# 共線性檢查-相關係數
display(df.corr().style.applymap(lambda x: 'color :red ' if 1 > x > 0.7 else  'color : black'))
# 刪除相關性高的自變數
df2 = df.drop(columns=['Bytes','Packets','pkts_sent','pkts_received'])
display(df2.corr().style.applymap(lambda x: 'color :red ' if 1 > x > 0.7 else  'color : black'))

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
Source Port,1.0,-0.332246,0.145391,-0.024843,0.000221,-0.000931,0.00195,-0.001742,-0.046515,-0.001422,-0.001962
Destination Port,-0.332246,1.0,-0.281676,0.410042,-0.005297,0.001675,-0.014684,-0.006063,0.023537,-0.002134,-0.010909
NAT Source Port,0.145391,-0.281676,1.0,0.178435,0.010659,0.002242,0.020827,0.012633,0.141485,0.00718,0.018772
NAT Destination Port,-0.024843,0.410042,0.178435,1.0,0.003975,0.007904,-0.003216,0.004605,0.219776,0.006136,0.001747
Bytes,0.000221,-0.005297,0.010659,0.003975,1.0,0.933462,0.830225,0.974379,0.148834,0.966548,0.850209
Bytes Sent,-0.000931,0.001675,0.002242,0.007904,0.933462,1.0,0.575047,0.887596,0.126039,0.973976,0.639098
Bytes Received,0.00195,-0.014684,0.020827,-0.003216,0.830225,0.575047,1.0,0.843067,0.143601,0.690959,0.946039
Packets,-0.001742,-0.006063,0.012633,0.004605,0.974379,0.887596,0.843067,1.0,0.147074,0.961286,0.916978
Elapsed Time (sec),-0.046515,0.023537,0.141485,0.219776,0.148834,0.126039,0.143601,0.147074,1.0,0.135101,0.143954
pkts_sent,-0.001422,-0.002134,0.00718,0.006136,0.966548,0.973976,0.690959,0.961286,0.135101,1.0,0.77155


Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes Sent,Bytes Received,Elapsed Time (sec)
Source Port,1.0,-0.332246,0.145391,-0.024843,-0.000931,0.00195,-0.046515
Destination Port,-0.332246,1.0,-0.281676,0.410042,0.001675,-0.014684,0.023537
NAT Source Port,0.145391,-0.281676,1.0,0.178435,0.002242,0.020827,0.141485
NAT Destination Port,-0.024843,0.410042,0.178435,1.0,0.007904,-0.003216,0.219776
Bytes Sent,-0.000931,0.001675,0.002242,0.007904,1.0,0.575047,0.126039
Bytes Received,0.00195,-0.014684,0.020827,-0.003216,0.575047,1.0,0.143601
Elapsed Time (sec),-0.046515,0.023537,0.141485,0.219776,0.126039,0.143601,1.0


In [5]:
# 欄位'Action'轉換為數值
LE = LabelEncoder()
temp = df2['Action']
df2['Action'] = LE.fit_transform(df2['Action'])
print (set(zip(temp,df2['Action'])))

df2['Action'].replace({2: 1, 3: 1}, inplace=True)
print (set(zip(temp,df2['Action'])))

{('allow', 0), ('reset-both', 3), ('deny', 1), ('drop', 2)}
{('allow', 0), ('reset-both', 1), ('drop', 1), ('deny', 1)}


In [6]:
# 設定 X、Y
x = df2.loc[:,~df2.columns.isin(['Action'])]
y = df2.loc[:,df2.columns.isin(['Action'])]
# 共線性檢查-Variance Inflation Factor
VIF = pd.DataFrame(columns=['VIF'],index = x.columns)
VIF['VIF'] = [variance_inflation_factor(x.values,i) for i in range(len(x.columns))]
VIF.style.applymap(lambda x : 'color : red' if x >= 10 else 'color : black')

Unnamed: 0,VIF
Source Port,2.163898
Destination Port,1.559872
NAT Source Port,2.036032
NAT Destination Port,1.474508
Bytes Sent,1.498988
Bytes Received,1.508764
Elapsed Time (sec),1.13857


In [7]:
# 建立訓練集、測試集
train_x, test_x , train_y, test_y = train_test_split(x,y,test_size=0.3,random_state=42)

In [8]:
# LinearSVC
model = svm.LinearSVC(C=1)
model.fit(train_x,train_y)
print ('正確率：', model.score(test_x ,test_y))
pred_y = model.predict(test_x)
print(confusion_matrix(test_y , pred_y))

  y = column_or_1d(y, warn=True)


正確率： 0.9846388606307223
[[11071   259]
 [   43  8287]]




In [9]:
# SVM_linear
model = svm.SVC(kernel = 'linear')
model.fit(train_x,train_y)
print ('正確率：', model.score(test_x ,test_y))
pred_y = model.predict(test_x)
print(confusion_matrix(test_y , pred_y))

  y = column_or_1d(y, warn=True)


正確率： 0.9980162767039674
[[11301    29]
 [   10  8320]]


In [10]:
# SVM_rbf
model = svm.SVC(kernel = 'rbf')
model.fit(train_x,train_y)
print ('正確率：', model.score(test_x ,test_y))
pred_y = model.predict(test_x)
print(confusion_matrix(test_y , pred_y))

  y = column_or_1d(y, warn=True)


正確率： 0.8891658189216683
[[9155 2175]
 [   4 8326]]


In [11]:
# SVM_poly
model = svm.SVC(kernel = 'poly')
model.fit(train_x,train_y)
print ('正確率：', model.score(test_x ,test_y))
pred_y = model.predict(test_x)
print(confusion_matrix(test_y , pred_y))

  y = column_or_1d(y, warn=True)


正確率： 0.5762970498474059
[[11330     0]
 [ 8330     0]]
