In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import numpy as np
import joblib

In [2]:
file_path='/Users/anguschen/Library/CloudStorage/OneDrive-國立宜蘭大學/無線網路/UNSW_NB15/'

# 讀取CSV檔案
data_train = pd.read_csv(file_path + 'training.csv')
data_test = pd.read_csv(file_path + 'testing.csv')

# 列出 dataset 的資訊
print("Training Data :", data_train.shape)
for column in data_train.columns:
    unique_values = data_train[column].unique()
    print(f"'{column}' ",end=" ")

print("\n", "Testing Data :", data_test.shape)
for column in data_test.columns:
    unique_values = data_test[column].unique()
    print(f"'{column}' ",end=" ")

Training Data : (82332, 45)
'id'  'dur'  'proto'  'service'  'state'  'spkts'  'dpkts'  'sbytes'  'dbytes'  'rate'  'sttl'  'dttl'  'sload'  'dload'  'sloss'  'dloss'  'sinpkt'  'dinpkt'  'sjit'  'djit'  'swin'  'stcpb'  'dtcpb'  'dwin'  'tcprtt'  'synack'  'ackdat'  'smean'  'dmean'  'trans_depth'  'response_body_len'  'ct_srv_src'  'ct_state_ttl'  'ct_dst_ltm'  'ct_src_dport_ltm'  'ct_dst_sport_ltm'  'ct_dst_src_ltm'  'is_ftp_login'  'ct_ftp_cmd'  'ct_flw_http_mthd'  'ct_src_ltm'  'ct_srv_dst'  'is_sm_ips_ports'  'attack_cat'  'label'  
 Testing Data : (175341, 45)
'id'  'dur'  'proto'  'service'  'state'  'spkts'  'dpkts'  'sbytes'  'dbytes'  'rate'  'sttl'  'dttl'  'sload'  'dload'  'sloss'  'dloss'  'sinpkt'  'dinpkt'  'sjit'  'djit'  'swin'  'stcpb'  'dtcpb'  'dwin'  'tcprtt'  'synack'  'ackdat'  'smean'  'dmean'  'trans_depth'  'response_body_len'  'ct_srv_src'  'ct_state_ttl'  'ct_dst_ltm'  'ct_src_dport_ltm'  'ct_dst_sport_ltm'  'ct_dst_src_ltm'  'is_ftp_login'  'ct_ftp_cmd'  

In [3]:
# 刪除 'id' 列
for data in [data_train, data_test]:
    if 'id' in data.columns:
        data.drop(columns=['id'], inplace=True)

print("Training Data :", data_train.shape)
print("Testing Data :", data_test.shape)

Training Data : (82332, 44)
Testing Data : (175341, 44)


In [4]:
# 合併訓練集和測試集
merged_data = pd.concat([data_train, data_test], ignore_index=True)
print("Merged Data :", merged_data.shape)

Merged Data : (257673, 44)


In [5]:
# 假設 'label' 目標屬性
X = merged_data.drop(columns=['label'])
y = merged_data['label']

# 選擇要進行 One-Hot Encoding 的類別型特徵
categories = ['proto', 'service', 'state', 'attack_cat']

# 使用 get_dummies() 函數對類別型特徵進行 One-Hot Encoding
data_encoded = pd.get_dummies(X, columns=categories)

# 重新組合特徵
X = pd.concat([X.drop(columns=categories), data_encoded], axis=1)

print(X.info())
print(y.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Columns: 245 entries, dur to attack_cat_Worms
dtypes: bool(167), float64(22), int64(56)
memory usage: 194.4 MB
None
<class 'pandas.core.series.Series'>
RangeIndex: 257673 entries, 0 to 257672
Series name: label
Non-Null Count   Dtype
--------------   -----
257673 non-null  int64
dtypes: int64(1)
memory usage: 2.0 MB
None


In [6]:
# 標準化資料
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 顯示標準化後的部分資料以確認結果
print(pd.DataFrame(X_scaled, columns=X.columns).head())
print(type(X_scaled))
print(X_scaled.shape)

        dur     spkts     dpkts    sbytes    dbytes      rate      sttl  \
0 -0.208678 -0.130765 -0.165331 -0.046480 -0.098409 -0.002151  0.722026   
1 -0.208679 -0.130765 -0.165331 -0.039194 -0.098409  0.210460  0.722026   
2 -0.208679 -0.130765 -0.165331 -0.043188 -0.098409  0.678204  0.722026   
3 -0.208679 -0.130765 -0.165331 -0.044155 -0.098409  0.470318  0.722026   
4 -0.208678 -0.130765 -0.165331 -0.037100 -0.098409  0.054546  0.722026   

       dttl     sload    dload  ...  attack_cat_Analysis  attack_cat_Backdoor  \
0 -0.751628  0.590935 -0.27285  ...            -0.102461            -0.095504   
1 -0.751628  4.363255 -0.27285  ...            -0.102461            -0.095504   
2 -0.751628  4.220037 -0.27285  ...            -0.102461            -0.095504   
3 -0.751628  2.850314 -0.27285  ...            -0.102461            -0.095504   
4 -0.751628  4.198501 -0.27285  ...            -0.102461            -0.095504   

   attack_cat_DoS  attack_cat_Exploits  attack_cat_Fuzzers  \


In [7]:
# 將資料分割為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 初始化核支持向量機（Kernel SVM）模型，這裡使用 RBF 核函數
svm_model = SVC(kernel='rbf', random_state=42)

# 訓練核支持向量機模型
svm_model.fit(X_train, y_train)

In [8]:
# 進行 K-fold cross-validation for each evaluation metric
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
num_folds = 5

for metric in scoring_metrics:
    cv_scores = cross_val_score(svm_model, X_train, y_train, cv=num_folds, scoring=metric)
    avg_score = np.mean(cv_scores)
    print(f"{metric.capitalize()} Scores: {', '.join([f'{score:.5f}' for score in cv_scores])}")
    print(f"Average {num_folds}-fold cross-validation {metric}: {avg_score:.5f}\n")

Accuracy Scores: 0.99983, 0.99990, 0.99983, 0.99985, 0.99971
Average 5-fold cross-validation accuracy: 0.99983

Precision Scores: 0.99977, 0.99996, 0.99985, 0.99985, 0.99962
Average 5-fold cross-validation precision: 0.99981

Recall Scores: 0.99996, 0.99989, 0.99989, 0.99992, 0.99992
Average 5-fold cross-validation recall: 0.99992

F1 Scores: 0.99987, 0.99992, 0.99987, 0.99989, 0.99977
Average 5-fold cross-validation f1: 0.99986

Roc_auc Scores: 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
Average 5-fold cross-validation roc_auc: 1.00000



In [9]:
# 預測測試集
pred_test = svm_model.predict(X_test)

# 計算各種指標
test_accuracy = accuracy_score(y_test, pred_test)
test_precision = precision_score(y_test, pred_test)
test_recall = recall_score(y_test, pred_test)
test_f1 = f1_score(y_test, pred_test)
test_roc_auc = roc_auc_score(y_test, pred_test)
test_conf_matrix = confusion_matrix(y_test, pred_test)

# 顯示測試集的模型效能指標
print("Testing performance:\n")
print(f"Accuracy: {test_accuracy*100:.4f}%", "\n")
print(f"Precision: {test_precision*100:.4f}%", "\n")
print(f"Recall: {test_recall*100:.4f}%", "\n")
print(f"F1-Score: {test_f1*100:.4f}%", "\n")
print(f"ROC AUC: {test_roc_auc*100:.4f}%", "\n")
print("Confusion Matrix:\n", test_conf_matrix)

Testing performance:

Accuracy: 99.9787% 

Precision: 99.9757% 

Recall: 99.9909% 

F1-Score: 99.9833% 

ROC AUC: 99.9740% 

Confusion Matrix:
 [[18667     8]
 [    3 32857]]


In [10]:
# 儲存模型
save = input("儲存模型?(y/n): ")

if save.lower() == 'y':
    joblib.dump(svm_model, file_path + 'kernel_svm_model.pkl')
    print("Model Saved!")
elif save.lower() == 'n':
    print("Done!")
else:
    print("Invalid input. Not Saved.")

Model Saved!
