In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import numpy as np
import joblib

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
file_path='/content/drive/MyDrive/Dataset/NUSW-NB15/'

# 讀取CSV檔案
data_train = pd.read_csv(file_path + 'training-set.csv')
data_test = pd.read_csv(file_path + 'testing-set.csv')

# 列出 dataset 的資訊
print("Training Data :", data_train.shape)
for column in data_train.columns:
    unique_values = data_train[column].unique()
    print(f"'{column}' ",end=" ")

print("\n", "Testing Data :", data_test.shape)
for column in data_test.columns:
    unique_values = data_test[column].unique()
    print(f"'{column}' ",end=" ")

Training Data : (82332, 45)
'id'  'dur'  'proto'  'service'  'state'  'spkts'  'dpkts'  'sbytes'  'dbytes'  'rate'  'sttl'  'dttl'  'sload'  'dload'  'sloss'  'dloss'  'sinpkt'  'dinpkt'  'sjit'  'djit'  'swin'  'stcpb'  'dtcpb'  'dwin'  'tcprtt'  'synack'  'ackdat'  'smean'  'dmean'  'trans_depth'  'response_body_len'  'ct_srv_src'  'ct_state_ttl'  'ct_dst_ltm'  'ct_src_dport_ltm'  'ct_dst_sport_ltm'  'ct_dst_src_ltm'  'is_ftp_login'  'ct_ftp_cmd'  'ct_flw_http_mthd'  'ct_src_ltm'  'ct_srv_dst'  'is_sm_ips_ports'  'attack_cat'  'label'  
 Testing Data : (175341, 45)
'id'  'dur'  'proto'  'service'  'state'  'spkts'  'dpkts'  'sbytes'  'dbytes'  'rate'  'sttl'  'dttl'  'sload'  'dload'  'sloss'  'dloss'  'sinpkt'  'dinpkt'  'sjit'  'djit'  'swin'  'stcpb'  'dtcpb'  'dwin'  'tcprtt'  'synack'  'ackdat'  'smean'  'dmean'  'trans_depth'  'response_body_len'  'ct_srv_src'  'ct_state_ttl'  'ct_dst_ltm'  'ct_src_dport_ltm'  'ct_dst_sport_ltm'  'ct_dst_src_ltm'  'is_ftp_login'  'ct_ftp_cmd'  

In [4]:
# 刪除 'id' 列
for data in [data_train, data_test]:
    if 'id' in data.columns:
        data.drop(columns=['id'], inplace=True)

print("Training Data :", data_train.shape)
print("Testing Data :", data_test.shape)

Training Data : (82332, 44)
Testing Data : (175341, 44)


In [5]:
# 合併訓練集和測試集
merged_data = pd.concat([data_train, data_test], ignore_index=True)
print("Merged Data :", merged_data.shape)

Merged Data : (257673, 44)


In [6]:
# 選擇要進行 One-Hot Encoding 的類別型特徵
categories = ['proto', 'service', 'state', 'attack_cat']

# 使用 get_dummies() 函數對類別型特徵進行 One-Hot Encoding
data_encoded = pd.get_dummies(merged_data, columns=categories)

merged_data.drop(columns=categories, inplace=True)

# 將 One-Hot Encoding 後的資料加回到 merged_data 中
merged_data = pd.concat([merged_data, data_encoded], axis=1)

# 顯示 merged_data 的部分資料以確認結果
print(merged_data.info())
print(pd.DataFrame(merged_data, columns=merged_data.columns).head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Columns: 247 entries, dur to attack_cat_Worms
dtypes: float64(22), int64(58), uint8(167)
memory usage: 198.3 MB
None
        dur  spkts  dpkts  sbytes  dbytes         rate  sttl  dttl  \
0  0.000011      2      0     496       0   90909.0902   254     0   
1  0.000008      2      0    1762       0  125000.0003   254     0   
2  0.000005      2      0    1068       0  200000.0051   254     0   
3  0.000006      2      0     900       0  166666.6608   254     0   
4  0.000010      2      0    2126       0  100000.0025   254     0   

         sload  dload  ...  attack_cat_Analysis  attack_cat_Backdoor  \
0  180363632.0    0.0  ...                    0                    0   
1  881000000.0    0.0  ...                    0                    0   
2  854400000.0    0.0  ...                    0                    0   
3  600000000.0    0.0  ...                    0                    0   
4  850400000.0    0.0  .

In [7]:
# 假設 'label' 目標屬性
X = merged_data.drop(columns=['attack_cat_DoS'])#label
y = merged_data['attack_cat_DoS']
print(X.info())
print(y.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Columns: 246 entries, dur to attack_cat_Worms
dtypes: float64(22), int64(58), uint8(166)
memory usage: 198.1 MB
None
<class 'pandas.core.series.Series'>
RangeIndex: 257673 entries, 0 to 257672
Series name: attack_cat_DoS
Non-Null Count   Dtype
--------------   -----
257673 non-null  uint8
dtypes: uint8(1)
memory usage: 251.8 KB
None


In [8]:
# 標準化資料
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 顯示標準化後的部分資料以確認結果
print(pd.DataFrame(X_scaled, columns=X.columns).head())
print(type(X_scaled))
print(X_scaled.shape)

        dur     spkts     dpkts    sbytes    dbytes      rate      sttl  \
0 -0.208678 -0.130765 -0.165331 -0.046480 -0.098409 -0.002151  0.722026   
1 -0.208679 -0.130765 -0.165331 -0.039194 -0.098409  0.210460  0.722026   
2 -0.208679 -0.130765 -0.165331 -0.043188 -0.098409  0.678204  0.722026   
3 -0.208679 -0.130765 -0.165331 -0.044155 -0.098409  0.470318  0.722026   
4 -0.208678 -0.130765 -0.165331 -0.037100 -0.098409  0.054546  0.722026   

       dttl     sload    dload  ...  state_no  attack_cat_Analysis  \
0 -0.751628  0.590935 -0.27285  ...  -0.00197            -0.102461   
1 -0.751628  4.363255 -0.27285  ...  -0.00197            -0.102461   
2 -0.751628  4.220037 -0.27285  ...  -0.00197            -0.102461   
3 -0.751628  2.850314 -0.27285  ...  -0.00197            -0.102461   
4 -0.751628  4.198501 -0.27285  ...  -0.00197            -0.102461   

   attack_cat_Backdoor  attack_cat_Exploits  attack_cat_Fuzzers  \
0            -0.095504            -0.457047           -0.3222

In [9]:
# 將資料分割為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 初始化核支持向量機（Kernel SVM）模型，這裡使用 RBF 核函數
svm_model = SVC(kernel='rbf', random_state=42)

# 訓練核支持向量機模型
svm_model.fit(X_train, y_train)

In [None]:
# 進行 K-fold cross-validation for each evaluation metric
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
num_folds = 5

for metric in scoring_metrics:
    cv_scores = cross_val_score(svm_model, X_train, y_train, cv=num_folds, scoring=metric)
    avg_score = np.mean(cv_scores)
    print(f"{metric.capitalize()} Scores: {', '.join([f'{score:.5f}' for score in cv_scores])}")
    print(f"Average {num_folds}-fold cross-validation {metric}: {avg_score:.5f}\n")

In [10]:
# 預測測試集
pred_test = svm_model.predict(X_test)

# 計算各種指標
test_accuracy = accuracy_score(y_test, pred_test)
test_precision = precision_score(y_test, pred_test)
test_recall = recall_score(y_test, pred_test)
test_f1 = f1_score(y_test, pred_test)
test_roc_auc = roc_auc_score(y_test, pred_test)
test_conf_matrix = confusion_matrix(y_test, pred_test)

# 顯示測試集的模型效能指標
print("Testing performance:\n")
print(f"Accuracy: {test_accuracy*100:.4f}%", "\n")
print(f"Precision: {test_precision*100:.4f}%", "\n")
print(f"Recall: {test_recall*100:.4f}%", "\n")
print(f"F1-Score: {test_f1*100:.4f}%", "\n")
print(f"ROC AUC: {test_roc_auc*100:.4f}%", "\n")
print("Confusion Matrix:\n", test_conf_matrix)

Testing performance:

Accuracy: 99.1559% 

Precision: 99.0728% 

Recall: 87.6101% 

F1-Score: 92.9895% 

ROC AUC: 93.7771% 

Confusion Matrix:
 [[48215    27]
 [  408  2885]]


In [None]:
# 儲存模型
save = input("儲存模型?(y/n): ")

if save.lower() == 'y':
    joblib.dump(svm_model, file_path + 'kernel_svm_model.pkl')
    print("Model Saved!")
elif save.lower() == 'n':
    print("Done!")
else:
    print("Invalid input. Not Saved.")