In [56]:
import pathlib
import time

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [69]:
PATH_TO_ROOT = pathlib.Path("C://Users/Dzagcoffee/Documents/repos/ОПИСиС/")
PATH_TO_DATA = PATH_TO_ROOT / "data"

In [71]:
data = pd.read_csv(PATH_TO_DATA / "UNSW_NB15_training-set.csv")

In [None]:
data

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.000011,udp,-,INT,2,0,496,0,90909.090200,...,1,2,0,0,0,1,2,0,Normal,0
1,2,0.000008,udp,-,INT,2,0,1762,0,125000.000300,...,1,2,0,0,0,1,2,0,Normal,0
2,3,0.000005,udp,-,INT,2,0,1068,0,200000.005100,...,1,3,0,0,0,1,3,0,Normal,0
3,4,0.000006,udp,-,INT,2,0,900,0,166666.660800,...,1,3,0,0,0,2,3,0,Normal,0
4,5,0.000010,udp,-,INT,2,0,2126,0,100000.002500,...,1,3,0,0,0,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,82328,0.000005,udp,-,INT,2,0,104,0,200000.005100,...,1,2,0,0,0,2,1,0,Normal,0
82328,82329,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,...,1,1,0,0,0,3,2,0,Normal,0
82329,82330,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0
82330,82331,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0


In [33]:
# Убираем записи, протоколы в которых во всем наборе
# данных встречались менее, чем 100 раз
print("value_counts of each protocol:")
count_per_protocol_type = data['proto'].value_counts()
print("before:")
print(data['proto'].value_counts(), end="\n\n")

protocols_to_remove = count_per_protocol_type[count_per_protocol_type < 100].index

data = data[~data['proto'].isin(protocols_to_remove)]

print("after:")
print(data['proto'].value_counts())

value_counts of each protocol:
before:
tcp           43095
udp           29418
unas           3515
arp             987
ospf            676
              ...  
br-sat-mon       32
pvp              32
wsn              32
ib               31
igmp             30
Name: proto, Length: 131, dtype: int64

after:
tcp     43095
udp     29418
unas     3515
arp       987
ospf      676
sctp      324
Name: proto, dtype: int64


In [34]:
data = pd.get_dummies(data, columns=['state', 'service'])
data['attack_cat'] = data['attack_cat'].apply(
    lambda x: 0 if x == 'Normal' else 1
)

In [35]:
# Разделение на признаки и целевую переменную
features = data.drop('proto', axis=1)
target = data['proto']

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.6, random_state=42
)

# Масштабирование признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Кодирование меток
labeler = LabelEncoder()
y_train_encoded = labeler.fit_transform(y_train)
y_test_encoded = labeler.transform(y_test)

print("Обучающая выборка: {} записей".format(len(X_train)))
print(y_train.value_counts())

print("Проверочная выборка: {} записей".format(len(X_test)))
print(y_test.value_counts())

Обучающая выборка: 31206 записей
tcp     17157
udp     11847
unas     1420
arp       390
ospf      276
sctp      116
Name: proto, dtype: int64
Проверочная выборка: 46809 записей
tcp     25938
udp     17571
unas     2095
arp       597
ospf      400
sctp      208
Name: proto, dtype: int64


In [37]:
unencoded_labels = "; ".join([
    f"{i}: {labeler.inverse_transform([i])[0]}"
    for i in range(5)
])

print(unencoded_labels)

0: arp; 1: ospf; 2: sctp; 3: tcp; 4: udp


In [60]:
model_results = []

In [61]:
model_name = "Метод опорных векторов (Scikit-learn)"

model = SVC()

start_time = time.time()

model.fit(X_train_scaled, y_train_encoded)

end_time = time.time()

y_pred = model.predict(X_test_scaled)

print(f"Модель: {model_name}")
print(f"Время обучения: {round(end_time - start_time, 2)}")
print(classification_report(y_test_encoded, y_pred))
print(f"где {unencoded_labels}")

model_results.append({
    "model_name": model_name,
    "model_train_time": round(end_time - start_time, 2),
    "model acc, f1": [
        accuracy_score(y_test_encoded, y_pred),
        f1_score(y_test_encoded, y_pred, average="macro")
    ]
})

Модель: Метод опорных векторов (Scikit-learn)
Время обучения: 12.88
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       597
           1       0.97      0.95      0.96       400
           2       0.85      0.24      0.37       208
           3       1.00      1.00      1.00     25938
           4       0.99      0.99      0.99     17571
           5       0.93      0.96      0.94      2095

    accuracy                           0.99     46809
   macro avg       0.96      0.85      0.88     46809
weighted avg       0.99      0.99      0.99     46809

где 0: arp; 1: ospf; 2: sctp; 3: tcp; 4: udp


In [62]:
model_name = "Деревья принятия решений (Scikit-learn.DecisionTreeClassifier)"

model = DecisionTreeClassifier()

start_time = time.time()

model.fit(X_train_scaled, y_train_encoded)

end_time = time.time()

y_pred = model.predict(X_test_scaled)

print(f"Модель: {model_name}")
print(f"Время обучения: {round(end_time - start_time, 2)}")
print(classification_report(y_test_encoded, y_pred))
print(f"где {unencoded_labels}")

model_results.append({
    "model_name": model_name,
    "model_train_time": round(end_time - start_time, 2),
    "model acc, f1": [
        accuracy_score(y_test_encoded, y_pred),
        f1_score(y_test_encoded, y_pred, average="macro")
    ]
})

Модель: Деревья принятия решений (Scikit-learn.DecisionTreeClassifier)
Время обучения: 0.18
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       597
           1       1.00      1.00      1.00       400
           2       0.94      0.95      0.95       208
           3       1.00      1.00      1.00     25938
           4       1.00      1.00      1.00     17571
           5       1.00      1.00      1.00      2095

    accuracy                           1.00     46809
   macro avg       0.99      0.99      0.99     46809
weighted avg       1.00      1.00      1.00     46809

где 0: arp; 1: ospf; 2: sctp; 3: tcp; 4: udp


In [63]:
model_name = "Бэггинг (Scikit-learn.RandomForestClassifier)"

model = BaggingClassifier(
    base_estimator=RandomForestClassifier(),
    random_state=42,
    n_estimators=10
)

start_time = time.time()

model.fit(X_train_scaled, y_train_encoded)

end_time = time.time()

y_pred = model.predict(X_test_scaled)

print(f"Модель: {model_name}")
print(f"Время обучения: {round(end_time - start_time, 2)}")
print(classification_report(y_test_encoded, y_pred))
print(f"где {unencoded_labels}")

model_results.append({
    "model_name": model_name,
    "model_train_time": round(end_time - start_time, 2),
    "model acc, f1": [
        accuracy_score(y_test_encoded, y_pred),
        f1_score(y_test_encoded, y_pred, average="macro")
    ]
})

Модель: Бэггинг (Scikit-learn.RandomForestClassifier)
Время обучения: 24.78
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       597
           1       0.99      1.00      1.00       400
           2       1.00      0.92      0.96       208
           3       1.00      1.00      1.00     25938
           4       1.00      1.00      1.00     17571
           5       1.00      1.00      1.00      2095

    accuracy                           1.00     46809
   macro avg       1.00      0.99      0.99     46809
weighted avg       1.00      1.00      1.00     46809

где 0: arp; 1: ospf; 2: sctp; 3: tcp; 4: udp


In [64]:
model_name = "Бустинг (CatBoost)"

model = CatBoostClassifier(verbose=40)

start_time = time.time()

model.fit(X_train_scaled, y_train_encoded)

end_time = time.time()

y_pred = model.predict(X_test_scaled)

print(f"Модель: {model_name}")
print(f"Время обучения: {round(end_time - start_time, 2)}")
print(classification_report(y_test_encoded, y_pred))
print(f"где {unencoded_labels}")

model_results.append({
    "model_name": model_name,
    "model_train_time": round(end_time - start_time, 2),
    "model acc, f1": [
        accuracy_score(y_test_encoded, y_pred),
        f1_score(y_test_encoded, y_pred, average="macro")
    ]
})

Learning rate set to 0.094304
0:	learn: 1.3778024	total: 60.6ms	remaining: 1m
40:	learn: 0.0345216	total: 2.54s	remaining: 59.4s
80:	learn: 0.0120710	total: 4.7s	remaining: 53.3s
120:	learn: 0.0081908	total: 6.79s	remaining: 49.3s
160:	learn: 0.0060805	total: 9.17s	remaining: 47.8s
200:	learn: 0.0050440	total: 11.6s	remaining: 46.3s
240:	learn: 0.0042661	total: 14.1s	remaining: 44.4s
280:	learn: 0.0035951	total: 16.2s	remaining: 41.5s
320:	learn: 0.0030447	total: 18.3s	remaining: 38.6s
360:	learn: 0.0026618	total: 20.9s	remaining: 37.1s
400:	learn: 0.0023428	total: 24.2s	remaining: 36.1s
440:	learn: 0.0021271	total: 26.8s	remaining: 33.9s
480:	learn: 0.0019400	total: 29.2s	remaining: 31.6s
520:	learn: 0.0017970	total: 31.3s	remaining: 28.8s
560:	learn: 0.0016901	total: 33.6s	remaining: 26.3s
600:	learn: 0.0015937	total: 35.7s	remaining: 23.7s
640:	learn: 0.0015056	total: 38s	remaining: 21.3s
680:	learn: 0.0014298	total: 40.2s	remaining: 18.9s
720:	learn: 0.0013737	total: 42.4s	remainin

In [65]:
model_name = "PyTorch NN (MLP)"

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)


class Net(nn.Module):
    def __init__(self, input_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


input_size = X_train_scaled.shape[1]
num_classes = len(labeler.classes_)

model = Net(input_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

num_epochs = 1
batch_size = 32
total_samples = X_train_tensor.shape[0]
total_batches = total_samples // batch_size

start_time = time.time()
for epoch in range(num_epochs):
    for i in range(total_batches):
        start = i * batch_size
        end = start + batch_size
        batch_X = X_train_tensor[start:end]
        batch_y = y_train_tensor[start:end]

        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

end_time = time.time()

with torch.no_grad():
    model.eval()
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs.data, 1)

print(f"Модель: {model_name}")
print(f"Время обучения: {round(end_time - start_time, 2)}")
print(classification_report(y_test_encoded, predicted))
print(f"где {unencoded_labels}")

model_results.append({
    "model_name": model_name,
    "model_train_time": round(end_time - start_time, 2),
    "model acc, f1": [
        accuracy_score(y_test_encoded, predicted),
        f1_score(y_test_encoded, predicted, average="macro")
    ]
})

Модель: PyTorch NN (MLP)
Время обучения: 2.28
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       597
           1       0.91      0.84      0.87       400
           2       0.00      0.00      0.00       208
           3       1.00      1.00      1.00     25938
           4       0.98      0.98      0.98     17571
           5       0.87      0.96      0.91      2095

    accuracy                           0.99     46809
   macro avg       0.79      0.79      0.79     46809
weighted avg       0.98      0.99      0.98     46809

где 0: arp; 1: ospf; 2: sctp; 3: tcp; 4: udp


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
res_df = pd.DataFrame({
    "model_name": [
        model_result["model_name"]
        for model_result in model_results
    ],
    "model_train_time, s": [
        model_result["model_train_time"]
        for model_result in model_results
    ],
    "model accuracy": [
        model_result["model acc, f1"][0]
        for model_result in model_results
    ],
    "model f1 (macro)": [
        model_result["model acc, f1"][0]
        for model_result in model_results
    ]
})

res_df

Unnamed: 0,model_name,"model_train_time, s",model accuracy,model f1 (macro)
0,Метод опорных векторов (Scikit-learn),12.88,0.990985,0.990985
1,Деревья принятия решений (Scikit-learn.Decisio...,0.18,0.999338,0.999338
2,Бэггинг (Scikit-learn.RandomForestClassifier),24.78,0.999551,0.999551
3,Бустинг (CatBoost),58.13,0.999445,0.999445
4,PyTorch NN (MLP),2.28,0.986327,0.986327
