# `Библиотеки`

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import (
    classification_report,
    roc_auc_score
)

import matplotlib.pyplot as plt

# `Данные: загрузка и препроцессинг`

In [6]:
df = pd.read_csv("network_traffic.csv")

df

Unnamed: 0,time,source_ip_int,destination_ip_int,source_port,destination_port,protocol,duration,packet_count,bytes_sent,bytes_received,label,bytes_per_packet
0,2025-04-07 03:25:53,3232281727,167792955,32237,995,0,2.910802,74,9200,4879,0,124.324324
1,2025-04-07 08:38:03,3232236596,167774143,15995,995,0,4.661168,33,4015,1848,0,121.666667
2,2025-04-07 04:37:03,3232276946,167832337,65426,80,0,1.802558,23,2572,3190,0,111.826087
3,2025-04-07 01:30:53,3232270434,167796473,16433,993,0,4.126773,92,2993,3000,0,32.532609
4,2025-04-06 20:58:03,3232267105,167776023,27110,143,2,1.949097,43,4257,6826,0,99.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
10000,2025-04-06 20:40:18,3232250619,167783555,59711,64660,5,9.643456,78,17193,11333,1,220.423077
10001,2025-04-07 05:20:03,3232295215,167799884,26804,995,1,1.425016,91,240,981,0,2.637363
10002,2025-04-07 05:24:53,3232292523,167822026,63303,23,1,2.121701,43,4877,7670,0,113.418605
10003,2025-04-07 17:11:53,3232252147,167774716,31382,993,1,1.787258,28,6632,366,0,236.857143


In [7]:
print("Пропущенные значения:")
display(df.isna().sum().sort_values(ascending=False).head(20))

Пропущенные значения:


Unnamed: 0,0
time,0
source_ip_int,0
destination_ip_int,0
source_port,0
destination_port,0
protocol,0
duration,0
packet_count,0
bytes_sent,0
bytes_received,0


In [8]:
print("Распределение количества обычного и аномального трафика:")
df["label"].value_counts(normalize=True) * 100

Распределение количества обычного и аномального трафика:


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
0,79.96002
1,20.03998


In [9]:
print("Дубликаты строк:", df.duplicated().sum())

Дубликаты строк: 0


In [10]:
df = df.copy()

df["time"] = pd.to_datetime(df["time"], errors="coerce")

df["hour"] = df["time"].dt.hour.astype("Int64")
df["dayofweek"] = df["time"].dt.dayofweek.astype("Int64")
df["time_unix"] = (df["time"].astype("int64") // 10**9).astype("Int64")

if "bytes_per_packet" not in df.columns:
    df["bytes_per_packet"] = df["bytes_sent"] / df["packet_count"].replace(0, np.nan)

df["bytes_total"] = df["bytes_sent"] + df["bytes_received"]
df["bytes_sent_ratio"] = df["bytes_sent"] / df["bytes_total"].replace(0, np.nan)

df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=["label"])

df

Unnamed: 0,time,source_ip_int,destination_ip_int,source_port,destination_port,protocol,duration,packet_count,bytes_sent,bytes_received,label,bytes_per_packet,hour,dayofweek,time_unix,bytes_total,bytes_sent_ratio
0,2025-04-07 03:25:53,3232281727,167792955,32237,995,0,2.910802,74,9200,4879,0,124.324324,3,0,1743996353,14079,0.653456
1,2025-04-07 08:38:03,3232236596,167774143,15995,995,0,4.661168,33,4015,1848,0,121.666667,8,0,1744015083,5863,0.684803
2,2025-04-07 04:37:03,3232276946,167832337,65426,80,0,1.802558,23,2572,3190,0,111.826087,4,0,1744000623,5762,0.446373
3,2025-04-07 01:30:53,3232270434,167796473,16433,993,0,4.126773,92,2993,3000,0,32.532609,1,0,1743989453,5993,0.499416
4,2025-04-06 20:58:03,3232267105,167776023,27110,143,2,1.949097,43,4257,6826,0,99.000000,20,6,1743973083,11083,0.384102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10000,2025-04-06 20:40:18,3232250619,167783555,59711,64660,5,9.643456,78,17193,11333,1,220.423077,20,6,1743972018,28526,0.602713
10001,2025-04-07 05:20:03,3232295215,167799884,26804,995,1,1.425016,91,240,981,0,2.637363,5,0,1744003203,1221,0.196560
10002,2025-04-07 05:24:53,3232292523,167822026,63303,23,1,2.121701,43,4877,7670,0,113.418605,5,0,1744003493,12547,0.388698
10003,2025-04-07 17:11:53,3232252147,167774716,31382,993,1,1.787258,28,6632,366,0,236.857143,17,0,1744045913,6998,0.947699


## `Выбор фичей`

In [16]:
target = "label"

drop_cols = ["time", target]
drop_cols = [c for c in drop_cols if c in df.columns]

X = df.drop(columns=drop_cols).copy()
y = df[target].astype(int).values

X = X.apply(pd.to_numeric, errors="coerce")
X = X.replace([np.inf, -np.inf], np.nan).fillna(0.0)

for c in ["source_ip_int", "destination_ip_int"]:
    if c in X.columns:
        X = X.drop(columns=[c])

X_values = X.values.astype(np.float32)

# `Разделение на train/test и обучение`

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X_values, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [21]:
class NpDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y.astype(np.float32)).view(-1, 1)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = NpDataset(X_train, y_train)
test_ds  = NpDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True, num_workers=0)
test_loader  = DataLoader(test_ds, batch_size=512, shuffle=False, num_workers=0)

In [48]:
class MLP(nn.Module):
    def __init__(self, n_in):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_in, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)

In [98]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(n_in=X_train.shape[1]).to(device)

model

MLP(
  (net): Sequential(
    (0): Linear(in_features=13, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.1, inplace=False)
    (8): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [99]:
# neg = (y_train == 0).sum()
# pos = (y_train == 1).sum()
# pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32, device=device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

In [100]:
def evaluate(model, loader):
    model.eval()
    all_logits, all_y = [], []

    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            all_logits.append(logits.cpu().numpy())
            all_y.append(yb.cpu().numpy())

    logits = np.vstack(all_logits).reshape(-1)
    y_true = np.vstack(all_y).reshape(-1).astype(int)
    proba = 1 / (1 + np.exp(-logits))

    return y_true, proba

In [101]:
best_state = None
history = []

for epoch in range(1, 42):
    model.train()
    losses = []

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

    y_true, proba = evaluate(model, test_loader)
    roc = roc_auc_score(y_true, proba)
    history.append((epoch, float(np.mean(losses)), float(roc)))

    if epoch % 5 == 0 or epoch == 1:
        print(f"epoch={epoch:02d} loss={np.mean(losses):.4f} roc={roc:.4f}")

  proba = 1 / (1 + np.exp(-logits))


epoch=01 loss=0.6828 roc=0.5000
epoch=05 loss=0.6071 roc=0.9341
epoch=10 loss=0.5381 roc=0.8720
epoch=15 loss=0.4922 roc=0.7999
epoch=20 loss=0.4572 roc=0.9302
epoch=25 loss=0.4328 roc=0.9070
epoch=30 loss=0.4077 roc=0.8939
epoch=35 loss=0.3875 roc=0.9420
epoch=40 loss=0.3687 roc=0.9424


# `Качество классификации`

In [102]:
y_true, proba = evaluate(model, test_loader)
pred = (proba >= 0.5).astype(int)

print("\nClassification report:")
print(classification_report(y_true, pred, digits=4))

roc = roc_auc_score(y_true, proba)
print(f"ROC-AUC: {roc:.4f}")


Classification report:
              precision    recall  f1-score   support

           0     0.8753    1.0000    0.9335      1600
           1     1.0000    0.4314    0.6028       401

    accuracy                         0.8861      2001
   macro avg     0.9376    0.7157    0.7681      2001
weighted avg     0.9003    0.8861    0.8672      2001

ROC-AUC: 0.9198
