# `Библиотеки`

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score
)

import matplotlib.pyplot as plt

# `Данные: загрузка и препроцессинг`

In [5]:
# Load the network-traffic CSV into a DataFrame and preview it as the cell output.
df = pd.read_csv(filepath_or_buffer="network_traffic.csv")

df

Unnamed: 0,time,source_ip_int,destination_ip_int,source_port,destination_port,protocol,duration,packet_count,bytes_sent,bytes_received,label,bytes_per_packet
0,2025-04-07 03:25:53,3232281727,167792955,32237,995,0,2.910802,74,9200,4879,0,124.324324
1,2025-04-07 08:38:03,3232236596,167774143,15995,995,0,4.661168,33,4015,1848,0,121.666667
2,2025-04-07 04:37:03,3232276946,167832337,65426,80,0,1.802558,23,2572,3190,0,111.826087
3,2025-04-07 01:30:53,3232270434,167796473,16433,993,0,4.126773,92,2993,3000,0,32.532609
4,2025-04-06 20:58:03,3232267105,167776023,27110,143,2,1.949097,43,4257,6826,0,99.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
10000,2025-04-06 20:40:18,3232250619,167783555,59711,64660,5,9.643456,78,17193,11333,1,220.423077
10001,2025-04-07 05:20:03,3232295215,167799884,26804,995,1,1.425016,91,240,981,0,2.637363
10002,2025-04-07 05:24:53,3232292523,167822026,63303,23,1,2.121701,43,4877,7670,0,113.418605
10003,2025-04-07 17:11:53,3232252147,167774716,31382,993,1,1.787258,28,6632,366,0,236.857143


In [7]:
# Count missing values per column and show the 20 columns with the most of them.
missing_per_column = df.isna().sum()
missing_sorted = missing_per_column.sort_values(ascending=False)

print("Пропущенные значения:")
display(missing_sorted.head(20))

Пропущенные значения:


Unnamed: 0,0
time,0
source_ip_int,0
destination_ip_int,0
source_port,0
destination_port,0
protocol,0
duration,0
packet_count,0
bytes_sent,0
bytes_received,0


In [13]:
# Share of each label value, expressed in percent.
print("Распределение количества обычного и аномального трафика:")
df["label"].value_counts(normalize=True).mul(100)

Распределение количества обычного и аномального трафика:


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
0,79.96002
1,20.03998


In [14]:
# Number of rows that are exact duplicates of an earlier row.
n_duplicate_rows = df.duplicated().sum()
print("Дубликаты строк:", n_duplicate_rows)

Дубликаты строк: 0


In [16]:
# Derive time-based and traffic-volume features; work on a copy of the frame.
df = df.copy()

# Unparseable timestamps become NaT instead of raising.
df["time"] = pd.to_datetime(df["time"], errors="coerce")

# Nullable Int64 lets rows with a NaT timestamp carry <NA> in the derived columns.
df["hour"] = df["time"].dt.hour.astype("Int64")
df["dayofweek"] = df["time"].dt.dayofweek.astype("Int64")

# Seconds since the Unix epoch, computed with timedelta arithmetic instead of a raw
# int64 cast: the cast does not map NaT to a missing value and its result depends on
# the column's datetime resolution, whereas this form yields <NA> for NaT and is
# always expressed in whole seconds.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
df["time_unix"] = ((df["time"] - unix_epoch) // one_second).astype("Int64")

# Only derive bytes_per_packet when the dataset does not already ship it;
# zero packet counts are mapped to NaN to avoid division by zero.
if "bytes_per_packet" not in df.columns:
    df["bytes_per_packet"] = df["bytes_sent"] / df["packet_count"].replace(0, np.nan)

# Total transferred volume and the outgoing share of it (NaN when nothing was transferred).
df["bytes_total"] = df["bytes_sent"] + df["bytes_received"]
df["bytes_sent_ratio"] = df["bytes_sent"] / df["bytes_total"].replace(0, np.nan)

# Normalize infinities to NaN and discard rows without a target value.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=["label"])

df

Unnamed: 0,time,source_ip_int,destination_ip_int,source_port,destination_port,protocol,duration,packet_count,bytes_sent,bytes_received,label,bytes_per_packet,hour,dayofweek,time_unix,bytes_total,bytes_sent_ratio
0,2025-04-07 03:25:53,3232281727,167792955,32237,995,0,2.910802,74,9200,4879,0,124.324324,3,0,1743996353,14079,0.653456
1,2025-04-07 08:38:03,3232236596,167774143,15995,995,0,4.661168,33,4015,1848,0,121.666667,8,0,1744015083,5863,0.684803
2,2025-04-07 04:37:03,3232276946,167832337,65426,80,0,1.802558,23,2572,3190,0,111.826087,4,0,1744000623,5762,0.446373
3,2025-04-07 01:30:53,3232270434,167796473,16433,993,0,4.126773,92,2993,3000,0,32.532609,1,0,1743989453,5993,0.499416
4,2025-04-06 20:58:03,3232267105,167776023,27110,143,2,1.949097,43,4257,6826,0,99.000000,20,6,1743973083,11083,0.384102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10000,2025-04-06 20:40:18,3232250619,167783555,59711,64660,5,9.643456,78,17193,11333,1,220.423077,20,6,1743972018,28526,0.602713
10001,2025-04-07 05:20:03,3232295215,167799884,26804,995,1,1.425016,91,240,981,0,2.637363,5,0,1744003203,1221,0.196560
10002,2025-04-07 05:24:53,3232292523,167822026,63303,23,1,2.121701,43,4877,7670,0,113.418605,5,0,1744003493,12547,0.388698
10003,2025-04-07 17:11:53,3232252147,167774716,31382,993,1,1.787258,28,6632,366,0,236.857143,17,0,1744045913,6998,0.947699


## `Выбор фичей`

In [18]:
target = "label"

# Columns excluded from the feature matrix: the raw timestamp, the target itself,
# and the IP address columns (removed so the model generalizes better).
ip_cols = ["source_ip_int", "destination_ip_int"]
drop_cols = ["time", target] + [col for col in ip_cols if col in df.columns]

# NOTE(review): the absolute timestamp feature `time_unix` stays in X; like the IPs,
# it may not generalize beyond the capture window — consider whether it should be dropped.
present_drop_cols = [col for col in drop_cols if col in df.columns]
X = df.drop(columns=present_drop_cols)
y = df[target].astype(int)

X

Unnamed: 0,source_port,destination_port,protocol,duration,packet_count,bytes_sent,bytes_received,bytes_per_packet,hour,dayofweek,time_unix,bytes_total,bytes_sent_ratio
0,32237,995,0,2.910802,74,9200,4879,124.324324,3,0,1743996353,14079,0.653456
1,15995,995,0,4.661168,33,4015,1848,121.666667,8,0,1744015083,5863,0.684803
2,65426,80,0,1.802558,23,2572,3190,111.826087,4,0,1744000623,5762,0.446373
3,16433,993,0,4.126773,92,2993,3000,32.532609,1,0,1743989453,5993,0.499416
4,27110,143,2,1.949097,43,4257,6826,99.000000,20,6,1743973083,11083,0.384102
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10000,59711,64660,5,9.643456,78,17193,11333,220.423077,20,6,1743972018,28526,0.602713
10001,26804,995,1,1.425016,91,240,981,2.637363,5,0,1744003203,1221,0.196560
10002,63303,23,1,2.121701,43,4877,7670,113.418605,5,0,1744003493,12547,0.388698
10003,31382,993,1,1.787258,28,6632,366,236.857143,17,0,1744045913,6998,0.947699


# `Разделение на train/test и обучение`

In [19]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [20]:
# Random forest configuration (the model is fitted in a separate cell).
rf = RandomForestClassifier(
    n_estimators=500,         # number of trees in the ensemble
    max_depth=None,           # no explicit depth limit
    min_samples_leaf=2,       # every leaf must hold at least two samples
    class_weight="balanced",  # compensate for the imbalance between label values
    random_state=42,          # reproducible bootstrapping / feature sampling
    n_jobs=-1,                # use all available CPU cores
)

In [21]:
# Train the forest on the training partition; the fitted estimator is the cell output.
rf.fit(X=X_train, y=y_train)

# `Качество классификации`

In [32]:
# Second column of predict_proba is taken as the score for label 1;
# scores are turned into hard predictions with a 0.5 threshold.
proba = rf.predict_proba(X_test)[:, 1]
above_threshold = proba >= 0.5
pred = above_threshold.astype(int)

# ROC-AUC is computed from the raw scores, the report from the thresholded predictions.
roc = roc_auc_score(y_test, proba)

print("Результаты классификации:")
print(classification_report(y_test, pred, digits=4))

print(f"ROC-AUC: {roc:.4f}")

Результаты классификации:
              precision    recall  f1-score   support

           0     1.0000    0.9988    0.9994      1600
           1     0.9950    1.0000    0.9975       401

    accuracy                         0.9990      2001
   macro avg     0.9975    0.9994    0.9984      2001
weighted avg     0.9990    0.9990    0.9990      2001

ROC-AUC: 1.0000
