In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
# Read the NF-UNSW-NB15-v3 CSV export into a DataFrame, then preview its first rows.
raw_csv_path = "data/NF-UNSW-NB15-v3.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,1424242193040,1424242193043,59.166.0.2,4894,149.171.126.3,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
1,1424242192744,1424242193079,59.166.0.4,52671,149.171.126.6,31992,6,11.0,4704,28,...,0,91,12,19,0,90,12,19,0,Benign
2,1424242190649,1424242193109,59.166.0.0,47290,149.171.126.9,6881,6,37.0,13662,238,...,0,1843,10,119,0,1843,5,88,0,Benign
3,1424242193145,1424242193146,59.166.0.8,43310,149.171.126.7,53,17,5.0,146,2,...,0,0,0,0,0,0,0,0,0,Benign
4,1424242193239,1424242193241,59.166.0.1,45870,149.171.126.1,53,17,5.0,130,2,...,0,0,0,0,0,0,0,0,0,Benign


In [3]:
# Columns selected into the feature matrix, listed one per line in selection order.
features = [
    # byte and packet counts
    "IN_BYTES",
    "OUT_BYTES",
    "IN_PKTS",
    "OUT_PKTS",
    # flow duration
    "FLOW_DURATION_MILLISECONDS",
    # packet-length extremes
    "LONGEST_FLOW_PKT",
    "SHORTEST_FLOW_PKT",
    "MIN_IP_PKT_LEN",
    "MAX_IP_PKT_LEN",
    # per-direction byte rates and throughput
    "SRC_TO_DST_SECOND_BYTES",
    "DST_TO_SRC_SECOND_BYTES",
    "SRC_TO_DST_AVG_THROUGHPUT",
    "DST_TO_SRC_AVG_THROUGHPUT",
    # retransmissions
    "RETRANSMITTED_IN_BYTES",
    "RETRANSMITTED_OUT_BYTES",
    "RETRANSMITTED_IN_PKTS",
    "RETRANSMITTED_OUT_PKTS",
    # inter-arrival times, source -> destination
    "SRC_TO_DST_IAT_MIN",
    "SRC_TO_DST_IAT_MAX",
    "SRC_TO_DST_IAT_AVG",
    "SRC_TO_DST_IAT_STDDEV",
    # inter-arrival times, destination -> source
    "DST_TO_SRC_IAT_MIN",
    "DST_TO_SRC_IAT_MAX",
    "DST_TO_SRC_IAT_AVG",
    "DST_TO_SRC_IAT_STDDEV",
]

# Feature matrix restricted to the columns listed above.
X = df[features]

# y is the binary label (0 = normal traffic, 1 = attack);
# attack_type keeps the attack-category column alongside it.
y, attack_type = df["Label"], df["Attack"]

In [4]:
# Show the dtype of every selected feature column.
X.dtypes

IN_BYTES                        int64
OUT_BYTES                       int64
IN_PKTS                         int64
OUT_PKTS                        int64
FLOW_DURATION_MILLISECONDS      int64
LONGEST_FLOW_PKT                int64
SHORTEST_FLOW_PKT               int64
MIN_IP_PKT_LEN                  int64
MAX_IP_PKT_LEN                  int64
SRC_TO_DST_SECOND_BYTES       float64
DST_TO_SRC_SECOND_BYTES       float64
SRC_TO_DST_AVG_THROUGHPUT       int64
DST_TO_SRC_AVG_THROUGHPUT       int64
RETRANSMITTED_IN_BYTES          int64
RETRANSMITTED_OUT_BYTES         int64
RETRANSMITTED_IN_PKTS           int64
RETRANSMITTED_OUT_PKTS          int64
SRC_TO_DST_IAT_MIN              int64
SRC_TO_DST_IAT_MAX              int64
SRC_TO_DST_IAT_AVG              int64
SRC_TO_DST_IAT_STDDEV           int64
DST_TO_SRC_IAT_MIN              int64
DST_TO_SRC_IAT_MAX              int64
DST_TO_SRC_IAT_AVG              int64
DST_TO_SRC_IAT_STDDEV           int64
dtype: object

In [5]:
# Per-column tally of missing (NaN) entries.
nan_counts = X.isna().sum()
print(nan_counts)

# Per-column tally of +inf / -inf entries.
inf_counts = np.isinf(X).sum()
print(inf_counts)


IN_BYTES                          0
OUT_BYTES                         0
IN_PKTS                           0
OUT_PKTS                          0
FLOW_DURATION_MILLISECONDS        0
LONGEST_FLOW_PKT                  0
SHORTEST_FLOW_PKT                 0
MIN_IP_PKT_LEN                    0
MAX_IP_PKT_LEN                    0
SRC_TO_DST_SECOND_BYTES       63425
DST_TO_SRC_SECOND_BYTES           0
SRC_TO_DST_AVG_THROUGHPUT         0
DST_TO_SRC_AVG_THROUGHPUT         0
RETRANSMITTED_IN_BYTES            0
RETRANSMITTED_OUT_BYTES           0
RETRANSMITTED_IN_PKTS             0
RETRANSMITTED_OUT_PKTS            0
SRC_TO_DST_IAT_MIN                0
SRC_TO_DST_IAT_MAX                0
SRC_TO_DST_IAT_AVG                0
SRC_TO_DST_IAT_STDDEV             0
DST_TO_SRC_IAT_MIN                0
DST_TO_SRC_IAT_MAX                0
DST_TO_SRC_IAT_AVG                0
DST_TO_SRC_IAT_STDDEV             0
dtype: int64
IN_BYTES                           0
OUT_BYTES                          0
IN_PKTS      

In [6]:
# Replace non-finite entries with 0: first +/-inf, then any remaining NaN.
infinite_values = [np.inf, -np.inf]
X = X.replace(to_replace=infinite_values, value=0).fillna(value=0)