In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.optim import Adam
from torch.utils.data import DataLoader

# Add the parent directory to the module search path so that the `fwc2`
# imports below can be resolved (presumably the package lives one level up
# from this notebook; verify against the repository layout).
sys.path.append("../")

from fwc2.model import FWC2
from fwc2.loss import NTXent
from fwc2.dataset import load_pretraining, Data, FWC2Dataset

# Seed value for random number generators.
# NOTE(review): confirm it is actually applied wherever randomness matters.
SEED = 42

In [12]:
# Load the dataset through the project's `load` helper and keep the result in `df`.
# NOTE(review): this import sits outside the notebook's main import cell;
# consider moving it there so all dependencies are visible in one place.
from fwc2.dataset import load

df = load()

['enp0s3-monday-pvt.pcap_Flow.csv', 'enp0s3-monday.pcap_Flow.csv', 'enp0s3-public-thursday.pcap_Flow.csv', 'enp0s3-public-tuesday.pcap_Flow.csv', 'enp0s3-public-wednesday.pcap_Flow.csv', 'enp0s3-pvt-thursday.pcap_Flow.csv', 'enp0s3-pvt-tuesday.pcap_Flow.csv', 'enp0s3-pvt-wednesday.pcap_Flow.csv', 'enp0s3-tcpdump-friday.pcap_Flow.csv', 'enp0s3-tcpdump-pvt-friday.pcap_Flow.csv'] 10
enp0s3-monday-pvt.pcap_Flow.csv shape of data = (3404, 85)
enp0s3-monday.pcap_Flow.csv shape of data = (8728, 85)
enp0s3-public-thursday.pcap_Flow.csv shape of data = (9685, 85)
enp0s3-public-tuesday.pcap_Flow.csv shape of data = (29242, 85)
enp0s3-public-wednesday.pcap_Flow.csv shape of data = (17487, 85)
enp0s3-pvt-thursday.pcap_Flow.csv shape of data = (4114, 85)
enp0s3-pvt-tuesday.pcap_Flow.csv shape of data = (2615, 85)
enp0s3-pvt-wednesday.pcap_Flow.csv shape of data = (1437, 85)
enp0s3-tcpdump-friday.pcap_Flow.csv shape of data = (7361, 85)
enp0s3-tcpdump-pvt-friday.pcap_Flow.csv shape of data = (2618, 

In [13]:
# Display the dimensions of the loaded data as a quick sanity check.
df.shape

(86691, 84)

# Pretraining
Here we load the pretraining data, apply simple preprocessing (normalization) to it, and then train the model.

In [6]:
# Load the 'cicids17' subset already split into train/test features and targets
# (train_ratio=0.7 is forwarded to the loader).
train_x, train_y, test_x, test_y = load_pretraining(subsets=['cicids17'], train_ratio=0.7)

scaler = StandardScaler()

# Fit the scaling statistics on the training split only, then reuse the SAME
# statistics for the test split. Re-fitting on the test data would scale the two
# splits differently and leak test-set statistics into preprocessing.
train_x = pd.DataFrame(scaler.fit_transform(train_x), columns=train_x.columns)
test_x = pd.DataFrame(scaler.transform(test_x), columns=test_x.columns)

# Wrap features and their targets; the training set is paired with `train_y`
# (not a second copy of the features), mirroring how the test set is built.
train_ds = FWC2Dataset(train_x.to_numpy(), train_y.to_numpy(), columns=train_x.columns)
test_ds = FWC2Dataset(test_x.to_numpy(), test_y.to_numpy(), columns=test_x.columns)

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv'] 8
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv shape of data = (225745, 85)
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv shape of data = (286467, 85)
Friday-WorkingHours-Morning.pcap_ISCX.csv shape of data = (191033, 85)
Monday-WorkingHours.pcap_ISCX.csv shape of data = (529918, 85)
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv shape of data = (288602, 85)


  df = pd.read_csv(file_path)


Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv shape of data = (458968, 85)
Tuesday-WorkingHours.pcap_ISCX.csv shape of data = (445909, 85)
Wednesday-workingHours.pcap_ISCX.csv shape of data = (692703, 85)
zero_var columns = ['bwd_psh_flags' 'bwd_urg_flags' 'fwd_avg_bytes/bulk'
 'fwd_avg_packets/bulk' 'fwd_avg_bulk_rate' 'bwd_avg_bytes/bulk'
 'bwd_avg_packets/bulk' 'bwd_avg_bulk_rate']


In [7]:
# Display the dimensions of the scaled train and test feature frames side by side.
train_x.shape, test_x.shape

((1590410, 68), (1237466, 68))

In [8]:
def train_epoch(model, criterion, train_loader, optimizer, device):
    """Run one optimisation pass over ``train_loader`` and return the scaled loss.

    Args:
        model: Module that, called on a batch, returns a pair
            ``(emb_anchor, emb_positive)``.
        criterion: Callable taking the two embeddings and returning a scalar
            loss tensor.
        train_loader: Iterable of input batches; must expose ``.dataset`` with
            a length.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The sum of the per-batch loss values divided by the number of samples
        in ``train_loader.dataset``.
        NOTE(review): if ``criterion`` already averages over the batch, this
        value scales with the batch size -- confirm that is intended.
    """
    model.train()
    epoch_loss = 0.0

    for x in train_loader:
        x = x.to(device)

        # Clear gradients *before* the backward pass so that gradients already
        # stored on the parameters when this function is entered (e.g. from an
        # interrupted previous run) are never folded into the first update.
        optimizer.zero_grad()

        emb_anchor, emb_positive = model(x)
        loss = criterion(emb_anchor, emb_positive)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(train_loader.dataset)

In [9]:
# --- Pretraining hyperparameters -------------------------------------------
batch_size = 2048
epochs = 100
corruption_rate = 0.4  # forwarded to FWC2
tau = 1.0              # forwarded to NTXent
device = 'cpu'

# Seed the global RNGs so that shuffling, weight initialisation and any other
# random draws made during training are repeatable when this cell is re-run.
np.random.seed(SEED)
torch.manual_seed(SEED)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

model = FWC2(
    input_dim=train_ds.shape[1],
    features_low=train_ds.features_low,
    features_high=train_ds.features_high,
    dims_hidden_encoder=[128, 64, 32, 16],
    dims_hidden_head=[8, 8],
    corruption_rate=corruption_rate,
    dropout=0.1,
).to(device)

optimizer = Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
ntxent_loss = NTXent(tau)

# One entry per completed epoch, as returned by train_epoch.
loss_history = []

for epoch in range(1, epochs + 1):
    epoch_loss = train_epoch(model, ntxent_loss, train_loader, optimizer, device)
    loss_history.append(epoch_loss)

    print(f"epoch {epoch}/{epochs} - loss: {loss_history[-1]:.4f}")


# Plot the loss curve and store it as an SVG whose file name records the
# corruption rate and tau used for this run.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(loss_history)
ax.set_title("Pretraining loss")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")

fig.savefig(f'pretraining-loss--whp(cp={corruption_rate},tau={tau}).svg', format='svg')

epoch 1/100 - loss: 0.0041
epoch 2/100 - loss: 0.0041
epoch 3/100 - loss: 0.0041
epoch 4/100 - loss: 0.0040


KeyboardInterrupt: 