In [1]:
import warnings
import sys
# Extend the module search path with the directory four levels up.
# NOTE(review): presumably this is what lets `from models import *` below
# resolve the project-local `models` package — confirm.
sys.path.append('../../../../')
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import os
# NOTE(review): presumably reduces TensorFlow's native log verbosity; it is set
# before the remaining imports run — confirm it is still required.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Star import: later cells use `ExpertPretrainDataModule`, `ExpertModel` and
# `pl` without any explicit import, so those names presumably come from here.
from models import *
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from pytorch_lightning.callbacks import LearningRateMonitor
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the parquet file at `data_path` into `df` and preview its first five rows.
data_path = '../../../../data/CIC_2018/no_preprocess/df_equal_DDoS attacks-LOIC-HTTP.parquet'
df = pd.read_parquet(path=data_path)
df.head(n=5)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,,,45498,,22,6,1519115647,888751,11,11,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,,,0,,0,0,1519115602,112642816,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,7.071068,56300000.0,56300000.0,Benign
2,,,0,,0,0,1519115771,112642712,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,18.384777,56300000.0,56300000.0,Benign
3,,,0,,0,0,1519115940,112642648,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,5.656854,56300000.0,56300000.0,Benign
4,,,0,,0,0,1519116109,112642702,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,65.053825,56300000.0,56300000.0,Benign


In [3]:
# Discard the flow identifier and the two IP-address columns.
df = df.drop(columns=['Flow ID', 'Src IP', 'Dst IP'])

In [4]:
# Number of rows whose 'Label' value is missing.
df['Label'].isnull().sum()

0

In [5]:
import numpy as np

# Both signed infinities are treated as invalid measurements.
_INFINITIES = [np.inf, -np.inf]


def _count_infinite(frame):
    """Return the total number of +inf / -inf cells in ``frame``."""
    return frame.isin(_INFINITIES).sum().sum()


infinite_values = _count_infinite(df)
print(f"Number of infinite values in the dataframe: {infinite_values}")

# Turn infinities into NaN, then drop every row that contains a NaN.
df.replace(_INFINITIES, np.nan, inplace=True)
df.dropna(inplace=True)

# Re-count to confirm the cleanup.
infinite_values = _count_infinite(df)
print(f"Number of infinite values in the dataframe: {infinite_values}")


Number of infinite values in the dataframe: 82139
Number of infinite values in the dataframe: 0


In [6]:
# Rows per distinct 'Label' value.
label_counts = df['Label'].value_counts()
label_counts

Label
Benign                    7313104
DDoS attacks-LOIC-HTTP     576191
Name: count, dtype: int64

In [7]:
# Keep only rows whose label is not the literal string 'Label'
# (presumably stray header lines that were read as data — confirm).
# A boolean mask is used instead of dropping by index label: dropping by label
# removes every row that shares an index label with a matching row, which
# over-deletes when the index is not unique.
df = df.loc[df['Label'] != 'Label']

In [8]:
# Remove every row that still contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [9]:
# Fixed seed so the stratified split can be reproduced after a kernel restart.
SPLIT_SEED = 42
# Integer target encoding for the two labels present in this dataset.
LABEL_TO_ID = {'Benign': 0, 'DDoS attacks-LOIC-HTTP': 1}

X = df.drop(columns='Label')
y = df['Label'].map(LABEL_TO_ID)

# Series.map yields NaN for any label missing from LABEL_TO_ID; fail loudly
# rather than handing NaN targets to the split and the model.
if y.isna().any():
    unexpected = df.loc[y.isna(), 'Label'].unique().tolist()
    raise ValueError(f"Unmapped values in 'Label' column: {unexpected}")

# 80/20 split that keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SPLIT_SEED,
)

In [10]:
import gc

# Later cells only use the split frames, so release the full frame and the
# unsplit features/labels before asking the collector to run.
del df
del X
del y
gc.collect()

80

In [11]:
# Re-attach the target column to each feature split (column-wise concat).
df_train, df_test = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [12]:
# Total number of missing cells across all columns of the training frame.
train_missing = df_train.isna().sum().sum()
train_missing

0

In [13]:
# Total number of missing cells across all columns of the test frame.
test_missing = df_test.isna().sum().sum()
test_missing

0

In [14]:
# Rows per encoded label in the training split.
train_label_counts = df_train['Label'].value_counts()
train_label_counts

Label
0    5850483
1     460953
Name: count, dtype: int64

In [15]:
# Print the column/dtype summary of the training split, then the test split.
for split_frame in (df_train, df_test):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6311436 entries, 3555049 to 4557639
Data columns (total 81 columns):
 #   Column             Dtype   
---  ------             -----   
 0   Src Port           int32   
 1   Dst Port           int32   
 2   Protocol           int8    
 3   Timestamp          int32   
 4   Flow Duration      int32   
 5   Tot Fwd Pkts       int32   
 6   Tot Bwd Pkts       int16   
 7   TotLen Fwd Pkts    float32 
 8   TotLen Bwd Pkts    float32 
 9   Fwd Pkt Len Max    float16 
 10  Fwd Pkt Len Min    float16 
 11  Fwd Pkt Len Mean   float16 
 12  Fwd Pkt Len Std    float16 
 13  Bwd Pkt Len Max    float16 
 14  Bwd Pkt Len Min    float16 
 15  Bwd Pkt Len Mean   float16 
 16  Bwd Pkt Len Std    float16 
 17  Flow Byts/s        float64 
 18  Flow Pkts/s        float64 
 19  Flow IAT Mean      float32 
 20  Flow IAT Std       float32 
 21  Flow IAT Max       float32 
 22  Flow IAT Min       float32 
 23  Fwd IAT Tot        float32 
 24  Fwd IAT Mean       floa

In [16]:
# Rows per encoded label in the test split.
test_label_counts = df_test['Label'].value_counts()
test_label_counts

Label
0    1462621
1     115238
Name: count, dtype: int64

In [17]:
# Hand both splits to the project's datamodule.
# NOTE(review): `binarize_on_label=1` presumably selects label 1 as the
# positive class — confirm against ExpertPretrainDataModule.
datamodule = ExpertPretrainDataModule(
    df_train,
    df_test,
    binarize_on_label=1,
)

In [18]:
# Width of the input layer and of the two hidden layers; the printed module
# shows 79 -> 256 -> 128 -> 1.
# NOTE(review): the split frames report 81 columns including 'Label' (80
# non-label columns) while the model takes 79 inputs — presumably the
# datamodule drops one column; confirm.
INPUT_DIM = 79
HIDDEN_DIMS = [256, 128]

expert_model = ExpertModel(INPUT_DIM, HIDDEN_DIMS)
print(expert_model)

ExpertModel(
  (model): Sequential(
    (0): BatchNorm1d(79, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=79, out_features=256, bias=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [None]:
# Both loggers write below the same experiment directory.
LOG_DIR = "../../../../logs/expert/no_preprocess_no_autoencoder_loic_http"

logger = TensorBoardLogger(LOG_DIR, name="expert_tensorboard")
csv_logger = CSVLogger(LOG_DIR, name="expert_csv")
# Record the learning rate once per epoch.
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# NOTE(review): `pl` is not imported explicitly in the visible cells; it
# presumably arrives through `from models import *` — confirm.
trainer = pl.Trainer(
    max_epochs=2,
    logger=[logger, csv_logger],
    callbacks=[lr_monitor],
    accelerator='gpu',
    precision='64-true',
)

# Train the expert model on the batches provided by the datamodule.
trainer.fit(expert_model, datamodule=datamodule)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2024-10-05 22:07:10.903195: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-05 22:07:10.937901: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-05 22:07:10.947680: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1