In [1]:
import os
import sys
import warnings

sys.path.append('../../../')
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from models import *
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from pytorch_lightning.callbacks import LearningRateMonitor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the flow table from parquet and preview its first five rows.
data_path = '../../../data/CIC_2018/no_preprocess/df_equal_FTP-BruteForce.parquet'
df = pd.read_parquet(path=data_path)
df.head(5)

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,1518597061,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320860.0,139.300034,56320958,56320761,Benign
1,0,0,1518597230,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320732.0,114.5513,56320814,56320652,Benign
2,0,0,1518597399,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319312.0,301.934601,56319525,56319098,Benign
3,22,6,1518597613,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,1518597623,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [3]:
# Number of rows whose class label is missing.
df['Label'].isnull().sum()

0

In [4]:
# Rows per class, most frequent class first.
df['Label'].value_counts(sort=True, ascending=False)

Label
Benign            667626
FTP-BruteForce    193360
SSH-Bruteforce    187589
Name: count, dtype: int64

In [5]:
# Rows whose label is the literal string 'Label' are repeated header lines,
# not flows; collect their index labels and drop them.
header_rows = df.index[df['Label'] == 'Label']
df = df.drop(index=header_rows)

In [6]:
# Remove every row that cannot be fed to the network.
#
# `dropna()` on its own only removes NaN; +inf / -inf values are kept.
# Non-finite inputs are the likely cause of the NaN predictions seen during
# the trainer's sanity check (the input batch reports zero NaN, yet the model
# output contains NaN), so rows with any non-finite numeric value are dropped
# first. `np.isfinite` is False for NaN as well as for +/-inf.
numeric_cols = df.select_dtypes(include="number")
finite_rows = np.isfinite(numeric_cols).all(axis=1)
# The trailing dropna() still covers missing values in non-numeric columns
# such as the label.
df = df.loc[finite_rows].dropna()

In [7]:
# Split the cleaned frame into features / integer labels and carve out a
# stratified 80/20 train/test split.
SPLIT_SEED = 42  # fixed so that Restart & Run All reproduces the same split
label_to_id = {'Benign': 0, 'FTP-BruteForce': 1, 'SSH-Bruteforce': 2}

X = df.drop('Label', axis=1)
y = df['Label'].map(label_to_id)

# `.map` silently turns any label missing from the mapping into NaN; fail
# loudly here instead of letting NaN labels reach the split / the model.
unmapped_labels = df.loc[y.isna(), 'Label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels without an integer id: {list(unmapped_labels)}")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,  # keep the class proportions identical in both splits
    random_state=SPLIT_SEED,
)

In [8]:
# Re-attach each label column to its feature frame, giving one frame per split.
df_train, df_test = (
    pd.concat((features, labels), axis="columns")
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [9]:
# Total number of missing cells in the training frame.
df_train.isnull().sum().sum()

0

In [10]:
# Total number of missing cells in the test frame.
df_test.isnull().sum().sum()

0

In [11]:
# Rows per encoded class in the training split, most frequent first.
df_train['Label'].value_counts(sort=True, ascending=False)

Label
0    532284
1    154683
2    150071
Name: count, dtype: int64

In [12]:
# Print the column dtypes and non-null counts of both splits.
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 837038 entries, 821177 to 798743
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   Dst Port           837038 non-null  int32   
 1   Protocol           837038 non-null  int8    
 2   Timestamp          837038 non-null  int32   
 3   Flow Duration      837038 non-null  int64   
 4   Tot Fwd Pkts       837038 non-null  int16   
 5   Tot Bwd Pkts       837038 non-null  int16   
 6   TotLen Fwd Pkts    837038 non-null  int32   
 7   TotLen Bwd Pkts    837038 non-null  int32   
 8   Fwd Pkt Len Max    837038 non-null  int32   
 9   Fwd Pkt Len Min    837038 non-null  int16   
 10  Fwd Pkt Len Mean   837038 non-null  float16 
 11  Fwd Pkt Len Std    837038 non-null  float16 
 12  Bwd Pkt Len Max    837038 non-null  int16   
 13  Bwd Pkt Len Min    837038 non-null  int16   
 14  Bwd Pkt Len Mean   837038 non-null  float16 
 15  Bwd Pkt Len Std    837038 non-null

In [13]:
# Wrap the train/test frames in the project's ExpertPretrainDataModule.
# NOTE(review): `binarize_on_label=1` presumably turns the 3-class label into
# "class 1 vs. rest" (id 1 is FTP-BruteForce in the label mapping of the split
# cell), which would match the model's single output unit — confirm against
# the datamodule implementation.
datamodule = ExpertPretrainDataModule(df_train, df_test, binarize_on_label=1)

In [14]:
# Build the expert network.
# The input width is taken from the feature matrix instead of being hard-coded
# (it was the literal 79), so it cannot drift from the data if columns are
# added or removed upstream. [256, 128] are the hidden layer widths.
n_features = X_train.shape[1]
expert_model = ExpertModel(n_features, [256, 128])
print(expert_model)

ExpertModel(
  (model): Sequential(
    (0): BatchNorm1d(79, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=79, out_features=256, bias=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [15]:
# Configure logging and train the expert model.
# Both loggers write below the same directory; keep it in one place rather
# than repeating the literal (the originals were f-strings with no
# placeholders, so a plain string is equivalent).
log_dir = "../../../logs/expert/no_preprocess_no_autoencoder"
logger = TensorBoardLogger(log_dir, name="expert_tensorboard")
csv_logger = CSVLogger(log_dir, name="expert_csv")
# Record the learning rate once per epoch.
lr_monitor = LearningRateMonitor(logging_interval='epoch')

trainer = pl.Trainer(
    max_epochs=50,
    logger=[logger, csv_logger],
    callbacks=[lr_monitor],
    accelerator='gpu',
    # Full double precision. NOTE(review): presumably chosen because the raw,
    # unscaled features span a very wide numeric range — confirm.
    precision='64-true',
)

trainer.fit(expert_model, datamodule=datamodule)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2024-10-04 12:45:16.705828: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-04 12:45:16.741417: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-04 12:45:16.751235: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

torch.isnan(x).sum()=tensor(0, device='cuda:0')
x.dtype=torch.float64
y_hat.dtype=torch.float64
torch.isnan(y_hat).sum()=tensor(9, device='cuda:0')
y_hat.min().item()=nan
y_hat.max().item()=nan
torch.isnan(x).sum()=tensor(0, device='cuda:0')
x.dtype=torch.float64
y_hat.dtype=torch.float64
torch.isnan(y_hat).sum()=tensor(9, device='cuda:0')
y_hat.min().item()=nan
y_hat.max().item()=nan
np.isnan(y_scores).sum()=18
np.isnan(y_true).sum()=0


ValueError: Input contains NaN.