In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tabgan.sampler import GANGenerator
from torch.utils.data import Dataset, DataLoader

  from tqdm.autonotebook import tqdm


# Data Loading

In [2]:
# Fix the attack-name -> integer-code mapping up front so that encoding the dataset
# and decoding the generated targets both go through the same fitted encoder.
# LabelEncoder sorts its classes when fitted, so the codes do not depend on the
# order of this list.
# NOTE(review): 'Infilteration' is presumably spelled this way in the CSV - confirm
# before "correcting" it, since the strings must match the data exactly.
labels = [
    'DDoS', 'Reconnaissance', 'injection', 'DoS', 'Brute Force',
    'password', 'xss', 'Infilteration', 'Exploits', 'scanning',
    'Fuzzers', 'Backdoor', 'Bot', 'Generic', 'Analysis',
    'Theft', 'Shellcode', 'mitm', 'Worms', 'ransomware',
]
le = LabelEncoder()
le.fit(labels)

In [3]:
# Load the data
# Only the first MAX_ROWS rows are used. Passing `nrows` makes the parser stop there
# instead of reading the whole CSV into memory and discarding the rest with .head().
# NOTE: column dtypes are now inferred from these rows only.
MAX_ROWS = 10_000
attacks_dataset = (
    pd.read_csv('data/cleaned/NF-UQ-NIDS-ATTACKS.csv', nrows=MAX_ROWS)
    # 'Label' and 'Dataset' are removed here; 'Attack' stays and is used as the target.
    .drop(columns=['Label', 'Dataset'])
)

# Encode attack column
# Replaces the attack names with the integer codes of the fitted encoder `le`.
# le.transform raises ValueError for any name the encoder was not fitted on.
attacks_dataset['Attack'] = le.transform(attacks_dataset['Attack'])

In [4]:
# Split into training and test sets
# Everything except 'Attack' is a feature; 'Attack' (already integer-encoded) is the target.
features = attacks_dataset.drop(columns='Attack')
target = attacks_dataset['Attack']

# 80/20 shuffled split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=475,
)

# Convert targets back to dataframe from series
# (y_train is later handed to generate_data_pipe; presumably it expects a DataFrame - confirm.)
y_train, y_test = y_train.to_frame(), y_test.to_frame()

In [5]:
# List every column left after loading (features plus the encoded 'Attack' target).
dataset_columns = attacks_dataset.columns
print(dataset_columns)

Index(['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_BYTES',
       'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS',
       'FLOW_DURATION_MILLISECONDS', 'Attack', 'sIP31', 'sIP30', 'sIP29',
       'sIP28', 'sIP27', 'sIP26', 'sIP25', 'sIP24', 'sIP23', 'sIP22', 'sIP21',
       'sIP20', 'sIP19', 'sIP18', 'sIP17', 'sIP16', 'sIP15', 'sIP14', 'sIP13',
       'sIP12', 'sIP11', 'sIP10', 'sIP9', 'sIP8', 'sIP7', 'sIP6', 'sIP5',
       'sIP4', 'sIP3', 'sIP2', 'sIP1', 'sIP0', 'dIP31', 'dIP30', 'dIP29',
       'dIP28', 'dIP27', 'dIP26', 'dIP25', 'dIP24', 'dIP23', 'dIP22', 'dIP21',
       'dIP20', 'dIP19', 'dIP18', 'dIP17', 'dIP16', 'dIP15', 'dIP14', 'dIP13',
       'dIP12', 'dIP11', 'dIP10', 'dIP9', 'dIP8', 'dIP7', 'dIP6', 'dIP5',
       'dIP4', 'dIP3', 'dIP2', 'dIP1', 'dIP0'],
      dtype='object')


In [6]:
# Columns passed to GANGenerator as `cat_cols`: the port / protocol / TCP-flag
# columns plus the 32 'sIP<n>' and 32 'dIP<n>' columns.
# The IP columns are generated rather than typed out; they run from 31 down to 0,
# all 'sIP' names first and then all 'dIP' names.
# NOTE(review): sIP/dIP presumably hold one bit of the source/destination IPv4
# address each - confirm against the data-cleaning step.
categorical_columns = (
    ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'TCP_FLAGS']
    + [f'{prefix}{bit}' for prefix in ('sIP', 'dIP') for bit in range(31, -1, -1)]
)

# Train GAN and Generate Data

In [7]:
# Use tabgan to generate synthetic data based on the attacks data
# The generator is configured with the categorical column list; gen_x_times=60 is
# the requested amount of generated data (see the tabgan docs for how the final
# row count is derived - it can be lower after filtering).
generator = GANGenerator(gen_x_times=60, cat_cols=categorical_columns)

# Train on the training split, using the test features for the adversarial step,
# and keep only generated rows in the returned features (gen_x) and targets (gen_y).
gen_x, gen_y = generator.generate_data_pipe(
    X_train,
    y_train,
    X_test,
    only_adversarial=False,
    use_adversarial=True,
    only_generated_data=True,
)

Fitting CTGAN transformers for each column:   0%|          | 0/75 [00:00<?, ?it/s]

Fitting CTGAN transformers for each column: 100%|██████████| 75/75 [00:01<00:00, 44.05it/s]
Training CTGAN, epochs::  12%|█▏        | 61/500 [02:56<21:12,  2.90s/it]


[LightGBM] [Info] Number of positive: 838, number of negative: 837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 517
[LightGBM] [Info] Number of data points in the train set: 1675, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500299 -> initscore=0.001194
[LightGBM] [Info] Start training from score 0.001194
[LightGBM] [Info] Number of positive: 838, number of negative: 837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 516
[LightGBM] [Info] Number of data points in the train set: 1675, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500299 -> initscore=0.001194
[LightGBM] [Info]

In [8]:
# Dump the generated features followed by the generated (still integer-encoded) targets.
print(gen_x, gen_y, sep='\n')

      L4_SRC_PORT  L4_DST_PORT  PROTOCOL  L7_PROTO  IN_BYTES  OUT_BYTES  \
0               0           80         6     10.16      2107       4712   
1               0          111         6     13.00     28429       6508   
2               0            0         6      0.00      2035       9132   
3               0           80         6     11.00      3123       5304   
4               0          514        17      7.00      1906       4895   
...           ...          ...       ...       ...       ...        ...   
1042        49190         5060       198     41.00     25791       3959   
1043         8567          445        11     11.00     36609      25166   
1044         1043           80         6      0.00       950         25   
1045        65312          520         6     20.00     56780      15028   
1046         8567           25        17      7.00      1030       3135   

      IN_PKTS  OUT_PKTS  TCP_FLAGS  FLOW_DURATION_MILLISECONDS  ...  dIP9  \
0           2         

# Format Data and Write to CSV

In [9]:
# Transform targets back into attack categories
# The same fitted encoder `le` that produced the integer codes maps them back to names.
attack_names = le.inverse_transform(gen_y)

# NOTE: gen_y is rebound from integer codes to attack-name strings here, so this
# cell is meant to run once per generation run; the CSV-writing step reads gen_y.
gen_y = attack_names
print(gen_y)

['Fuzzers' 'Generic' 'Fuzzers' ... 'Analysis' 'DoS' 'Exploits']


In [10]:
# Write the generated data to a csv file
# Put the targets (gen_y) back next to their features as the 'Attack' column.
# This modifies gen_x in place.
gen_x['Attack'] = gen_y

# index=False keeps the pandas row index out of the file.
gen_x.to_csv(path_or_buf="data/synthetic/NF-UQ-NIDS-ATTACKS-SYNTHETIC.csv", index=False)