In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tabgan.sampler import GANGenerator
from torch.utils.data import Dataset, DataLoader

# Data Loading

In [None]:
# Set of lower-case attack category names.
# NOTE(review): no visible cell reads `to_train`; presumably it lists the values
# `attack_type` is meant to cycle through across runs — confirm.
to_train = {
    'ransomware', 'worms', 'mitm', 'shellcode', 'theft', 'analysis', 'generic',
    'bot', 'backdoor', 'fuzzers', 'scanning', 'exploits', 'infilteration',
}
# Attack category handled by this run; its upper-cased form is embedded in both
# the input CSV name and the output CSV name.
attack_type = 'ransomware'

In [None]:
# Full list of attack names the encoder is fitted on. Fitting on this fixed list
# rather than on the loaded data keeps the target encoding consistent regardless
# of which attack file is processed.
labels = [
    'DDoS', 'Reconnaissance', 'injection', 'DoS', 'Brute Force', 'password', 'xss',
    'Infilteration', 'Exploits', 'scanning', 'Fuzzers', 'Backdoor', 'Bot', 'Generic',
    'Analysis', 'Theft', 'Shellcode', 'mitm', 'Worms', 'ransomware',
]
le = LabelEncoder()
le.fit(labels)

In [None]:
# Read the cleaned flows for the selected attack type (the file name carries the
# upper-cased attack type) and discard the 'Label' column when it is present;
# errors='ignore' turns a missing column into a no-op.
attacks_dataset = (
    pd.read_csv(f'data/cleaned/attacks/NF-UQ-NIDS-{str.upper(attack_type)}.csv')
    .drop(columns=['Label'], errors='ignore')
)

# Swap the attack names for the integer codes of the fitted label encoder
attack_codes = le.transform(attacks_dataset['Attack'])
attacks_dataset['Attack'] = attack_codes

In [None]:
# 80/20 train/test split; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    attacks_dataset.drop(columns='Attack'),  # features: every column except the target
    attacks_dataset['Attack'],               # target: encoded attack label
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=475,
)
# Drop the reference to the full frame now that the split pieces exist
del attacks_dataset

# The targets come back as Series; turn each into a one-column DataFrame
y_train, y_test = y_train.to_frame(), y_test.to_frame()

In [None]:
# `attacks_dataset` is deleted right after the train/test split, so referencing
# it here raises NameError on a fresh Restart & Run All. Inspect the feature
# columns through X_train instead (the original columns minus the 'Attack' target).
print(X_train.columns)

In [None]:
# Columns handed to the GAN as categorical features: the port / protocol / flag
# fields followed by the sIP31..sIP0 and dIP31..dIP0 columns, in descending index
# order (presumably one column per source / destination address bit — confirm).
categorical_columns = (
    ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'TCP_FLAGS']
    + [f'sIP{bit}' for bit in range(31, -1, -1)]
    + [f'dIP{bit}' for bit in range(31, -1, -1)]
)

# Train GAN and Generate Data

In [None]:
# Build the tabgan generator, telling it which feature columns are categorical
gan_generator = GANGenerator(gen_x_times=1.1, cat_cols=categorical_columns)

# Produce synthetic attack rows from the training split. only_generated_data=True
# requests the synthetic rows alone; use_adversarial=True keeps tabgan's
# adversarial filtering step against X_test enabled.
# NOTE(review): flag semantics are taken from the tabgan docs — confirm for the installed version.
gen_x, gen_y = gan_generator.generate_data_pipe(
    X_train,
    y_train,
    X_test,
    only_adversarial=False,
    use_adversarial=True,
    only_generated_data=True,
)

# Format Data and Write to CSV

In [None]:
# Decode the generated integer targets back into attack names and attach them
# to the generated features as the 'Attack' column
gen_x['Attack'] = gen_y = le.inverse_transform(gen_y)

# Persist the synthetic rows; the output name carries the upper-cased attack type
synthetic_csv_path = f"data/synthetic/NF-UQ-NIDS-ATTACKS-{str.upper(attack_type)}-SYNTHETIC.csv"
gen_x.to_csv(synthetic_csv_path, index=False)