In [None]:
%load_ext autoreload
%autoreload 2
import sys

In [None]:
import tgan
from sklearn.ensemble import IsolationForest
from pyod.models.cblof import CBLOF
from tgan.model import TGANModel
import tensorflow as tf
# Switch TensorFlow to graph (non-eager) execution before any model is built.
# NOTE(review): presumably needed because TGAN relies on TF1-style graph code — confirm.
tf.compat.v1.disable_eager_execution()

In [None]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split
# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# Thread caps for MKL / OpenMP. The plain Python names are kept for backward
# compatibility, but on their own they configure nothing: the native libraries
# read these settings from the process environment, so export them as well.
# NOTE(review): libraries already initialised by an earlier import may not
# re-read the environment — move this export to the very first cell if the cap
# must be guaranteed.
MKL_NUM_THREADS = 6
OMP_NUM_THREADS = 6
os.environ["MKL_NUM_THREADS"] = str(MKL_NUM_THREADS)
os.environ["OMP_NUM_THREADS"] = str(OMP_NUM_THREADS)

In [None]:
def scale_to_01_range(x):
    """Min-max rescale ``x`` so that its values span the [0; 1] range.

    The minimum of ``x`` is subtracted from every value so the data starts at
    zero, and the result is then divided by the spread (max - min). When the
    spread is zero (all values equal) the division is skipped and the shifted
    values are returned unchanged.
    """
    highest = np.max(x)
    lowest = np.min(x)
    spread = highest - lowest
    shifted = x - lowest

    # A constant input has no spread; dividing would be a division by zero.
    if spread == 0:
        return shifted
    return shifted / spread

def compute_TSNE(perplexity, X_data, n_jobs=12, random_state=None):
    """Project ``X_data`` to 2-D with t-SNE and rescale each axis to [0; 1].

    Parameters
    ----------
    perplexity : float
        Perplexity forwarded to ``TSNE``.
    X_data : array-like
        Samples to embed; passed unchanged to ``TSNE.fit_transform``.
    n_jobs : int, default 12
        Number of parallel jobs forwarded to ``TSNE`` (previously hard-coded).
    random_state : int or None, default None
        Seed forwarded to ``TSNE``. ``None`` keeps the original unseeded
        behaviour; pass an int to make the embedding reproducible.

    Returns
    -------
    (tx, ty) : tuple
        First and second embedding coordinates, each rescaled independently
        with ``scale_to_01_range``.
    """
    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
    # confirm against the installed version.
    embedding = TSNE(
        n_components=2,
        perplexity=perplexity,
        early_exaggeration=12.0,
        learning_rate='auto',
        n_iter=1000,
        n_iter_without_progress=300,
        min_grad_norm=1e-07,
        metric='euclidean',
        verbose=0,
        method='barnes_hut',
        angle=0.5,
        n_jobs=n_jobs,
        random_state=random_state,
    ).fit_transform(X_data)

    # Split the 2-D embedding into its two coordinates and normalise each axis.
    tx = scale_to_01_range(embedding[:, 0])
    ty = scale_to_01_range(embedding[:, 1])
    return tx, ty

In [None]:
# Training features come from the "no attack" file, so every training label is 0.
ADFANet_train_X = np.load(
    '/home/notebook/attack_generation/saved_attacks_generated/adfa/train_X_no_attack.npy'
)
ADFANet_train_Y = np.zeros((ADFANet_train_X.shape[0], 1))

# Test features and their ground-truth labels.
ADFANet_test_X = np.load(
    '/home/notebook/attack_generation/saved_attacks_generated/adfa/test_X.npy'
)
ADFANet_test_Y = np.load(
    '/home/notebook/attack_generation/saved_attacks_generated/adfa/test_Y.npy'
)

In [None]:
# Wrap the training matrix in a DataFrame with named feature columns.
ADFANet_train_X_no_attack = pd.DataFrame(
    data=ADFANet_train_X,
    columns=["packets", "bytes", "Duration"],
)
ADFANet_train_X_no_attack

In [None]:
# Names of the three feature columns of the training frame.
# NOTE(review): `discrete_columns` is not referenced by any other visible cell.
discrete_columns = ["packets", "bytes", "Duration"] #with only packets, Duration it worked great...
# No column is declared continuous; this empty list is what is handed to TGANModel.
continuous_columns= []

In [None]:
# Hyper-parameters forwarded to TGANModel as keyword arguments.
tgan_settings = dict(
    max_epoch=10,
    steps_per_epoch=100,
    save_checkpoints=True,
    restore_session=False,
    batch_size=1000,
    z_dim=200,
    noise=0.2,
    l2norm=0.00001,
    learning_rate=0.001,
    num_gen_rnn=100,
    num_gen_feature=100,
    num_dis_layers=1,
    num_dis_hidden=100,
    optimizer='AdamOptimizer',
)

# NOTE: this rebinds the name `tgan`, which an earlier cell imported as a module.
tgan = TGANModel(continuous_columns, **tgan_settings)

# Train the generator on the attack-free training frame.
tgan.fit(ADFANet_train_X_no_attack)

In [None]:
# Number of synthetic rows to draw from the trained TGAN model.
num_samples = 50_000
X_ood = tgan.sample(num_samples)

In [None]:
# Coerce every generated column to a numeric dtype (column-wise pd.to_numeric).
attacks_saved = X_ood.apply(pd.to_numeric)

In [None]:
# Persist the numeric synthetic samples as a NumPy array.
np.save(
    '/home/notebook/attack_generation/saved_attacks_generated/adfa/TabGAN.npy',
    attacks_saved.to_numpy(),
)

In [None]:
# Remove generated rows that are exact copies of a real training row.
train_rows = ADFANet_train_X_no_attack.to_numpy()

# X_ood is a DataFrame the first time this cell runs and an ndarray afterwards
# (it is rebound below); accept both so the cell can be re-run safely.
generated_rows = X_ood.to_numpy() if hasattr(X_ood, "to_numpy") else np.asarray(X_ood)

# Hash every training row once and test membership per generated row. The
# previous broadcast comparison materialised an
# (n_generated x n_train x n_features) boolean array, which does not scale.
known_rows = set(map(tuple, train_rows.tolist()))
is_duplicate = np.fromiter(
    (tuple(row) in known_rows for row in generated_rows.tolist()),
    dtype=bool,
    count=len(generated_rows),
)

# Keep only the generated rows that do not appear in the training data.
X_ood = generated_rows[~is_duplicate]
ADFANet_train_X_no_attack.shape, X_ood.shape

In [None]:
from scipy.spatial import distance
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Features: the training rows stacked on top of the generated rows.
ADFANet_train_X__no_attacks_augmented = np.vstack(
    (ADFANet_train_X_no_attack, X_ood)
)

# Labels: a column of zeros for the training rows, ones for the generated rows.
normal_labels = np.zeros((ADFANet_train_X_no_attack.shape[0], 1))
attack_labels = np.ones((X_ood.shape[0], 1))
ADFANet_train_Y__no_attacks_augmented = np.vstack((normal_labels, attack_labels))

In [None]:
# Fit an XGBoost binary classifier on the augmented training set.
xb = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False)
xb.fit(
    ADFANet_train_X__no_attacks_augmented,
    ADFANet_train_Y__no_attacks_augmented,
)
predicted = xb.predict(ADFANet_test_X)

# Score the predictions against the test labels.
accuracy = accuracy_score(ADFANet_test_Y, predicted)
# ravel() of the 2x2 confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(ADFANet_test_Y, predicted).ravel()
mcc = matthews_corrcoef(ADFANet_test_Y, predicted)
print("tp, tn, fp, fn, accuracy, mcc:")
tp, tn, fp, fn, accuracy, mcc

In [None]:
# Dataset statistics written to the results log.
# Use the row count directly: wrapping it in np.unique() produced a one-element
# array, which ended up in the CSV as "[N]" instead of "N".
normal_in_training_set = ADFANet_train_X_no_attack.shape[0]
original_attack_in_training_set = 0

# Count the test labels once. np.unique returns the labels sorted, so index 0
# holds the count of the smaller label value and index 1 that of the larger one.
test_label_counts = np.unique(ADFANet_test_Y, return_counts=True)[1]
normal_in_test_set = test_label_counts[0]
attacks_in_test_set = test_label_counts[1]
generated_attacks = X_ood.shape[0]

PATH = '/data/notebook/ganzata_data/'
# Opened in append mode and left open on purpose: later cells write further
# rows through this same handle.
adfa_competitors = open(PATH+"adfa_competitors.csv", "a")

# NOTE(review): '{:3f}' means "minimum width 3, default six decimals"; if three
# decimals were intended the spec should be '{:.3f}'. Left unchanged so new rows
# match the ones already in the file.
adfa_competitors.write('ADFANet, '+
                       'TabGAN, '+
                       'XGBoost, '+
                       '0.6--0.4, '+
                       str(normal_in_training_set)+', '+
                       str(original_attack_in_training_set)+', '+
                       str(generated_attacks)+', '+
                       str(normal_in_test_set)+', '+
                       str(attacks_in_test_set)+', '+
                       str(ADFANet_test_X.shape[1])+', '+
                       ' REMOVE IDENTICAL ELEMENTS, '+
                       '{}, {}, {}, {}, {:3f}, {:3f} \n'.format(tp, tn, fp, fn, accuracy, mcc))
# Push the row to disk now, since the handle stays open.
adfa_competitors.flush()

In [None]:
# Number of generated attacks to inject, as a fraction of the normal training rows.
attack_ratio = 0.2

total = int(ADFANet_train_X_no_attack.shape[0] * attack_ratio)

# Fail loudly instead of calling sys.exit(), which is not a clean way to stop a
# notebook cell.
if total > X_ood.shape[0]:
    raise ValueError("contamination is larger than the available attacks")

# Draw `total` distinct generated rows. Sampling without replacement is what the
# availability check above implies; randint could pick the same row repeatedly.
idx = np.random.choice(X_ood.shape[0], size=total, replace=False)
attacks = X_ood[idx, :]

# Build the contaminated training set from the sampled subset. Previously all of
# X_ood was stacked here and `attacks` was never used, so the contamination
# value computed below did not describe the data the detectors were fitted on.
# NOTE(review): the logging cells further down still report X_ood.shape[0] as
# the number of generated attacks; they should report attacks.shape[0].
ADFANet_train_X__no_attacks_augmented = np.vstack((ADFANet_train_X_no_attack, attacks))
ADFANet_train_Y__no_attacks_augmented = np.vstack((
    np.zeros((ADFANet_train_X_no_attack.shape[0], 1)),
    np.ones((attacks.shape[0], 1)),
))

# Convert "attacks per normal row" into "attacks per row of the augmented set",
# which is the proportion the detectors' `contamination` argument receives.
contamination = attack_ratio / (1 + attack_ratio)

contamination

In [None]:
# Configure the Isolation Forest with the contamination computed above, then
# fit it on the augmented training set.
isol_1 = IsolationForest(
    n_estimators=300,
    max_samples='auto',
    contamination=contamination,
    max_features=1.0,
    bootstrap=False,
    n_jobs=12,
    warm_start=False,
)
isol_1.fit(ADFANet_train_X__no_attacks_augmented)

predicted = isol_1.predict(ADFANet_test_X)

Y = ADFANet_test_Y
# Remap the detector output: -1 becomes 1, everything else becomes 0.
predicted = np.where(predicted == -1, 1, 0)

accuracy = accuracy_score(Y, predicted)
mcc = matthews_corrcoef(Y, predicted)
# ravel() of the 2x2 confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(Y, predicted).ravel()
tp, tn, fp, fn, accuracy, mcc

In [None]:
# Dataset statistics written to the results log.
# Use the row count directly: wrapping it in np.unique() produced a one-element
# array, which ended up in the CSV as "[N]" instead of "N".
normal_in_training_set = ADFANet_train_X_no_attack.shape[0]
original_attack_in_training_set = 0

# Count the test labels once. np.unique returns the labels sorted, so index 0
# holds the count of the smaller label value and index 1 that of the larger one.
test_label_counts = np.unique(ADFANet_test_Y, return_counts=True)[1]
normal_in_test_set = test_label_counts[0]
attacks_in_test_set = test_label_counts[1]
generated_attacks = X_ood.shape[0]

# Append the Isolation Forest row through the handle opened by an earlier cell.
adfa_competitors.write('ADFANet, '+
                       'TabGAN, '+
                       'Isolation Forest, '+
                       '0.6--0.4, '+
                       str(normal_in_training_set)+', '+
                       str(original_attack_in_training_set)+', '+
                       str(generated_attacks)+', '+
                       str(normal_in_test_set)+', '+
                       str(attacks_in_test_set)+', '+
                       str(ADFANet_test_X.shape[1])+', '+
                       ' REMOVE IDENTICAL ELEMENTS, '+
                       '{}, {}, {}, {}, {:3f}, {:3f} \n'.format(tp, tn, fp, fn, accuracy, mcc))
# Push the row to disk now, since the handle stays open.
adfa_competitors.flush()

In [None]:
# Fit the CBLOF detector on the augmented training set using the same
# contamination value.
clf = CBLOF(n_jobs=12, contamination=contamination)
clf.fit(ADFANet_train_X__no_attacks_augmented)

# predict() yields outlier labels (0 or 1).
predicted = clf.predict(ADFANet_test_X)

Y = ADFANet_test_Y
accuracy = accuracy_score(Y, predicted)
mcc = matthews_corrcoef(Y, predicted)
# ravel() of the 2x2 confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(Y, predicted).ravel()
tp, tn, fp, fn, accuracy, mcc

In [None]:
# Dataset statistics written to the results log.
# Use the row count directly: wrapping it in np.unique() produced a one-element
# array, which ended up in the CSV as "[N]" instead of "N".
normal_in_training_set = ADFANet_train_X_no_attack.shape[0]
original_attack_in_training_set = 0

# Count the test labels once. np.unique returns the labels sorted, so index 0
# holds the count of the smaller label value and index 1 that of the larger one.
test_label_counts = np.unique(ADFANet_test_Y, return_counts=True)[1]
normal_in_test_set = test_label_counts[0]
attacks_in_test_set = test_label_counts[1]
generated_attacks = X_ood.shape[0]

# Append the CBLOF row through the handle opened by an earlier cell.
adfa_competitors.write('ADFANet, '+
                       'TabGAN, '+
                       'CBLOF, '+
                       '0.6--0.4, '+
                       str(normal_in_training_set)+', '+
                       str(original_attack_in_training_set)+', '+
                       str(generated_attacks)+', '+
                       str(normal_in_test_set)+', '+
                       str(attacks_in_test_set)+', '+
                       str(ADFANet_test_X.shape[1])+', '+
                       ' REMOVE IDENTICAL ELEMENTS, '+
                       '{}, {}, {}, {}, {:3f}, {:3f} \n'.format(tp, tn, fp, fn, accuracy, mcc))
# Push the row to disk now, since the handle stays open.
adfa_competitors.flush()