In [0]:
# Select the 2.x release of TensorFlow through the notebook's %tensorflow_version line magic.
%tensorflow_version 2.x

In [0]:
# Install the required dependencies that Google Colab does not ship with.
# NOTE(review): versions are not pinned, so results may change as these packages are updated.
!pip install bayesian-optimization
!pip install mlxtend --upgrade --no-deps

In [0]:
#Importamos los paquetes necesarios para el proyecto
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.utils import get_file
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from PIL import Image
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
from keras import backend as K
from keras.utils.generic_utils import get_custom_objects
from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense, Conv2D,LSTM, MaxPooling2D,UpSampling2D,Conv2DTranspose, Dropout, Flatten, Activation, LeakyReLU, ReLU, Input,concatenate,BatchNormalization
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l1
from tensorflow.keras.callbacks import EarlyStopping

In [0]:
# Locations of the dataset CSVs previously uploaded to Google Drive
# (faster than downloading them straight from the dataset website).
# NOTE(review): no Drive mount is visible in this notebook; these paths assume
# Drive is already available under /content/drive -- confirm.
path_train = "/content/drive/My Drive/TFG/UNSW_NB15_training-set.csv"
path_test = "/content/drive/My Drive/TFG/UNSW_NB15_testing-set.csv"

# Read both files with every column loaded as text (dtype='unicode').
df_train, df_test = (
  pd.read_csv(csv_path, dtype='unicode') for csv_path in (path_train, path_test)
)

# Discard the columns that are not used further on.
df_train = df_train.drop(columns=['id', 'attack_cat'])
df_test = df_test.drop(columns=['id', 'attack_cat'])

In [0]:
# Stack the training and test frames row-wise so the whole dataset can be
# manipulated as a single frame (original row indices are kept as-is).
df = pd.concat((df_train, df_test), axis='index')

In [0]:
# Split the dataset by class. Labels are compared as strings because the CSVs
# were loaded with every column as text.
df_class_Normal = df[df['label'] == '0']
df_class_Attack = df[df['label'] == '1']

# Under-sample the attack class down to the size of the normal class.
# A fixed random_state makes the drawn subset (and therefore every downstream
# result) reproducible across kernel restarts; 42 matches the seed used for the
# train/validation split later in the notebook.
# Note: sampling without replacement raises ValueError if there are fewer
# attack rows than normal rows.
df_class_Attack = df_class_Attack.sample(n=df_class_Normal.shape[0], random_state=42)

# Recombine both classes into the balanced dataset.
df = pd.concat([df_class_Normal, df_class_Attack], axis=0)

In [0]:
# Report the size of the dataset and the number of rows per label value, then
# draw a pie chart with the percentage share of each label value.
print("Shape of dataFrame: {} \n".format(df.shape))
print("Number of attack samples")
display(df['label'].value_counts())
print()
print("Plotting balance of dataFrame")
# Relative frequencies expressed as percentages.
df_plot = df['label'].value_counts(normalize=True).mul(100)
df_plot.plot.pie(figsize=(10,10), title='Balance of dataset (%)')

In [0]:
# Convert string-valued columns into integer codes.
def encode_string_byte (df,name):
  """Replace column `name` of `df`, in place, with integer class codes.

  A fresh LabelEncoder is fitted on the column and its transformed values are
  written back to the same column. The intent is for the codes to fit in one
  byte (0..255); that only holds while the column has at most 256 distinct
  values -- nothing here enforces it.
  """
  encoder = LabelEncoder()
  encoded = encoder.fit_transform(df[name])
  df[name] = encoded

# Encode each categorical text column in turn.
for _categorical_column in ('proto', 'state', 'service'):
  encode_string_byte(df, _categorical_column)

In [0]:
# Min-max normalisation of numeric columns to decimal values between 0 and 1.
def numerical_minmax_normalization (df, name):
  """Rescale column `name` of `df`, in place, with a MinMaxScaler.

  The column values are reshaped into a single-feature 2-D array, a new
  scaler (default settings) is fitted on them, and the scaled values are
  written back to the same column.
  """
  column_as_2d = df[name].values.reshape(-1, 1)
  df[name] = preprocessing.MinMaxScaler().fit_transform(column_as_2d)

# Apply the min-max normalisation to every numeric feature column, one at a
# time and in the listed order.
for _numeric_column in (
  'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl',
  'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit',
  'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean',
  'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
  'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
  'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
  'ct_srv_dst', 'is_sm_ips_ports',
):
  numerical_minmax_normalization(df, _numeric_column)

In [0]:
# Map the normalised values of the previous step to integers between 0 and 255 (one byte of information).
def numerical_split_ohe (df,name):
  """Quantise column `name` of `df`, in place, into 256 integer levels.

  The column is expected to hold values normalised to [0, 1] (numbers or
  numeric strings). With bin edges k/256 for k = 0..255, a value v is mapped to:

    * 0                    if 0 <= v <= 1/256
    * k                    if k/256 < v <= (k+1)/256
    * 255                  if v > 255/256 (including anything above 1)

  These are the same bins the previous element-by-element search produced,
  computed in one vectorised pass instead of a Python loop over every value
  and every bin.

  Parameters:
    df:   DataFrame modified in place.
    name: label of the column to quantise.

  Raises:
    ValueError: if the column contains NaN or negative values. Such values
      have no bin; previously they were silently skipped, which only showed up
      as a length-mismatch ValueError on assignment.
  """
  n_levels = 256
  values = df[name].astype(float).to_numpy()

  # NaN compares False here, so this catches both NaN and negative entries.
  has_bin = values >= 0.0
  if not has_bin.all():
    raise ValueError(
      "column {!r} has {} value(s) that are NaN or negative and cannot be "
      "quantised".format(name, int((~has_bin).sum()))
    )

  # ceil(v * 256) - 1 yields bins that are closed on the right; clipping folds
  # v == 0 into level 0 and everything above the last edge into level 255.
  levels = np.clip(np.ceil(values * n_levels) - 1, 0, n_levels - 1)
  df[name] = levels.astype(np.int64)


# Quantise every normalised numeric feature column, one at a time and in the
# listed order.
for _quantised_column in (
  'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl',
  'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit',
  'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean',
  'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
  'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
  'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
  'ct_srv_dst', 'is_sm_ips_ports',
):
  numerical_split_ohe(df, _quantised_column)

In [0]:
# Take the 'label' column out of the feature frame (df is modified in place)
# and turn it into an indicator matrix with one column per distinct label value.
y_column = df.pop('label')
dummies = pd.get_dummies(y_column)
y = dummies.to_numpy()

In [0]:
# Scale the features to the range [-0.5, 0.5]; this assumes every column holds
# values in 0..255, as produced by the encoding/quantisation cells above.
# Computed as one vectorised expression over the whole feature matrix instead
# of a Python loop that appends one row at a time; the values are the same and
# the result stays two-dimensional even when the frame has no rows.
x = df.to_numpy() / 255 - 0.5


In [0]:
# Split the dataset into a training set (75%) and a validation set (25%),
# stratified on the labels, with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

# Only one split is generated, so take it directly from the iterator.
train_index, test_index = next(sss.split(x, y))
x_train, x_test = x[train_index], x[test_index]
y_train, y_test = y[train_index], y[test_index]

In [0]:
# Definition of the final model.
def LSTM_model(n_classes=2):
  """Build and compile the LSTM classifier.

  Architecture: an input of variable-length sequences with one feature per
  step, an LSTM layer (128 units), two Dense layers (128 and 64 units) and a
  sigmoid output layer with one unit per class. The LSTM and hidden Dense
  layers use ReLU activations and an L1 activity regulariser of 1e-5.

  Parameters:
    n_classes: number of units in the output layer. It must equal the number
      of columns of the one-hot targets passed to fit(); this notebook has two
      classes (Normal / Attack). The previous hard-coded value of 10 did not
      match those two-column targets, which binary cross-entropy rejects with
      a shape-mismatch error.

  Returns:
    The compiled Keras Model (Adam optimiser, binary cross-entropy loss,
    accuracy metric).
  """
  # NOTE(review): the feature matrix built earlier in the notebook looks 2-D
  # (samples, features); confirm it reaches this 3-D input as
  # (samples, features, 1).
  input_img = Input(shape = (None, 1))
  output = LSTM(128,activation='relu',activity_regularizer=l1(1e-5))(input_img)
  output = Dense(128, activation='relu',activity_regularizer=l1(1e-5))(output)
  output = Dense(64, activation='relu',activity_regularizer=l1(1e-5))(output)
  out    = Dense(n_classes, activation='sigmoid')(output)

  model = Model(inputs = input_img, outputs = out)
  model.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])

  return model

In [0]:
# Training: build the model and fit it with early stopping on the validation loss.
model = LSTM_model()

# Stop once val_loss has not improved by at least 0.01 for 10 epochs and
# restore the weights of the best epoch.
es = EarlyStopping(
  monitor='val_loss',
  mode='min',
  verbose=1,
  min_delta=0.01,
  patience=10,
  restore_best_weights=True,
)

history = model.fit(
  x_train,
  y_train,
  validation_data=(x_test, y_test),
  verbose=1,
  batch_size=256,
  epochs=200,
  callbacks=[es],
)

In [0]:
# Predictions of the trained network, scored with accuracy, precision, recall and F1.
# The metrics are computed on the held-out split (x_test / y_test) only.
# Scoring the full dataset would include the samples the network was fitted on
# and overstate its performance.
# NOTE(review): this split is also the one monitored by early stopping, so it
# is a validation set rather than a fully independent test set.
y_pred = np.argmax(model.predict(x_test), axis=1)
y_true = np.argmax(y_test, axis=1)

print("Accuracy: {}" .format(metrics.accuracy_score(y_true, y_pred)))
print("Precision: {}" .format(metrics.precision_score(y_true, y_pred, average='macro')))
print("Recall: {}" .format(metrics.recall_score(y_true, y_pred, average='macro')))
print("F1: {}" .format(metrics.f1_score(y_true, y_pred, average='macro')))

In [0]:
# Helper that draws the confusion matrix.
def plot_confusing_matrix (y_compare,pred,n_categories,outcome_labels):
  """Compute and display the confusion matrix of `pred` against `y_compare`.

  Parameters:
    y_compare:      true class indices.
    pred:           predicted class indices.
    n_categories:   number of classes; the matrix is built over the labels
                    0 .. n_categories - 1.
    outcome_labels: display names for the classes, one per category, or None.

  Raises:
    ValueError: if `outcome_labels` is given and its length differs from
      `n_categories`.
  """
  if outcome_labels is not None and len(outcome_labels) != n_categories:
    raise ValueError(
      "outcome_labels has {} entries but n_categories is {}".format(
        len(outcome_labels), n_categories))

  cm = metrics.confusion_matrix(y_compare, pred, labels = list(range(n_categories)))
  plot_confusion_matrix(conf_mat=cm,figsize=(13,13),class_names = outcome_labels,show_normed=True)
  # Title corrected from the misspelt 'Confusing Matrix'.
  plt.title('Confusion Matrix')
  plt.ylabel('Target')
  plt.xlabel('Predicted')
  plt.show()

In [0]:
# Confusion matrix for the two classes (index 0 shown as "Normal", index 1 as "Attack").
outcome_labels = ["Normal","Attack"]
plot_confusing_matrix(
  y_true,
  y_pred,
  n_categories=len(outcome_labels),
  outcome_labels=outcome_labels,
)