In [None]:
import pandas as pd
import numpy as np
from google.colab import files
 
import tensorflow as tf
from tensorflow import keras
 
from keras.preprocessing.text import Tokenizer
 
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding, Input, GRU, TimeDistributed
from keras.models import Sequential
 
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import confusion_matrix
 
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Análisis del dataset de entrenamiento (CTU19)

In [2]:
# Download the two CSV datasets published in the project's GitHub repository.
# Among the cells shown here only `ctu19` is used afterwards.
dataset_urls = {
    'ctu19': 'https://raw.githubusercontent.com/DuzzLogic/Botnet_Detection/main/ctu19_s.csv',
    'ctu13': 'https://raw.githubusercontent.com/DuzzLogic/Botnet_Detection/main/ctu13_s.csv',
}

ctu19 = pd.read_csv(dataset_urls['ctu19'])
ctu13 = pd.read_csv(dataset_urls['ctu13'])

# `url` stays bound to the last address that was fetched.
url = dataset_urls['ctu13']

In [3]:
# Class distribution of the CTU19 labels (number of rows per `Label` value).
label_counts = ctu19['Label'].value_counts()
label_counts

Botnet    25818
Normal     1596
Name: Label, dtype: int64

# Funciones

In [4]:
import os
import subprocess
from subprocess import getoutput  # kept so any later cell relying on this name still works

# Fetch the project repository that ships the `funjr` helper module and make it
# importable by switching the working directory into the checkout.
REPO_URL = "https://github.com/DuzzLogic/Botnet_Detection"
REPO_DIR = "cloned-repo"

# Idempotent on purpose: re-running this cell used to fail because the kernel
# was already inside `cloned-repo`, so a second `os.chdir('cloned-repo')` raised.
if os.path.basename(os.getcwd()) != REPO_DIR:
    if not os.path.isdir(REPO_DIR):
        # check=True raises right here when the clone fails, instead of
        # surfacing later as an unrelated FileNotFoundError / ImportError.
        # The former `-l -s` flags only apply to clones of local repositories.
        subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
    os.chdir(REPO_DIR)

# NOTE(review): this import relies on the working directory being on sys.path
# (the default in IPython/Colab kernels) -- confirm for other runners.
import funjr

In [5]:
def to_sequence(x, y):
  """Encode state strings as fixed-length integer sequences and binarize labels.

  Args:
    x: iterable of strings; each string is tokenized character by character.
    y: iterable of labels; "Botnet" maps to 1 and every other value to 0.

  Returns:
    A pair ``(sequence_x, labels)`` where ``sequence_x`` is the int32 result of
    ``pad_sequences`` with ``maxlen=1000`` (padded and truncated at the end,
    pad value 0) and ``labels`` is a 1-D int32 array.
  """
  # Fixed 50-character alphabet the tokenizer vocabulary is built from.
  alphabet = "abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ123456789.,+*0"

  # Case-sensitive, character-level tokenizer fitted on the alphabet only,
  # so the vocabulary does not depend on the data passed in.
  char_tokenizer = Tokenizer(num_words=51, char_level=True, lower=False)
  char_tokenizer.fit_on_texts(alphabet)

  # Strings -> lists of integer ids -> one array with rows of length 1000.
  padded = tf.keras.preprocessing.sequence.pad_sequences(
    char_tokenizer.texts_to_sequences(x),
    maxlen=1000,
    dtype='int32',
    padding='post',
    truncating='post',
    value=0,
  )

  labels = np.fromiter((int(label == "Botnet") for label in y), dtype='int32')

  return padded, labels

# Model

In [6]:
def create_model(x_train, y_train):
  """Build, compile and fit the bidirectional-LSTM classifier.

  Despite its name, this function also trains the network: the returned model
  has already been fitted for 10 epochs on the given data.

  Args:
    x_train: training inputs passed to ``fit``; the embedding layer is declared
      with ``input_length=1000`` and ``input_dim=51``.
    y_train: binary targets for the sigmoid output.

  Returns:
    The fitted ``Sequential`` model.
  """
  layers = [
    Embedding(input_dim=51, output_dim=1000, input_length=1000),
    # NOTE(review): `input_shape=(1000, 50)` does not match the embedding's
    # output_dim of 1000; presumably it is ignored because this is not the
    # first layer -- confirm which embedding size was intended.
    Bidirectional(LSTM(units=128, input_shape=(1000, 50), return_sequences=True)),
    Dropout(0.1),
    # Attention layer provided by the project's `funjr` module.
    funjr.AttentionWithContext(),
    Dense(1, activation='sigmoid'),
  ]
  classifier = Sequential(layers)

  classifier.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

  # NOTE(review): Keras documents `validation_split` as taking the last
  # fraction of the samples before shuffling; if the resampled training data
  # is ordered by class the validation slice may be unbalanced -- confirm.
  classifier.fit(x_train, y_train, epochs=10, validation_split=0.2)

  return classifier

In [7]:
def train_model(x_train, y_train):
  """Encode the raw training data and return a model fitted on it.

  Args:
    x_train: raw inputs, forwarded to ``to_sequence``.
    y_train: raw labels, forwarded to ``to_sequence``.

  Returns:
    The model returned by ``create_model`` for the encoded data.
  """
  encoded_x, encoded_y = to_sequence(x_train, y_train)
  return create_model(encoded_x, encoded_y)

In [8]:
def predict_test(model, x_test, y_test):
  """Predict on held-out data and score the predictions.

  Args:
    model: object exposing ``predict``; its outputs are thresholded at 0.5.
    x_test: raw inputs, encoded with ``to_sequence`` before prediction.
    y_test: raw labels, encoded with ``to_sequence`` before scoring.

  Returns:
    Whatever ``funjr.calculate_metrics`` returns for the 0/1 predictions and
    the encoded labels.
  """
  encoded_x, encoded_y = to_sequence(x_test, y_test)

  # Scores below 0.5 become class 0, everything else class 1.
  predicted = []
  for score in model.predict(encoded_x):
    if score < 0.5:
      predicted.append(0)
    else:
      predicted.append(1)

  return funjr.calculate_metrics(predicted, encoded_y)

In [11]:
x_train = ctu19['State']
y_train = ctu19['Label']

# Monte Carlo cross-validation: repeated random 70/30 train/test splits.
# Alternative configuration with more repetitions:
#mc = ShuffleSplit(n_splits=30, test_size = 0.3, random_state=1)
mc = ShuffleSplit(n_splits=2, test_size=0.3, random_state=1)

split_metrics = []
for count, (train, test) in enumerate(mc.split(x_train), start=1):
  # ShuffleSplit yields *positional* indices, so select rows with .iloc;
  # plain `series[indices]` is a label lookup and only matches positions
  # while the Series carries a default 0..n-1 index.
  x_fold = x_train.iloc[train]
  y_fold = y_train.iloc[train]

  # Apply the combined over-/under-sampling helper to the training fold only.
  x, y = funjr.bothSam(x_fold, y_fold, 0.1)

  split_metrics.append(
    predict_test(train_model(x, y), x_train.iloc[test], y_train.iloc[test])
  )
  # Progress indicator: number of splits completed so far.
  print(count)

# Collect the per-split results into a table (nothing is written to disk).
# NOTE(review): the column names assume `funjr.calculate_metrics` returns its
# values in this order -- confirm against the helper.
metrics = pd.DataFrame(split_metrics)
metrics = metrics.rename(columns={
    0: 'Specificity',
    1: 'Sensitivity',
    2: 'Balanced Accuracy',
    3: 'F1',
})


Epoch 9/10
Epoch 10/10
1




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
2


In [12]:
# Display the table of per-split evaluation metrics.
metrics

Unnamed: 0,Specificity,Sensitivity,Balanced Accuracy,F1
0,0.898129,0.948993,0.923561,0.970678
1,0.908277,0.954101,0.931189,0.973885
