In [None]:
pip install tensorflow-addons

In [None]:
import pandas as pd
import numpy as np
import keras
import keras.backend as K
import tensorflow as tf
import tensorflow_addons as tfa
import os
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Dropout, Embedding, Flatten
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras import layers

In [None]:
class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention over a sequence.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of size ``embed_dim // num_heads``, attended per head,
    then concatenated and projected back to ``embed_dim``.

    Args:
        embed_dim: Size of the last axis of the projections and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...),
            forwarded to ``layers.Layer`` so the layer can be re-created from
            its config.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(output, weights)`` of scaled dot-product attention."""
        score = tf.matmul(query, key, transpose_b=True)
        # Cast the scale factor to the score dtype (instead of a fixed float32)
        # so the division is also valid under a non-float32 compute dtype.
        dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape ``x`` to (batch_size, num_heads, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` of shape (batch_size, seq_len, embed_dim)."""
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        # Each of the three becomes (batch_size, num_heads, seq_len, projection_dim).
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)
        attention, _ = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        return self.combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)

    def get_config(self):
        """Return the layer config, including the constructor arguments."""
        cfg = super().get_config()
        # Without these entries the default from_config() cannot call __init__.
        cfg.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return cfg

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention and a feed-forward network,
    each followed by dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: Size of the last axis of the input and output.
        num_heads: Number of attention heads passed to ``MultiHeadSelfAttention``.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate used after the attention and feed-forward parts.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...),
            forwarded to ``layers.Layer`` so the layer can be re-created from
            its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block; ``training`` is forwarded to the dropout layers.

        ``training`` defaults to ``None`` so the layer can also be called
        directly without the flag.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the layer config, including the constructor arguments."""
        cfg = super().get_config()
        # Without these entries the default from_config() cannot call __init__.
        cfg.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return cfg

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and an embedding of each token's position.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Output size of both embeddings.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...),
            forwarded to ``layers.Layer`` so the layer can be re-created from
            its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of positions 0..seq_len-1."""
        # Length of the last axis of the actual input (not the constructor's maxlen).
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the layer config, including the constructor arguments."""
        cfg = super().get_config()
        # Without these entries the default from_config() cannot call __init__.
        cfg.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return cfg

In [None]:
# Dataset directory, relative to the notebook's starting working directory.
root_path = "../datasets/"
# Change into it so the relative paths used by later cells (the CSV file name,
# '../best_models/...') resolve from this directory.
os.chdir(root_path)

In [None]:
# Load the dataset; later cells read its "class" (label) and "api" (text) columns.
catakOriginal = pd.read_csv("CatakOriginal.csv")
#catakPreprocessed = pd.read_csv("CatakPreprocessed.csv")

In [None]:
# Preview the first rows of the loaded frame.
catakOriginal.head()

In [None]:
# Number of samples per class (shows how balanced the labels are).
catakOriginal["class"].value_counts()

In [None]:
# One-hot encode the class labels. The encoder wants a 2-D input, so the
# single label column is reshaped to (n_samples, 1) first.
enc = OneHotEncoder()
class_column = catakOriginal["class"].to_numpy().reshape(-1, 1)
Y_sparse = enc.fit_transform(class_column)

In [None]:
# Categories learned by the encoder (the fitted label values).
enc.categories_

In [None]:
# Densify the one-hot labels. NOTE: later cells call `.argmax(1).A1` on slices
# of Y, which relies on this being a numpy matrix rather than a plain ndarray.
Y = Y_sparse.todense()

In [None]:
# Sanity check: (number of samples, number of classes).
Y.shape

In [None]:
# Passed to the Tokenizer as num_words and to the token embedding as its vocabulary size.
vocab_size = 800
# Length every sequence is padded/truncated to; also the model's input length.
maxlen = 500

In [None]:
# Raw "api" text column; it is tokenized and padded in the following cells.
X = catakOriginal.api

In [None]:
# Tokenizer capped with num_words=vocab_size.
tok = Tokenizer(num_words=vocab_size)

In [None]:
# Build the tokenizer's vocabulary from the api texts.
tok.fit_on_texts(X)

In [None]:
# Report how many distinct tokens the tokenizer recorded in its word index.
print('Found %s unique tokens.' % len(tok.word_index))

In [None]:
# Convert each text to its sequence of integer token ids.
# NOTE: X is rebound here from the text column to the id sequences.
X = tok.texts_to_sequences(X.values)

In [None]:
# Pad/truncate every sequence to maxlen so they form one 2-D array.
X = sequence.pad_sequences(X, maxlen=maxlen)

In [None]:
# Sanity check on the padded array's shape.
print('Shape of data tensor:', X.shape)


In [None]:
# Hold out 20% of the data as the final test set. The split is stratified on
# the one-hot labels and uses a fixed random_state so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2, random_state=75, stratify = Y)

In [None]:
# Model hyper-parameters.
embed_dim = 16  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Functional graph: token+position embedding -> one transformer block ->
# average pooling over the sequence axis -> dense classification head.
# `inputs` and `outputs` are wrapped into a keras.Model by the training cell.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(100, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# One softmax unit per one-hot label column.
outputs = layers.Dense(Y_train.shape[1], activation="softmax")(x)

In [None]:
from sklearn.utils import class_weight

In [None]:
# Integer class index of every training sample, recovered from the one-hot rows.
train_labels = list(Y_train.argmax(1).A1)
# 'balanced' weights for the classes present in the training labels.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

In [None]:
# Map class index -> weight, the dict form passed to model.fit(class_weight=...).
# Indexing by position assumes every class index 0..n_classes-1 occurs in Y_train.
weight = {i : class_weights[i] for i in range(Y_train.shape[1])}

In [None]:
# Display the per-class weights.
weight

In [None]:
# 10-fold stratified splitter used for cross-validation on the training set.
skf = StratifiedKFold(n_splits=10)

In [None]:
# Stratified k-fold cross-validation of the transformer classifier.
# For every fold: train a freshly initialised model, checkpoint the epoch with
# the best validation AUC, reload that checkpoint and record its validation
# metrics and the wall-clock training time.


def build_model(num_classes):
    """Build a new, untrained copy of the transformer classifier.

    A new set of layers is created on every call, so weights learned in one
    fold are never carried into the next. Wrapping one shared inputs/outputs
    graph in a new keras.Model would reuse the same already-trained layers.
    """
    model_inputs = layers.Input(shape=(maxlen,))
    hidden = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(model_inputs)
    hidden = TransformerBlock(embed_dim, num_heads, ff_dim)(hidden)
    hidden = layers.GlobalAveragePooling1D()(hidden)
    hidden = layers.Dropout(0.1)(hidden)
    hidden = layers.Dense(100, activation="relu")(hidden)
    hidden = layers.Dropout(0.1)(hidden)
    model_outputs = layers.Dense(num_classes, activation="softmax")(hidden)
    return keras.Model(inputs=model_inputs, outputs=model_outputs)


lossList = []
accList = []
f1List = []
aucList = []
tiList = []
mccList = []

num_classes = Y_train.shape[1]
checkpoint_dir = '../best_models/CatakOriginalTransformer'
# ModelCheckpoint does not create missing directories.
os.makedirs(checkpoint_dir, exist_ok=True)

for i, (train_index, val_index) in enumerate(skf.split(X_train, Y_train.argmax(1))):
    training_data_X = X_train[train_index]
    val_data_X = X_train[val_index]
    training_data_Y = Y_train[train_index]
    val_data_Y = Y_train[val_index]

    filepath = f'../best_models/CatakOriginalTransformer/best_model{i}.hdf5'
    es = EarlyStopping(monitor="val_auc", mode='max', verbose=1, patience=10)
    checkpoint = ModelCheckpoint(filepath, monitor="val_auc", verbose=1,
                                 save_best_only=True, mode='max')
    callbacks_list = [checkpoint, es]

    start_time = time.time()
    model = build_model(num_classes)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy',
                           tfa.metrics.F1Score(num_classes=num_classes, average="macro"),
                           tfa.metrics.MatthewsCorrelationCoefficient(num_classes=num_classes, name="mcc"),
                           AUC(multi_label=True, num_labels=num_classes, name="auc")])
    history = model.fit(training_data_X, training_data_Y, batch_size=128, epochs=20,
                        validation_data=(val_data_X, val_data_Y), verbose=1,
                        callbacks=callbacks_list, class_weight=weight)
    if i == 0:
        # The architecture is the same in every fold; print it once.
        model.summary()
    tiList.append(time.time() - start_time)

    # Score the best checkpoint of this fold, not the last epoch.
    model.load_weights(filepath)
    # evaluate() returns the loss followed by the metrics in compile() order.
    loss, acc, f1, mcc, auc = model.evaluate(val_data_X, val_data_Y)
    aucList.append(auc)
    lossList.append(loss)
    accList.append(acc)
    mccList.append(mcc)
    f1List.append(f1)

print(f'mean accuracy : {np.mean(accList)}, std accuracy : {np.std(accList)}')
print(f'mean loss : {np.mean(lossList)}, std loss : {np.std(lossList)}')
print(f'mean f1 : {np.mean(f1List)}, std f1 : {np.std(f1List)}')
print(f'mean mcc : {np.mean(mccList)}, std mcc : {np.std(mccList)}')
print(f'mean auc : {np.mean(aucList)}, std auc : {np.std(aucList)}')
print(f'mean time : {np.mean(tiList)}, std time : {np.std(tiList)}')

In [None]:
# Pick the fold whose best checkpoint reached the highest validation AUC
# (the first such fold on ties), load it and score it on the held-out test set.
best_index = aucList.index(max(aucList))
# Old non-ASCII spelling kept as an alias for any cell that still uses it.
bestİndex = best_index
filepath = f'../best_models/CatakOriginalTransformer/best_model{best_index}.hdf5'
model.load_weights(filepath)
model.evaluate(X_test, Y_test)

In [None]:
# Predicted class = index of the largest softmax probability.
y_test_pred_probs = model.predict(X_test)
y_test_pred = np.argmax(y_test_pred_probs, axis=1)
# Convert to a plain ndarray first: argmax on a numpy matrix returns an (n, 1)
# matrix, whereas confusion_matrix expects 1-D label arrays.
y_test_true = np.asarray(Y_test).argmax(axis=1)
cm = confusion_matrix(y_test_true, y_test_pred)

In [None]:
# Show the encoder's categories again, to compare with the display names below.
enc.categories_

In [None]:
# Display names for the confusion-matrix axes.
# NOTE(review): hard-coded; presumably meant to follow the order of enc.categories_ - confirm.
target_names = ["Adware","Backdoor","Downloader","Dropper","Spyware","Trojan","Virus","Worm"]


In [None]:
import numpy as np


def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (rows = true labels, columns = predicted labels)

    target_names: the class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to keep numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues (default: 'Blues')

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions of each true-label row
                  (rows without any sample are shown as 0)

    Returns
    -------
    None; the figure is shown with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import itertools

    import matplotlib.pyplot as plt
    import numpy as np

    counts = np.asarray(cm)

    # Accuracy always comes from the raw counts, before any normalisation.
    total = counts.sum()
    accuracy = np.trace(counts) / float(total) if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Row-wise proportions; rows with no samples stay 0 instead of NaN.
        row_totals = counts.sum(axis=1, keepdims=True)
        shown = np.divide(counts.astype(float), row_totals,
                          out=np.zeros(counts.shape, dtype=float),
                          where=row_totals != 0)
    else:
        shown = counts

    plt.figure(figsize=(8, 6))
    # Draw the same matrix that is annotated below, so that the colours, the
    # colour bar and the text-colour threshold all refer to the same values.
    plt.imshow(shown, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so it stays readable.
    thresh = shown.max() / 1.5 if normalize else shown.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(shown.shape[0]), range(shown.shape[1])):
        plt.text(j, i, cell_format.format(shown[i, j]),
                 horizontalalignment="center",
                 color="white" if shown[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()

In [None]:
# Plot the test-set confusion matrix with raw counts (normalize=False) and the default colormap.
plot_confusion_matrix(cm,
                      target_names,
                      title='Confusion matrix',
                      cmap=None,
                      normalize=False)