In [None]:
pip install tensorflow-addons

In [None]:
import pandas as pd
import numpy as np
import keras
import keras.backend as K
import tensorflow as tf
import tensorflow_addons as tfa
import os
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Dropout, Embedding, Flatten
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from tensorflow.keras.metrics import AUC, Precision, Recall

In [None]:
# Switch the working directory to the dataset folder; every later relative path
# (the CSV below and the `../best_models/...` checkpoints) is resolved from here.
# NOTE(review): `root_path` is relative, so running this cell a second time in
# the same kernel resolves it against the already-changed working directory.
root_path = "../datasets/"
os.chdir(root_path)

In [None]:
# Load the dataset; the `api` and `class` columns are the ones read later on.
catakOriginal = pd.read_csv("CatakOriginal.csv")
# Alternative input file, currently disabled:
#catakPreprocessed = pd.read_csv("CatakPreprocessed.csv")

In [None]:
# Preview the first rows of the loaded frame.
catakOriginal.head()

In [None]:
# Number of samples per value of the `class` column.
catakOriginal["class"].value_counts()

In [None]:
# One-hot encode the `class` column. The column is reshaped to (n_samples, 1)
# because the encoder works on 2-D input; `enc` is kept so the category order
# can be inspected afterwards.
enc = OneHotEncoder()
Y_sparse = enc.fit_transform(np.array(catakOriginal["class"]).reshape((-1,1)))

In [None]:
# Category values found by the encoder.
enc.categories_

In [None]:
# Densify the encoded labels.
# NOTE(review): later cells call `.A1` on `Y_train.argmax(1)`, so they appear to
# rely on the type returned by `todense()` - confirm before switching to `toarray()`.
Y = Y_sparse.todense()

In [None]:
Y.shape

In [None]:
max_words = 800
max_len = 500

In [None]:
X = catakOriginal.api

In [None]:
tok = Tokenizer(num_words=max_words)

In [None]:
tok.fit_on_texts(X)

In [None]:
print('Found %s unique tokens.' % len(tok.word_index))

In [None]:
X = tok.texts_to_sequences(X.values)

In [None]:
X = sequence.pad_sequences(X, maxlen=max_len)

In [None]:
print('Shape of data tensor:', X.shape)


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2, random_state=75, stratify = Y)

In [None]:
def new_malware_model(act_func = "tanh"):
    """Build the (uncompiled) stacked-LSTM classifier.

    The network is Embedding -> LSTM -> Dropout -> LSTM -> Flatten -> softmax
    Dense. Vocabulary size, sequence length and the number of output units are
    read from the notebook-level `max_words`, `max_len` and `Y_train`.

    Arguments
    ---------
    act_func: activation passed to both LSTM layers

    Returns
    -------
    A `Sequential` model; compiling it is left to the caller.
    """
    n_outputs = Y_train.shape[1]
    layer_stack = [
        Embedding(max_words, 16, input_length = max_len),
        LSTM(100, return_sequences = True, activation = act_func),
        Dropout(0.2),
        LSTM(100, return_sequences = True, activation = act_func),
        # Both LSTMs return the full sequence, so flatten before the classifier.
        Flatten(),
        Dense(n_outputs, name = "out_layer", activation= "softmax"),
    ]
    return Sequential(layer_stack)

In [None]:
from sklearn.utils import class_weight

In [None]:
class_weights = class_weight.compute_class_weight(class_weight = 'balanced',
                                                 classes = np.unique(list(Y_train.argmax(1).A1)),
                                                 y = list(Y_train.argmax(1).A1))

In [None]:
class_weights

In [None]:
weight = {i : class_weights[i] for i in range(Y_train.shape[1])}

In [None]:
skf = StratifiedKFold(n_splits=10)

In [None]:
# Per-fold results; position i in every list belongs to fold i, whose best
# checkpoint is stored as best_model{i}.hdf5.
lossList, accList, f1List, aucList, tiList, mccList = [], [], [], [], [], []
n_classes = Y_train.shape[1]
# StratifiedKFold needs one integer label per sample; np.asarray keeps this
# working whether Y_train is an np.matrix or a plain ndarray.
fold_labels = np.asarray(Y_train).argmax(axis=1)
for i, (train_index, val_index) in enumerate(skf.split(X_train, fold_labels)):
  training_data_X = X_train[train_index]
  val_data_X = X_train[val_index]
  training_data_Y = Y_train[train_index]
  val_data_Y = Y_train[val_index]
  filepath = f'../best_models/CatakOriginal/best_model{i}.hdf5'
  # Stop after 10 epochs without a val_auc improvement and keep only the
  # checkpoint with the highest val_auc.
  es = EarlyStopping(monitor= "val_auc", mode='max', verbose=1, patience = 10)
  checkpoint = ModelCheckpoint(filepath, monitor= "val_auc", verbose=1, save_best_only=True, mode='max')
  callbacks_list = [checkpoint,es]
  # Timing covers model construction, compilation and training of this fold.
  start_time = time.time()
  model = new_malware_model()
  # The order of `metrics` fixes the order of the values unpacked from
  # model.evaluate() below: loss, accuracy, f1, mcc, auc.
  model.compile(loss = 'categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy',
                         tfa.metrics.F1Score(num_classes=n_classes,average="macro"),
                         tfa.metrics.MatthewsCorrelationCoefficient(num_classes=n_classes, name= "mcc"),
                         AUC(multi_label = True, num_labels = n_classes,name="auc")])
  history = model.fit(training_data_X, training_data_Y, batch_size=128, epochs=20,
                    validation_data=(val_data_X,val_data_Y), verbose=1, callbacks = callbacks_list, class_weight=weight)
  # The architecture is the same for every fold, so print it only once.
  if i == 0:
    model.summary()
  tiList.append(time.time() - start_time)
  # Score the best checkpoint of this fold rather than the weights the model
  # happens to hold when training stops.
  model.load_weights(filepath)
  loss, acc, f1, mcc, auc = model.evaluate(val_data_X,val_data_Y)
  aucList.append(auc)
  lossList.append(loss)
  accList.append(acc)
  mccList.append(mcc)
  f1List.append(f1)
# Cross-validation summary: mean and standard deviation over the folds.
print(f'mean accuracy : {np.mean(accList)}, std accuracy : {np.std(accList)}')
print(f'mean loss : {np.mean(lossList)}, std loss : {np.std(lossList)}')
print(f'mean f1 : {np.mean(f1List)}, std f1 : {np.std(f1List)}')
print(f'mean mcc : {np.mean(mccList)}, std mcc : {np.std(mccList)}')
print(f'mean auc : {np.mean(aucList)}, std auc : {np.std(aucList)}')
print(f'mean time : {np.mean(tiList)}, std time : {np.std(tiList)}')

In [None]:
# Index of the fold with the highest validation AUC (first one on ties).
bestİndex = aucList.index(max(aucList))
filepath = f'../best_models/CatakOriginal/best_model{bestİndex}.hdf5'
# `model` is the network left over from the last cross-validation fold; only its
# weights are replaced by the best fold's checkpoint before scoring the test set.
model.load_weights(filepath)
model.evaluate(X_test,Y_test)

In [None]:
# Softmax outputs -> predicted class index -> confusion matrix against the
# true class indices of the held-out test set.
y_test_pred_probs = model.predict(X_test)
y_test_pred = np.argmax(y_test_pred_probs, axis = 1)
cm = confusion_matrix(Y_test.argmax(1), y_test_pred)

In [None]:
enc.categories_

In [None]:
# Display names for the confusion-matrix axes.
# NOTE(review): presumably listed in the same order as `enc.categories_` - verify.
target_names = ["Adware","Backdoor","Downloader","Dropper","Spyware","Trojan","Virus","Worm "]

In [None]:
import numpy as np


def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts is accepted)

    target_names: class names used as tick labels on both axes,
                  for example: ['high', 'medium', 'low'];
                  pass None to keep the default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues; 'Blues' is used when None

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its row
                  sum; a row that sums to zero is shown as all zeros)

    Returns
    -------
    None. The figure is displayed with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    # Accuracy is always derived from the raw counts, never from proportions.
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else 0.0
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalize BEFORE drawing so the heat map, the colorbar, the cell
        # annotations and the text-colour threshold all describe the same
        # values. `where` skips rows without samples, which would otherwise
        # produce 0/0 = nan.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so the number stays legible.
    peak = cm.max() if cm.size else 0
    thresh = peak / 1.5 if normalize else peak / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [None]:
# Plot the test-set confusion matrix as raw counts with the default colormap.
plot_confusion_matrix(cm=cm, target_names=target_names,
                      title='Confusion matrix', cmap=None, normalize=False)