# Using Deep Averaging Networks for malware classification


In this notebook we will experiment with the concept of Deep Averaging Networks in our malware classification setting.

Let's start by loading some packages necessary for the experiment.

In [None]:
%load_ext autoreload
%autoreload 2

from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict, Counter
from preprocessing import pp_action
from helpers import loader_tfidf
from utilities import constants
import plotly.graph_objs as go
import plotly.offline as ply
import tensorflow as tf
import pandas as pd
import numpy as np
import json
import os

In [None]:
def _read_json(path):
    """Parse the JSON file at `path`, closing the file handle once it is read."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Experiment configuration plus the two JSON files located under constants.dir_d.
config = _read_json('config.json')
uuids_family = _read_json(os.path.join(constants.dir_d, constants.json_labels))
words = _read_json(os.path.join(constants.dir_d, constants.json_words))

# Enable inline plotly rendering for the figures drawn later in the notebook.
ply.init_notebook_mode(connected=True)

## Data selection

Select a subset of the original dataset. Then the selected subset will be split into a training and a testing set.


In [None]:
# Build the per-sample table from the experiment configuration. The next cell
# reads its 'selected', 'train', 'test' and 'fam_num' columns.
samples_data = pp_action.pre_process(config)
# NOTE(review): presumably displays a summary of the train/test split; the
# helper is defined outside this notebook, so confirm what it prints.
pp_action.split_show_data(samples_data)

In [None]:
# Index values (sample uuids) of every selected sample and of each split.
uuids = samples_data[samples_data['selected'] == 1].index.tolist()
x_train = samples_data[samples_data['train'] == 1].index.tolist()
x_test = samples_data[samples_data['test'] == 1].index.tolist()

# Family labels of the same rows, in the same order as x_train / x_test.
y_train = samples_data.loc[samples_data['train'] == 1, 'fam_num'].tolist()
y_test = samples_data.loc[samples_data['test'] == 1, 'fam_num'].tolist()

## Dimensionality Reduction

Since the DAN requires a considerable amount of time to train, we will try reducing the dimensionality of the dataset.

We would also like this approach to be scalable to the entire balanced dataset so we will load sparse representations of the data vectors.

To achieve this we will use Singular Value Decomposition, which can operate directly on the sparse vectors.


In [None]:
# Load the tf-idf vectors of each split. dense=False asks for the sparse
# representation mentioned above.
# NOTE(review): ordered=True presumably keeps the rows in the order of the uuid
# lists so that they line up with y_train / y_test — confirm in loader_tfidf.
train = loader_tfidf.load_tfidf(config, x_train, dense=False, ordered=True)
test = loader_tfidf.load_tfidf(config, x_test, dense=False, ordered=True)

In [None]:
# Keep 1000 components; random_state is fixed so the decomposition is repeatable.
tsvd = TruncatedSVD(n_components=1000, random_state=42)

In [None]:
# Fit on the training split only, then report the share of the variance that
# the retained components explain.
tsvd.fit(train)
print(tsvd.explained_variance_ratio_.sum())  

In [None]:
# Project both splits with the decomposition fitted on the training data.
X_train = tsvd.transform(train)
X_test = tsvd.transform(test)

In [None]:
# Divide every sample vector by the number of features, then transpose so the
# samples run along axis 1 (the layout the network placeholders expect).
# NOTE: this cell is not idempotent - running it twice rescales and transposes
# the matrices again.
X_train /= X_train.shape[1]
X_train = X_train.T

X_test /= X_test.shape[1]
X_test = X_test.T

## Labels pre-processing

We will initially convert the true labels into a one-hot vector representation.

In [None]:
# Distinct family labels seen in the training split, in ascending order.
classes = sorted(set(y_train))
n_classes = len(classes)

# Map each original family label to a contiguous index 0 .. n_classes - 1.
classes_dict = {family: position for position, family in enumerate(classes)}

# Re-encode both label lists; a test label that never occurs in the training
# split raises KeyError here.
y_train = [classes_dict[family] for family in y_train]
y_test = [classes_dict[family] for family in y_test]

In [None]:
lb = LabelBinarizer()

# Fit the binarizer on the training labels only and reuse that fitted encoding
# for the test labels. Re-fitting on y_test would derive the columns from the
# classes present in the test split, so Y_test could end up with fewer (or
# differently ordered) rows than Y_train.
# NOTE(review): with exactly two classes LabelBinarizer emits a single column,
# which would give the network a single output unit - confirm this case never
# occurs.
Y_train = lb.fit_transform(y_train).T
Y_test = lb.transform(y_test).T

In [None]:
# Sanity check: features and labels must agree on the number of samples (axis 1).
print("X_train shape: {}".format(X_train.shape))
print("Y_train shape: {}".format(Y_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("Y_test shape: {}".format(Y_test.shape))

## Setting the Hyper-parameters

Let's set the hyper-parameters; we will start with a small, fast network.

In [None]:
# --- Optimisation ---
learning_rate = 0.0005
n_epochs = 1500
minibatch_size = 256
keep_probs = 0.7  # dropout keep probability fed while training

# --- Topology ---
# `ls` lists, layer by layer, the weight shape [units_out, units_in] followed
# by the bias shape [units_out, 1]; n_h_layers is the number of such pairs.
# Current network: input -> 100 -> 80 -> n_classes.
# Smaller variants tried earlier:
#   input -> 24 -> 12 -> n_classes   (n_h_layers = 3)
#   input -> 12 -> n_classes         (n_h_layers = 2)
n_h_layers = 3
ls = [
    [100, X_train.shape[0]], [100, 1],
    [80, 100], [80, 1],
    [Y_train.shape[0], 80], [Y_train.shape[0], 1],
]

## Model definition

Each hidden layer applies a ReLU activation followed by dropout; the outputs of the last layer are passed through a softmax when the cost is computed.

First let's define some placeholders for the input X and the labels Y

In [None]:
def init_ph(n_feats, n_classes):
    """Create the graph inputs for the features, the labels and the dropout rate.

    Both data placeholders fix the size of axis 0 (``n_feats`` and
    ``n_classes`` respectively) and leave axis 1 open (``None``), so a batch
    of any size can be fed.

    Returns:
        A tuple ``(X, Y, keep_prob)`` of float32 placeholders; ``keep_prob``
        is created without a shape.
    """
    with tf.device('/gpu:0'):
        features = tf.placeholder(shape=(n_feats, None), dtype=tf.float32)
        labels = tf.placeholder(shape=(n_classes, None), dtype=tf.float32)
        dropout_keep = tf.placeholder(tf.float32)

    return features, labels, dropout_keep

Then we initialize the weights using the Xavier initialization method.

In [None]:
def init_weights(n_layers, layer_sizes):
    """Create the weight and bias variables of every layer.

    Args:
        n_layers: number of weight/bias pairs to create.
        layer_sizes: flat list of shapes; entry ``2 * k`` is the shape of the
            weights of layer ``k`` and entry ``2 * k + 1`` the shape of its
            bias.

    Returns:
        Dict mapping ``'W<k>'`` / ``'b<k>'`` to the created variables. Weights
        use the Xavier initializer (seed 1), biases start at zero.
    """
    weights = {}

    with tf.device('/gpu:0'):
        for layer in range(n_layers):
            w_name = 'W%d' % layer
            b_name = 'b%d' % layer

            weights[w_name] = tf.get_variable(
                w_name,
                layer_sizes[2 * layer],
                initializer=tf.contrib.layers.xavier_initializer(seed=1),
            )
            weights[b_name] = tf.get_variable(
                b_name,
                layer_sizes[2 * layer + 1],
                initializer=tf.zeros_initializer(),
            )

    return weights

Forward propagation

In [None]:
def fwd(X, params, keep_prob):
    """Build the forward pass and return the output of the last layer.

    Args:
        X: input tensor with the samples along axis 1.
        params: dict holding one ``'W<i>'`` and one ``'b<i>'`` entry per
            layer, as produced by ``init_weights``.
        keep_prob: dropout keep probability applied after each hidden ReLU.

    Returns:
        The pre-activation output ``W @ A + b`` of the last layer (no ReLU,
        no dropout, no softmax), or ``None`` when ``params`` is empty.
    """
    # Count the layers from the parameters actually passed in rather than
    # from the notebook-level ``n_h_layers``, so the function only depends on
    # its own arguments.
    n_layers = len(params) // 2
    Zn = None

    with tf.device('/gpu:0'):
        An = X

        for i in range(n_layers):
            Wn = 'W{}'.format(i)
            bn = 'b{}'.format(i)

            Zn = tf.add(tf.matmul(params[Wn], An), params[bn])

            # Only hidden layers get an activation; the output of the last
            # layer is returned as is, so no unused ops are built for it.
            if i < n_layers - 1:
                An = tf.nn.dropout(tf.nn.relu(Zn), keep_prob)

    return Zn


Cost function

In [None]:
def compute_cost(Zn, Y):
    """Return the mean softmax cross-entropy between the logits and the labels.

    Both ``Zn`` and ``Y`` are transposed before being handed to
    ``tf.nn.softmax_cross_entropy_with_logits``; the per-sample losses are then
    averaged.
    """
    with tf.device('/gpu:0'):
        per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=tf.transpose(Zn),
            labels=tf.transpose(Y),
        )
        mean_loss = tf.reduce_mean(per_sample_loss)

    return mean_loss

And finally, the DAN model

In [None]:
def dan(X_train, Y_train, X_test, Y_test, learning_rate, num_epochs, minibatch_size, n_h_layers, layers, keep_probs):
    """Build, train and evaluate the feed-forward network.

    Args:
        X_train, X_test: feature matrices with the samples along axis 1.
        Y_train, Y_test: one-hot label matrices with the samples along axis 1.
        learning_rate: learning rate handed to the Adam optimizer.
        num_epochs: number of passes over the shuffled training set.
        minibatch_size: number of samples per optimisation step.
        n_h_layers: number of weight/bias pairs, forwarded to ``init_weights``.
        layers: variable shapes, in the order ``init_weights`` expects.
        keep_probs: dropout keep probability fed during training; the final
            evaluation always feeds 1.0.

    Returns:
        ``(params, costs, tr_acc, ts_acc)``: the dict of TensorFlow variables,
        the list of recorded epoch costs, and the train / test accuracy.

    Raises:
        ValueError: if ``X_train`` contains no samples.

    Side effects:
        Resets the default graph, prints the cost every 100 epochs and both
        accuracies, and appends the epoch cost every 5 epochs to the
        module-level list ``costs``, which must exist before the call.

    NOTE(review): ``params`` holds the variables created by ``init_weights``,
    not their trained values, and the session is closed on return.
    """
    n_samples = X_train.shape[1]
    if n_samples == 0:
        raise ValueError("X_train contains no samples")

    # Reset before entering the device scope so that the scope is opened on
    # the fresh graph, and carry the graph-level seed over the reset so a
    # seed set by the caller is not thrown away with the old graph.
    graph_seed = tf.get_default_graph().seed
    tf.reset_default_graph()
    if graph_seed is not None:
        tf.set_random_seed(graph_seed)

    with tf.device('/gpu:0'):

        X, Y, keep_prob = init_ph(X_train.shape[0], Y_train.shape[0])

        params = init_weights(n_h_layers, layers)

        Z = fwd(X, params, keep_prob)

        cost = compute_cost(Z, Y)

        optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

        init = tf.global_variables_initializer()

        # Z and Y keep the classes along axis 0, so the default argmax axis
        # yields one predicted / true class per sample.
        correct_prediction = tf.equal(tf.argmax(Z), tf.argmax(Y))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    # allow_soft_placement lets TensorFlow fall back to another device when
    # an op cannot be placed on the requested '/gpu:0'.
    session_config = tf.ConfigProto(
        log_device_placement=True,
        allow_soft_placement=True
    )

    with tf.Session(config=session_config) as sess:

        # Only full minibatches are used, but always run at least one step so
        # a training set smaller than minibatch_size is still trained on.
        num_minibatches = max(1, n_samples // minibatch_size)

        sess.run(init)

        for epoch in range(num_epochs):
            epoch_cost = 0.

            # Fresh shuffle of the sample (column) indices for every epoch.
            minibatch_idxs = np.random.permutation(n_samples)

            for i in range(num_minibatches):
                batch_idxs = minibatch_idxs[i * minibatch_size : (i + 1) * minibatch_size]

                minibatch_X = np.take(X_train, batch_idxs, axis=1)
                minibatch_Y = np.take(Y_train, batch_idxs, axis=1)

                _ , minibatch_cost = sess.run(
                    [optimizer, cost],
                    feed_dict={
                        X: minibatch_X,
                        Y: minibatch_Y,
                        keep_prob: keep_probs
                    }
                )

                epoch_cost += minibatch_cost / num_minibatches

            if epoch % 100 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            if epoch % 5 == 0:
                # `costs` is the module-level list created by the caller.
                costs.append(epoch_cost)

        # Dropout is disabled (keep_prob = 1.0) for the evaluation.
        tr_acc = accuracy.eval({X: X_train, Y: Y_train, keep_prob: 1.0})
        ts_acc = accuracy.eval({X: X_test, Y: Y_test, keep_prob: 1.0})

        print ("Train Accuracy:",tr_acc)
        print ("Test Accuracy:", ts_acc)

    return params, costs, tr_acc, ts_acc
        
        

In [None]:
# Filled in place by dan(), which appends the epoch cost every 5 epochs.
costs = []

# Seed after the reset: tf.set_random_seed applies to the current default
# graph, so seeding before a reset seeds a graph that is then thrown away.
tf.reset_default_graph()
tf.set_random_seed(1)

# dan() shuffles the minibatches with np.random.permutation, so NumPy needs a
# seed of its own for the run to be repeatable.
np.random.seed(1)

parameters, cost_list, tr_acc, ts_acc = dan(
    X_train,
    Y_train,
    X_test,
    Y_test,
    learning_rate,
    n_epochs,
    minibatch_size,
    n_h_layers,
    ls,
    keep_probs
)

In [None]:
# Training cost curve. dan() records the cost once every 5 epochs, so position
# k on the x axis corresponds to epoch 5 * k.
trace = go.Scatter(
    x = np.arange(len(costs)),
    y = costs
)
ply.iplot([trace], filename='costs')