# Using Support Vector Machines for malware classification

In this notebook we will experiment with SVM (Support Vector Machine) classifiers.

Let's start by importing libraries.

In [None]:
%load_ext autoreload
%autoreload 2

from sklearn.decomposition import IncrementalPCA
from collections import defaultdict, Counter
from sklearn.metrics import f1_score
from sklearn.externals import joblib
from preprocessing import pp_action
from helpers import loader_tfidf
from utilities import constants
import plotly.graph_objs as go
import plotly.offline as ply
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import random
import json
import os

In [None]:
# Load the run configuration and the dataset metadata. Context managers make
# sure every JSON file handle is closed as soon as it has been parsed.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Initialise Plotly's offline notebook mode.
ply.init_notebook_mode(connected=True)

# Number of samples loaded per batch by the helpers below.
load_batch_size = 1100

## Data selection

Select a subset of the original dataset. The selected subset will then be split into training, development (dev), and testing sets.

In [None]:
# Run the project's pre-processing step driven by `config`. The result is used
# below as a table with 'selected', 'train', 'dev' and 'test' flag columns.
samples_data = pp_action.pre_process(config)
# NOTE(review): presumably displays a summary of the split — confirm in pp_action.
pp_action.split_show_data(samples_data)

In [None]:
# Boolean row masks, one per flag column (a row belongs to a split when its flag is 1).
is_selected = samples_data['selected'].eq(1)
is_train = samples_data['train'].eq(1)
is_dev = samples_data['dev'].eq(1)
is_test = samples_data['test'].eq(1)

# Sample identifiers: the index entries of the rows in each split.
uuids = samples_data.index[is_selected].tolist()
x_train = samples_data.index[is_train].tolist()
x_dev = samples_data.index[is_dev].tolist()
x_test = samples_data.index[is_test].tolist()

# Family labels (fam_num) of the same rows, in the same order as the ids above.
y_train = samples_data['fam_num'][is_train].tolist()
y_dev = samples_data['fam_num'][is_dev].tolist()
y_test = samples_data['fam_num'][is_test].tolist()

## Dimensionality Reduction

We would also like this approach to be scalable to the entire balanced dataset so we will load sparse representations of the data vectors.

To achieve this we will use Principal Component Analysis (PCA) in order to operate on the sparse vectors. Let's define two helper functions first.

In [None]:
def train_pca(config, i_pca, samples, load_batch_size):
    """Fit an incremental PCA model on ``samples`` one batch at a time.

    Consecutive slices of at most ``load_batch_size`` sample ids are loaded
    through ``loader_tfidf.load_tfidf`` (``dense=True, ordered=False``) and
    each loaded batch is passed to ``i_pca.partial_fit``.

    Args:
        config: Configuration object forwarded to ``loader_tfidf.load_tfidf``.
        i_pca: Model exposing ``partial_fit`` (e.g. ``IncrementalPCA``).
        samples: Sliceable sequence of sample identifiers.
        load_batch_size: Number of samples per batch; must be a positive integer.

    Returns:
        None. The model is updated through its ``partial_fit`` method.

    Raises:
        ValueError: If ``load_batch_size`` is not positive (the batch cursor
            would never advance past the end of ``samples``).
    """
    if load_batch_size <= 0:
        raise ValueError('load_batch_size must be a positive integer, got {!r}'.format(load_batch_size))

    # NOTE(review): the final batch may hold fewer samples than the others; some
    # scikit-learn versions reject a partial_fit batch smaller than n_components
    # — confirm against the installed version.
    for start in range(0, len(samples), load_batch_size):
        batch_ids = samples[start:start + load_batch_size]
        data = loader_tfidf.load_tfidf(config, batch_ids, dense=True, ordered=False)
        i_pca.partial_fit(data)

In [None]:
def transform_data(config, i_pca, samples, load_batch_size):
    """Project ``samples`` with a fitted PCA model, one batch at a time.

    Consecutive slices of at most ``load_batch_size`` sample ids are loaded
    through ``loader_tfidf.load_tfidf`` (``dense=True, ordered=True``),
    transformed with ``i_pca.transform`` and stacked in input order.

    Args:
        config: Configuration object forwarded to ``loader_tfidf.load_tfidf``.
        i_pca: Fitted model exposing ``transform`` (e.g. ``IncrementalPCA``).
        samples: Sliceable sequence of sample identifiers.
        load_batch_size: Number of samples per batch; must be a positive integer.

    Returns:
        numpy.ndarray: The transformed batches concatenated along the first
        axis. For empty ``samples`` an array with zero rows is returned; its
        column count is ``i_pca.n_components_`` when that attribute exists,
        otherwise 0.

    Raises:
        ValueError: If ``load_batch_size`` is not positive (the batch cursor
            would never advance past the end of ``samples``).
    """
    if load_batch_size <= 0:
        raise ValueError('load_batch_size must be a positive integer, got {!r}'.format(load_batch_size))

    new_data = []
    for start in range(0, len(samples), load_batch_size):
        batch_ids = samples[start:start + load_batch_size]
        data = loader_tfidf.load_tfidf(config, batch_ids, dense=True, ordered=True)
        new_data.append(i_pca.transform(data))

    if not new_data:
        # np.concatenate refuses an empty list, so build the empty result explicitly.
        return np.empty((0, getattr(i_pca, 'n_components_', 0)))

    return np.concatenate(new_data)

In [None]:
# Incremental PCA keeping 1024 components; its batch_size is set to the same
# value used for loading batches (load_batch_size).
i_pca = IncrementalPCA(n_components=1024, batch_size=load_batch_size)

We will train the PCA algorithm incrementally, using only the training dataset.

In [None]:
# Shuffle the training ids with a fixed seed so that the batch order fed to the
# PCA is the same on every run (the global `random` state is left untouched).
shuffled_train = random.Random(42).sample(x_train, len(x_train))
train_pca(config, i_pca, shuffled_train, load_batch_size)

# Persist the fitted model so later sessions can reload it instead of refitting.
joblib.dump(i_pca, 'temp_pca_1000.pkl')

In [None]:
# Alternatively, load a previously trained PCA model from 'temp_pca_1000.pkl'
# if it is available, instead of retraining.
# NOTE(review): joblib.load unpickles the file — only load files you trust.
i_pca = joblib.load('temp_pca_1000.pkl') 

In [None]:
# Sum of the per-component explained-variance ratios of the fitted PCA.
print(i_pca.explained_variance_ratio_.sum()) 

Then we will use the trained algorithm to (incrementally) transform all the data vectors. This will allow us to transform a dataset larger than what would fit in RAM.

In [None]:
# Project each split with the fitted PCA, in train / dev / test order.
X_train, X_dev, X_test = [
    transform_data(config, i_pca, split_ids, load_batch_size)
    for split_ids in (x_train, x_dev, x_test)
]

## Labels acquisition

Let's store the true labels somewhere we can find them when needed.

In [None]:
# Distinct training labels in ascending order.
classes = list(set(y_train))
classes.sort()
n_classes = len(classes)

# Map every original label to its position in `classes` (0 .. n_classes - 1).
classes_dict = {label: position for position, label in enumerate(classes)}

# Encode each split's labels with that mapping.
Y_train = np.array(list(map(classes_dict.__getitem__, y_train)))
Y_dev = np.array(list(map(classes_dict.__getitem__, y_dev)))
Y_test = np.array(list(map(classes_dict.__getitem__, y_test)))

In [None]:
# Report the shape of every feature matrix and label vector, one line per array.
shape_lines = [
    "X_train shape: " + str(X_train.shape),
    "Y_train shape: " + str(Y_train.shape),
    "X_dev shape: " + str(X_dev.shape),
    "Y_dev shape: " + str(Y_dev.shape),
    "X_test shape: " + str(X_test.shape),
    "Y_test shape: " + str(Y_test.shape),
]
print("\n".join(shape_lines))

# Classification

Once the dataset is ready, we use the scikit-learn implementation of the SVM classifier to classify our data points.

Let's try different approaches. First we try an RBF (Radial Basis Function) kernel.

In [None]:
# RBF-kernel SVM; only the kernel and the random seed (42) are set explicitly.
svc_r = SVC(kernel='rbf', random_state=42)

In [None]:
# Fit the RBF SVM on the PCA-reduced training features and encoded labels.
svc_r.fit(X_train, Y_train)

In [None]:
# Predict labels for the training and dev features with the fitted RBF SVM.
train_labels_r = svc_r.predict(X_train)
dev_labels_r = svc_r.predict(X_dev)

In [None]:
# Micro-averaged F1 of the RBF SVM predictions against the true labels.
train_score_r = f1_score(Y_train, train_labels_r, average='micro')
dev_score_r = f1_score(Y_dev, dev_labels_r, average='micro')

In [None]:
# Show the RBF SVM scores for the train and dev sets.
print('F1 score on train set: {}'.format(train_score_r))
print('F1 score on dev set: {}'.format(dev_score_r))

This kind of performance is not really what we are looking for. Let's see if something changes when we tune the classification parameters.

In [None]:
# Candidate values for the RBF SVM's C and gamma parameters.
C_2d_range = [1e-2, 1, 1e2]
gamma_2d_range = [1e-1, 1, 1e1]

# Fit one RBF SVM per (C, gamma) pair, visiting every gamma for each C in turn,
# and keep each fitted model together with its parameters.
classifiers = []
for C, gamma in ((c_value, g_value) for c_value in C_2d_range for g_value in gamma_2d_range):
    # fit() returns the estimator itself, so construction and fitting can be chained.
    clf = SVC(C=C, gamma=gamma, kernel='rbf', random_state=42).fit(X_train, Y_train)
    classifiers.append((C, gamma, clf))

In [None]:
# Score every (C, gamma, model) entry on the train and dev sets.
for c_c, c_g, c_s in classifiers:
    # Training split: predict, then micro-averaged F1.
    train_labels = c_s.predict(X_train)
    train_score = f1_score(Y_train, train_labels, average='micro')

    # Dev split: predict, then micro-averaged F1.
    dev_labels = c_s.predict(X_dev)
    dev_score = f1_score(Y_dev, dev_labels, average='micro')

    print('SVM with C = {}, gamma = {}'.format(c_c, c_g))
    print('F1 score on train set: {}'.format(train_score))
    print('F1 score on dev set: {}'.format(dev_score))

The performance on the dev set with an RBF kernel does not look very promising. Let's try a linear kernel instead.

In [None]:
# Linear-kernel SVM; only the kernel and the random seed (42) are set explicitly.
svc_l = SVC(kernel='linear', random_state=42)

In [None]:
# Fit the linear SVM on the PCA-reduced training features and encoded labels.
svc_l.fit(X_train, Y_train)

In [None]:
# Predict labels for the training and dev features with the fitted linear SVM.
train_labels_l = svc_l.predict(X_train)
dev_labels_l = svc_l.predict(X_dev)

In [None]:
# Micro-averaged F1 of the linear SVM predictions against the true labels.
train_score_l = f1_score(Y_train, train_labels_l, average='micro')
dev_score_l = f1_score(Y_dev, dev_labels_l, average='micro')

In [None]:
# Show the linear SVM scores for the train and dev sets.
print('F1 score on train set: {}'.format(train_score_l))
print('F1 score on dev set: {}'.format(dev_score_l))

This is definitely a more promising result. Let's see if we can make it better by modifying the C parameter.

In [None]:
# Candidate values for the linear SVM's C parameter.
C_range = [1e-2, 1, 1e2]

# Fit one linear SVM per C value and keep each fitted model with its C.
classifiers = []
for C in C_range:
    # fit() returns the estimator itself, so construction and fitting can be chained.
    clf = SVC(C=C, kernel='linear', random_state=42).fit(X_train, Y_train)
    classifiers.append((C, clf))

In [None]:
# Score every (C, model) entry on the train and dev sets.
for c_c, c_s in classifiers:
    # Training split: predict, then micro-averaged F1.
    train_labels = c_s.predict(X_train)
    train_score = f1_score(Y_train, train_labels, average='micro')

    # Dev split: predict, then micro-averaged F1.
    dev_labels = c_s.predict(X_dev)
    dev_score = f1_score(Y_dev, dev_labels, average='micro')

    print('SVM with C = {}'.format(c_c))
    print('F1 score on train set: {}'.format(train_score))
    print('F1 score on dev set: {}'.format(dev_score))

It seems like C = 1 is the best value for C. Let's see what the score is on our test set.

In [None]:
# Final evaluation: predict the held-out test set with the linear SVM fitted
# earlier (svc_l) and report its micro-averaged F1.
test_labels_l = svc_l.predict(X_test)
test_score_l = f1_score(Y_test, test_labels_l, average='micro')
# The message names the test set: the value printed is the test score, not the train score.
print('F1 score on test set: {}'.format(test_score_l))