# Birch clustering result analysis

Let's start by loading up some libraries and static data that may be useful in the next steps.

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from visualization import vis_data, vis_cluster
from collections import defaultdict, Counter
from utilities import constants, evaluation
from preprocessing import pp_action
from clustering import clu_birch
import plotly.graph_objs as go
import plotly.offline as ply
import pandas as pd
import numpy as np
import json
import os

In [3]:
# Load the run configuration and the two JSON files named by
# constants.json_labels / constants.json_words. Each file is opened in a
# `with` block so its handle is closed as soon as the content is parsed
# (the previous bare `open(...)` calls leaked the handles).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Set up plotly's offline notebook mode before any figure is drawn.
ply.init_notebook_mode(connected=True)

In [4]:
# Hyper-parameter grid for the Birch sweep: test_params pairs every threshold
# with every branching factor.
thresholds = [50, 100, 150, 300, 500, 1000, 2000, 3000]
# Same candidate values, kept as an independent list.
branching_factors = list(thresholds)

## Data selection

Select a subset of the original dataset. Then the selected subset will be split into a training and a testing set.


In [5]:
# Run the project's pre-processing step on the loaded configuration, then
# split/show the resulting samples.
# NOTE(review): this cell is interactive. In the recorded run it asked which
# subset to work on (answered 'k') and whether to compute the Jensen-Shannon
# distance matrix (answered 'n'); those answers determine the selection used
# by the rest of the notebook. It is not visible from here which of the two
# calls issues the prompts.
samples_data = pp_action.pre_process(config)
pp_action.split_show_data(samples_data)

Please choose the subset of data to workon on:
l for all labeled samples
k for samples of families mydoom, gepys, lamer, neshta, bladabindi, flystudio, eorezo
s for a small balanced subset
f for a single family
b for a balanced subset of samples
q to quit
k

Would you like to compute the Jensen-Shannon distance matrix for the chosen data? [y/n]
n

846 train samples belonging to 7 malware families
Malware family:      bladabindi      Number of samples:  199  
Malware family:        eorezo        Number of samples:  191  
Malware family:        neshta        Number of samples:  150  
Malware family:        mydoom        Number of samples:  104  
Malware family:        lamer         Number of samples:   89  
Malware family:      flystudio       Number of samples:   60  
Malware family:        gepys         Number of samples:   53  

182 dev samples belonging to 7 malware families
Malware family:      bladabindi      Number of samples:   42  
Malware family:        neshta        Number of 

In [6]:
# Rows flagged with selected == 1 are the ones used for clustering; compute
# the boolean mask once and reuse it for both the index and the labels.
selected_mask = samples_data['selected'] == 1
uuids = samples_data.index[selected_mask].tolist()
numerical_labels = samples_data.fam_num[selected_mask].tolist()

## Utilities

Let's define a few helper functions.


In [7]:
def plot_surface(to_plot, a_x, a_y, title):
    """Draw a single plotly 3D surface inline in the notebook.

    Args:
        to_plot: values used as the surface's ``z`` data.
        a_x: values used as the surface's ``x`` data.
        a_y: values used as the surface's ``y`` data.
        title: figure title; also passed to ``ply.iplot`` as ``filename``.
    """
    surface = go.Surface(x=a_x, y=a_y, z=to_plot)
    figure = go.Figure(data=[surface], layout=go.Layout(title=title))
    ply.iplot(figure, filename=title)

In [8]:
def plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors):
    """Plot the three result grids of a parameter sweep as 3D surfaces.

    Args:
        to_plot_f: grid plotted under the title 'f1 scores'.
        to_plot_s: grid plotted under the title 'silhouette scores'.
        clusters: grid plotted under the title 'clusters'.
        thresholds: threshold values, shown on the x axis.
        branching_factors: branching factor values, shown on the y axis.

    Each grid is expected to be indexed ``[threshold index][branching factor
    index]``, which is the layout ``test_params`` fills in.
    """
    titled_grids = (
        (to_plot_f, 'f1 scores'),
        (to_plot_s, 'silhouette scores'),
        (clusters, 'clusters'),
    )
    for grid, title in titled_grids:
        # A plotly surface reads z[row][col] as the value at (x[col], y[row]).
        # The grids are stored as [threshold][branching factor], so transpose
        # them to keep thresholds on x and branching factors on y; otherwise
        # the two axes are silently swapped.
        plot_surface(np.transpose(grid), thresholds, branching_factors, title)

In [9]:
def test_params(data_matrix, numerical_labels, config, thresholds, branching_factors, num_clusters=7):
    """Run Birch for every (threshold, branching factor) pair and collect scores.

    Each combination is clustered with ``clu_birch.cluster`` and scored with
    ``evaluation.evaluate_clustering``.

    Args:
        data_matrix: feature matrix passed unchanged to ``clu_birch.cluster``.
        numerical_labels: reference labels, passed to both the clustering and
            the evaluation helper.
        config: project configuration forwarded to ``clu_birch.cluster``.
        thresholds: values tried for the 'threshold' parameter (grid rows).
        branching_factors: values tried for the 'branching_factor' parameter
            (grid columns).
        num_clusters: value sent as 'num_clusters'; defaults to 7, the value
            that used to be hard-coded.

    Returns:
        A tuple ``(to_plot_f, to_plot_s, clusters)`` of arrays with shape
        ``(len(thresholds), len(branching_factors))``. They hold ``results[7]``,
        ``results[8]`` (0 when no data was passed to the evaluation) and the
        number of clusters found for each combination.
    """
    grid_shape = (len(thresholds), len(branching_factors))
    to_plot_f = np.zeros(grid_shape)
    to_plot_s = np.zeros(grid_shape)
    clusters = np.zeros(grid_shape)

    for i, t in enumerate(thresholds):
        for j, b in enumerate(branching_factors):
            clustering_labels, _, _, data, metric = clu_birch.cluster(
                data_matrix,
                numerical_labels,
                config,
                {
                    'num_clusters': num_clusters,
                    'threshold': t,
                    'branching_factor': b
                }
            )

            # Label -1 is not counted as a cluster
            # (presumably a noise/unassigned marker - TODO confirm).
            n_clusters = len(set(clustering_labels)) - (1 if -1 in clustering_labels else 0)

            # With fewer than two clusters no data is handed to the evaluation
            # (presumably so it skips the silhouette score - TODO confirm).
            if n_clusters < 2:
                data = None
            results = evaluation.evaluate_clustering(numerical_labels, clustering_labels, data, metric, True)

            # NOTE(review): indices 7 and 8 look like BCubed FScore and
            # Silhouette, judging by the order of the printed report - confirm
            # against evaluation.evaluate_clustering.
            to_plot_f[i, j] = results[7]
            to_plot_s[i, j] = results[8] if data is not None else 0
            clusters[i, j] = n_clusters

    return to_plot_f, to_plot_s, clusters

## Clustering

Here we will use Birch, a hierarchical clustering algorithm that incrementally summarises the data in a clustering-feature (CF) tree.

### 128 features
Let's start with the data matrix produced by PCA feature extraction.

In [10]:
# Load the 128-feature PCA data matrix from its text file.
data_matrix_p = np.loadtxt('data/matrix/pca_128_1209.txt')

In [14]:
# Sweep the threshold / branching-factor grid on the PCA matrix; the three
# returned grids feed the surface plots in the next cell.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_p, numerical_labels, config, thresholds, branching_factors)

--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.42280289668
Adjusted Mutual Information: 0.581275034043
Fowlkes-Mallows: 0.608423261646
Homogeneity: 0.584842444012
Completeness: 0.850642671342
BCubed Precision: 0.901268797291
BCubed Recall: 0.597368782428
BCubed FScore: 0.718505729956
Silhouette 0.537244114664
--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.42280289668
Adjusted Mutual Information: 0.581275034043
Fowlkes-Mallows: 0.608423261646
Homogeneity: 0.584842444012
Completeness: 0.850642671342
BCubed Precision: 0.901268797291
BCubed Recall: 0.597368782428
BCubed FScore: 0.718505729956
Silhouette 0.537244114664
--------------------------------------------------------------------------------
Clustering evaluation
Number of clust

--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.252809424508
Adjusted Mutual Information: 0.405078465048
Fowlkes-Mallows: 0.518839400764
Homogeneity: 0.41025713446
Completeness: 0.77470673897
BCubed Precision: 0.892510951721
BCubed Recall: 0.466376120428
BCubed FScore: 0.612627500304
Silhouette 0.449022477425
--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.252809424508
Adjusted Mutual Information: 0.405078465048
Fowlkes-Mallows: 0.518839400764
Homogeneity: 0.41025713446
Completeness: 0.77470673897
BCubed Precision: 0.892510951721
BCubed Recall: 0.466376120428
BCubed FScore: 0.612627500304
Silhouette 0.449022477425
--------------------------------------------------------------------------------
Clustering evaluation
Number of cluster

--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.194995139339
Adjusted Mutual Information: 0.364961992075
Fowlkes-Mallows: 0.492914349682
Homogeneity: 0.370491712722
Completeness: 0.778294356589
BCubed Precision: 0.903456883659
BCubed Recall: 0.417332426365
BCubed FScore: 0.570934138415
Silhouette 0.439179943933
--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.194995139339
Adjusted Mutual Information: 0.364961992075
Fowlkes-Mallows: 0.492914349682
Homogeneity: 0.370491712722
Completeness: 0.778294356589
BCubed Precision: 0.903456883659
BCubed Recall: 0.417332426365
BCubed FScore: 0.570934138415
Silhouette 0.439179943933
--------------------------------------------------------------------------------
Clustering evaluation
Number of clu

--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.0699865631979
Adjusted Mutual Information: 0.169367544734
Fowlkes-Mallows: 0.434114875829
Homogeneity: 0.176221852586
Completeness: 0.634617028716
BCubed Precision: 0.911387631976
BCubed Recall: 0.285235501825
BCubed FScore: 0.434489525099
Silhouette 0.607199979635
--------------------------------------------------------------------------------
Clustering evaluation
Number of clusters 7
Number of distinct families 7
Adjusted Rand index: 0.0699865631979
Adjusted Mutual Information: 0.169367544734
Fowlkes-Mallows: 0.434114875829
Homogeneity: 0.176221852586
Completeness: 0.634617028716
BCubed Precision: 0.911387631976
BCubed Recall: 0.285235501825
BCubed FScore: 0.434489525099
Silhouette 0.607199979635
--------------------------------------------------------------------------------
Clustering evaluation
Number of c

In [15]:
# Draw the 'f1 scores', 'silhouette scores' and 'clusters' surfaces for the PCA sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.

In [None]:
# Load the 128-feature matrix obtained with Random Forest feature selection.
data_matrix_r = np.loadtxt('data/matrix/rfc_128_1209.txt')

In [None]:
# Same sweep on the Random Forest matrix.
# NOTE: this rebinds to_plot_f / to_plot_s / clusters, replacing the PCA results.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_r, numerical_labels, config, thresholds, branching_factors)

In [None]:
# Draw the three surfaces for the Random Forest sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

### 256 features


In [16]:
# Load the 256-feature PCA matrix.
# NOTE: this rebinds data_matrix_p, replacing the 128-feature matrix.
data_matrix_p = np.loadtxt('data/matrix/pca_256_1209.txt')

In [17]:
# Sweep the same parameter grid on the 256-feature PCA matrix.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_p, numerical_labels, config, thresholds, branching_factors)

In [18]:
# Draw the three surfaces for the 256-feature PCA sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.

In [None]:
# Load the 256-feature Random Forest matrix.
# NOTE(review): no later cell in the visible part of the notebook passes this
# matrix to test_params.
data_matrix_r = np.loadtxt('data/matrix/rfc_256_1209.txt')

### 512 features

In [19]:
# Load the 512-feature PCA matrix.
# NOTE: this rebinds data_matrix_p, replacing the 256-feature matrix.
data_matrix_p = np.loadtxt('data/matrix/pca_512_1209.txt')

In [20]:
# Sweep the same parameter grid on the 512-feature PCA matrix.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_p, numerical_labels, config, thresholds, branching_factors)

In [21]:
# Draw the three surfaces for the 512-feature PCA sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

In [None]:
# Load the 512-feature Random Forest matrix.
# NOTE(review): no use of this matrix is visible after this cell in the part
# of the notebook shown here.
data_matrix_r = np.loadtxt('data/matrix/rfc_512_1209.txt')