# Inspecting the content of possibly misclassified samples


After performing the clustering phase, we compared the results with a baseline clustering provided by AV labels. 

From this comparison it was clear that there were some malware families which were classified in the same way by both our clustering and the AVs.

At the same time, however, there are groups of samples which lie close together in our feature space while being categorized as belonging to different families by the AVs.

We would like to inspect these samples to better understand why they were classified differently from the AV baseline.

Let's start by importing some useful packages.

In [None]:
# Load IPython's autoreload extension and reload imported modules before every
# cell execution (mode 2), so edits to the project code are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
from collections import defaultdict, Counter
from sklearn.externals import joblib
from preprocessing import pp_action
from utilities import db_manager
from utilities import constants
import plotly.offline as ply
from pprint import pprint
import pandas as pd
import numpy as np
import json
import os

In [None]:
# Load the run configuration, the per-sample family labels and the word
# dictionary. Context managers close each file as soon as it has been parsed
# instead of leaving the handles open for the lifetime of the kernel.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Set up plotly's offline mode for use inside the notebook.
ply.init_notebook_mode(connected=True)

In [None]:
# Run the project's pre-processing step driven by the loaded configuration.
# The result is used below as a table indexed by sample uuid with a
# 'selected' column (presumably a pandas DataFrame - TODO confirm).
samples_data = pp_action.pre_process(config)
# Project helper that displays how the data set was split.
pp_action.split_show_data(samples_data)

In [None]:
# Keep only the samples flagged as selected. The position of a uuid in this
# list is used further down as its row index in the reduced PCA matrix.
uuids = samples_data.index[samples_data['selected'] == 1].tolist()

Next, we load the labels and the clustering results files.

In [None]:
# Label of every sample (keyed by sample uuid) and the inverted labels file
# (presumably label -> samples; verify against the file that produces it).
# Files are opened through context managers so the handles are closed.
with open('data/labels.json', 'r') as labels_file:
    labels = json.load(labels_file)

with open('data/inverted_labels.json', 'r') as inv_labels_file:
    inv_labels = json.load(inv_labels_file)

# Cluster id assigned to every sample uuid by the clustering run.
with open('data/d_clusterings/clustering_hdbscan_cosine_1209.json', 'r') as clustering_file:
    clustering = json.load(clustering_file)

# Lookup used below to translate a sample uuid into its md5.
uuid_md5 = db_manager.acquire_malware_file_dict_full(config['dir_db'])

In [None]:
# For every cluster id, count how many of its samples carry each label.
cluster_ids = sorted(set(clustering.values()))
clust_compositions = {cluster_id: Counter() for cluster_id in cluster_ids}

for sample, cluster_id in clustering.items():
    clust_compositions[cluster_id].update([labels[sample]])

# Report the label breakdown of each cluster, most frequent label first.
for clu in cluster_ids:
    print('Cluster {}:'.format(clu))
    print(clust_compositions[clu].most_common())
    print()

In [None]:
# Invert the sample -> cluster mapping into cluster -> list of samples.
inverted_clustering = defaultdict(list)
for sample, cluster_id in clustering.items():
    inverted_clustering[cluster_id].append(sample)

Let's isolate the noise cluster, i.e. the samples which the algorithm was unable to fit in a cluster.

In [None]:
# Samples stored under cluster id -1, which this notebook treats as the noise
# cluster. inverted_clustering is a defaultdict, so a missing id yields [].
noise = inverted_clustering[-1]

This cluster seems to be composed primarily of samples from the Eorezo and Bladabindi families.

In [None]:
# Split the noise samples by family, keeping only Eorezo and Bladabindi, and
# sort each group so that the "first five" below are reproducible.
noise_e = sorted(sample for sample in noise if uuids_family[sample] == 'eorezo')
noise_b = sorted(sample for sample in noise if uuids_family[sample] == 'bladabindi')

# Show the uuid -> md5 pairs of the first five samples of each family.
pprint({sample: uuid_md5[sample] for sample in noise_e[:5]})
pprint({sample: uuid_md5[sample] for sample in noise_b[:5]})

Similarly for cluster number 4

In [None]:
# Samples that the clustering assigned to cluster 4.
clus4 = inverted_clustering[4]

This time it seems this cluster should have been populated primarily by the Flystudio or the Gepys family. However, a large number of samples from both Eorezo and Bladabindi are included in this cluster.

In [None]:
# Bucket the samples of cluster 4 by family; samples belonging to any other
# family are ignored.
clus4_by_family = {'eorezo': [], 'bladabindi': [], 'gepys': [], 'flystudio': []}
for sample in clus4:
    family = uuids_family[sample]
    if family in clus4_by_family:
        clus4_by_family[family].append(sample)

# Sorted so that the "first five" of every group are reproducible.
clus4_e = sorted(clus4_by_family['eorezo'])
clus4_b = sorted(clus4_by_family['bladabindi'])
clus4_g = sorted(clus4_by_family['gepys'])
clus4_f = sorted(clus4_by_family['flystudio'])

# Show the uuid -> md5 pairs of the first five samples of each family.
for family_samples in (clus4_e, clus4_b, clus4_g, clus4_f):
    pprint({sample: uuid_md5[sample] for sample in family_samples[:5]})

Having isolated 5 samples for each 'misclassified' group, we can try to inspect each of them individually. Let's start by printing the top ten words for each sample.

In [None]:
def top_words(config, sample):
    """Pretty-print the ten highest-weighted words of one sample.

    Args:
        config: mapping whose 'dir_store' entry is the directory that holds
            one JSON file per sample.
        sample: name of the sample's file inside that directory.

    The file must contain a JSON object mapping each word to a numeric
    weight (presumably its tf-idf score, going by the variable names).
    Nothing is returned; the (word, weight) pairs are printed in
    descending weight order.
    """
    tf_idf_file = os.path.join(config['dir_store'], sample)
    # Read through a context manager so the handle is closed right away;
    # this function is called once per sample and used to leak every file.
    with open(tf_idf_file, 'r') as in_file:
        tf_idf = Counter(json.load(in_file))
    pprint(tf_idf.most_common(10))

In [None]:
def top_words_grp(config, grp):
    """Print the top words of every sample in a group.

    Args:
        config: configuration mapping, forwarded unchanged to top_words.
        grp: iterable of sample names.

    Each sample name is printed on its own line, followed by the output of
    top_words for that sample and an empty separator line.
    """
    for sample_name in grp:
        print(sample_name)
        top_words(config, sample_name)
        print()

In [None]:
# Print the top words of the first five samples of each family found in
# cluster 4, one ruled-off section per family.
family_sections = (
    ('eorezo', clus4_e),
    ('bladabindi', clus4_b),
    ('gepys', clus4_g),
    ('flystudio', clus4_f),
)

for family_name, family_samples in family_sections:
    print('-' * 80)
    print(family_name)
    top_words_grp(config, family_samples[:5])


Let's see if we can understand which words are retained by the PCA.

In [None]:
# Dimensionality-reduction model saved by an earlier step (judging by the file
# name, a 128-component PCA). joblib.load unpickles the file, so only load
# model files that come from a trusted source.
model_path = os.path.join(constants.dir_d, constants.dir_mod, 'pca_128_1209.pkl')
dr_model = joblib.load(model_path)

# Matrix of reduced vectors; its rows are addressed further down by the
# position of a sample's uuid in `uuids`.
reduced = np.loadtxt('data/d_matrices/pca_128_1209.txt')

In [None]:
# Re-key the vocabulary as position -> word, with the words in sorted order.
# NOTE: this rebinds `words`, so the cell is not idempotent - re-running it
# without reloading the original dictionary would re-index the integer keys.
words = dict(enumerate(sorted(words.keys())))

In [None]:
# For the first five Eorezo samples of cluster 4, map the reduced vector back
# onto the original word features and list the ten words with the largest
# absolute weight.

# The word index is the same for every sample: sort it once instead of
# re-sorting the whole vocabulary on every iteration.
word_index = sorted(words.values())

for uuid in clus4_e[:5]:
    print(uuid)

    # Row of this sample in the reduced matrix (rows follow the order of uuids).
    red_vec = reduced[uuids.index(uuid)]
    # NOTE(review): only components_ is applied; if dr_model is a scikit-learn
    # PCA its mean_ is not added back, so these are weights relative to the
    # mean rather than a full inverse transform - confirm this is intended.
    orig_feat_vec = np.dot(red_vec, dr_model.components_)
    cent_series = pd.Series(np.abs(orig_feat_vec), index=word_index)

    print(cent_series.nlargest(10))
    print()

### Looking at VirusTotal data

Now that we have isolated some problematic samples, let's look at the related VirusTotal reports.

In [None]:
# For every Eorezo sample of cluster 4, print the labels that four AV engines
# (Microsoft, Kaspersky, F-Secure, ClamAV) gave it in its VirusTotal report.
print('Eorezo samples in cluster 4: ', len(clus4_e))
for uuid in clus4_e:
    md5 = uuid_md5[uuid]
    # Reports are stored one file per md5; close each one after parsing
    # instead of leaking a handle per sample.
    with open(os.path.join(config['dir_vt'], md5), 'r') as vt_file:
        vt = json.load(vt_file)

    # A report may lack one of the engines; fall back to None (printed as
    # 'None', like an engine with no detection) so that a single incomplete
    # report does not abort the whole listing with a KeyError.
    scans = vt.get('scans', {})
    ms_lab = scans.get('Microsoft', {}).get('result')
    ks_lab = scans.get('Kaspersky', {}).get('result')
    fs_lab = scans.get('F-Secure', {}).get('result')
    ca_lab = scans.get('ClamAV', {}).get('result')

    print('{:<20} {:<38} {:<30} {:<20}'.format(str(ms_lab), str(ks_lab), str(fs_lab), str(ca_lab)))
    