In [1]:
from elasticsearch import Elasticsearch, helpers
from collections import Counter
from pprint import pprint
import json
import os

Elasticsearch configuration variables

In [2]:
# Address of the Elasticsearch node; both values feed the connection settings.
host = 'localhost'
port_num = 9200

In [3]:
# Per-host connection settings handed to the Elasticsearch client.
conn_conf = dict(host=host, port=port_num, timeout=3600)

Establish a connection

In [4]:
# Client object shared by every query issued in this notebook.
# NOTE(review): conn_conf already carries 'timeout': 3600 while timeout=600 is
# passed here as well -- confirm which of the two values is meant to apply.
es = Elasticsearch([conn_conf], timeout=600)

Format query parameters

In [5]:
def format_query(term):
    """Build the Elasticsearch request body used to look up ``term``.

    The body runs a ``multi_match`` query for ``term`` against the
    ``content`` field with fuzziness 0, and asks Elasticsearch to leave the
    ``content``, ``syscalls``, ``raw`` and ``registry`` fields out of the
    returned ``_source``.

    Args:
        term: The text to search for.

    Returns:
        dict: The request body.
    """
    # Fields stripped from every returned document.
    source_filter = {"exclude": ["content", "syscalls", "raw", "registry"]}

    # The match clause itself.
    match_clause = {"query": term, "fields": ["content"], "fuzziness": 0}

    return {"_source": source_filter, "query": {"multi_match": match_clause}}

Define helper function to search for term and return a Counter object

In [6]:
def search_term(term):
    """Count, per malware family, the documents whose content matches ``term``.

    The term is stripped and lower-cased, a one-line summary with the number
    of hits is printed, and every matching document is then iterated with
    ``helpers.scan`` and tallied by its ``family`` field.

    Args:
        term: The string to look for.

    Returns:
        collections.Counter: family name -> number of matching documents.
        Empty when the query has no hits.
    """
    term = term.strip().lower()
    res_dict = Counter()

    # Build the request body once and reuse it for both requests.
    query = format_query(term)

    # Only the hit count is read from this first request, so ask for zero
    # documents instead of a default page of results.
    res = es.search(index="malwords", body=query, size=0)
    n_hits = res['hits']['total']
    # Elasticsearch 7+ reports the total as an object ({"value": N, ...})
    # rather than a bare integer; accept both shapes.
    if isinstance(n_hits, dict):
        n_hits = n_hits['value']
    print('Query: {:35} \t Number of hits: {}'.format(term, n_hits))

    if n_hits == 0:
        return res_dict

    # NOTE(review): the count above is taken over the whole index while the
    # scan below is restricted to doc_type "samples" -- confirm that is intended.
    res = helpers.scan(
        es,
        query=query,
        index="malwords",
        doc_type="samples"
    )

    # Tally the family of every matching document.
    for elem in res:
        res_dict[elem["_source"]["family"]] += 1

    return res_dict

Define helper function to read relevant strings file

In [7]:
# Path of the text file listing the relevant strings for each family.
string_file = 'relevant_strings.txt'

In [8]:
def prepare_strings(file_path):
    """Parse the relevant-strings file into a ``{family: [strings]}`` mapping.

    Each non-blank line is expected to look like ``family - str1 str2 ...``:
    the family name, a '-' separator, then whitespace-separated strings.
    Only the first '-' on a line is treated as the separator, so strings that
    themselves contain a hyphen are kept whole.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict: family name -> list of strings. Blank lines are skipped, a line
        with no '-' maps its family to an empty list, and if a family appears
        on several lines the last one wins.
    """
    to_search_dict = {}

    with open(file_path, 'r', encoding='utf-8') as in_file:
        for line in in_file:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            # Split on the first '-' only; later hyphens belong to the strings.
            family, _, strings_part = line.partition('-')
            # str.split() with no argument already discards surrounding whitespace.
            to_search_dict[family.strip()] = strings_part.split()

    return to_search_dict

In [9]:
# Load the family -> relevant strings mapping from the strings file.
to_search = prepare_strings(string_file)

Look for the strings and report the results

In [10]:
def search_all(to_search):
    """Run ``search_term`` on every relevant string of every family.

    A header line is printed for each family before its strings are queried.

    Args:
        to_search: Mapping of family name -> iterable of strings to look up.

    Returns:
        dict: family name -> {string: value returned by ``search_term``}.
    """
    outcome = {}

    for fam_name, needles in to_search.items():
        print('\nAnalyzing family: {}'.format(fam_name))

        # One lookup per relevant string, keyed by the string itself.
        outcome[fam_name] = {needle: search_term(needle) for needle in needles}

    return outcome

Perform the search

In [11]:
# Query Elasticsearch for every relevant string; progress is printed per query.
results = search_all(to_search)


Analyzing family: allaple
Query: a15xv9x7s                           	 Number of hits: 0
Query: twofortheworms                      	 Number of hits: 0
Query: net46                               	 Number of hits: 0
Query: amf45dfv                            	 Number of hits: 0
Query: rpcstringbindingcompose             	 Number of hits: 0
Query: rpcbindingfromstringbinding         	 Number of hits: 0
Query: rundll32                            	 Number of hits: 0
Query: smbnetwork.dll                      	 Number of hits: 0
Query: executeservice                      	 Number of hits: 0
Query: zorn                                	 Number of hits: 0
Query: babcdefghijklmnopqrstuvwabcdefghi   	 Number of hits: 0

Analyzing family: virut
Query: zief                                	 Number of hits: 19805
Query: ircgalaxy                           	 Number of hits: 0
Query: updatehost                          	 Number of hits: 0
Query: celebrate                           	 Number of hits: 0

Explore the results

In [12]:
def explore_results(results):
    """Print, for each family, how many hits of each query belong to that family.

    For every query with at least one counted document, two lines are
    printed: the total number of results, and how many of them (count and
    percentage, two decimals) are labelled with the family the query string
    was listed under versus any other family. Queries with no results are
    skipped; the family header is printed regardless.

    Args:
        results: Mapping of family name -> {query string: Counter of
            family name -> number of matching documents}.

    Returns:
        None. The report is written to standard output.
    """
    # Scan the results dictionary
    for family, queries in results.items():
        print('\nAnalyzing family: {}'.format(family))

        # Scan through the queries per family
        for query, count in queries.items():
            tot = sum(count.values()) if count else 0

            # Nothing was counted for this query: skip it (this also avoids
            # dividing by zero below).
            if tot == 0:
                continue

            # Share of the hits labelled with the current family.
            tot_fam = count.get(family, 0)
            percent = tot_fam * 100 / tot
            others = tot - tot_fam
            print('Found {} results for {}'.format(tot, query))
            # A plain '%' is used here: '\%' is not a valid escape sequence
            # and printed a stray backslash.
            print('Of those, {} belong to {}, {:.2f}%, and {} to other families'.format(
                tot_fam, family, percent, others))
            
    

In [13]:
# Print the per-family breakdown of the search results gathered above.
explore_results(results)


Analyzing family: allaple

Analyzing family: virut
Found 19805 results for zief
Of those, 2774 belong to virut, 14.006563998990154\%, and 17031 to other families
Found 1350 results for summer
Of those, 91 belong to virut, 6.7407407407407405\%, and 1259 to other families
Found 5860 results for garden
Of those, 734 belong to virut, 12.525597269624573\%, and 5126 to other families
Found 5752 results for happy
Of those, 308 belong to virut, 5.354659248956884\%, and 5444 to other families
Found 214 results for expectant
Of those, 7 belong to virut, 3.2710280373831777\%, and 207 to other families

Analyzing family: virlock
Found 3252 results for federal
Of those, 993 belong to virlock, 30.535055350553506\%, and 2259 to other families
Found 1420 results for bitcoin
Of those, 1062 belong to virlock, 74.78873239436619\%, and 358 to other families
Found 4607 results for infringement
Of those, 990 belong to virlock, 21.489038419795964\%, and 3617 to other families

Analyzing family: multiplug

A