In [136]:
%reset
# All imports
from collections import defaultdict
from copy import copy

import pandas as pd
from pyparsing import (
    Optional,
    ParseException,
    Word,
    WordEnd,
    alphanums,
    alphas,
    hexnums,
)

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [137]:
# Token grammars for the listing lines.
# An address is exactly 8 hex digits and a byte exactly 2 hex digits.  The
# trailing WordEnd() rejects a longer word that merely *starts* with hex
# digits, so the leading a-f letters of a non-hex word are not taken as hex.
address_format = Word(hexnums, exact=8) + WordEnd()
byte_format = Word(hexnums, exact=2) + WordEnd()

# "<address> <byte> <byte> ..." -- one or more bytes, exposed as `.bytes`.
byte_line_format = address_format + (byte_format * (1, None)).setResultsName("bytes")

# ".text:<address> <byte>... <mnemonic>" -- the word following the bytes is
# exposed as `.instruction`.
instrn_line_format = (
    ".text:"
    + address_format
    + (byte_format * (1, None)).setResultsName("bytes")
    + Word(alphas, alphanums).setResultsName("instruction")
)

In [142]:
# Globals

# Directory prefix for the sample files.  Paths are built by plain string
# concatenation, so the trailing slash is significant.
SAMPLES_BASE_DIR = 'samples/'

# Sample identifiers (file names without the ".asm" / ".bytes" extension).
TEST_FILES = [
    '0A32eTdBKayjCWhZqDOQ',
    '0ACDbR5M3ZhBJajygTuf',
]

# A bigram is kept as a feature only when its count is strictly greater
# than the corresponding threshold.
INSTRN_BIGRAM_THRESHOLD = 20
BYTE_BIGRAM_THRESHOLD = 100

# Feature matrix (one row per sample); populated by a later cell.
X = pd.DataFrame()

In [143]:
def _count_ngrams(tokens, bigram_threshold):
    """Count unigrams and adjacent-pair bigrams over a stream of tokens.

    Args:
        tokens: iterable of hashable tokens, consumed once in order.
        bigram_threshold: a bigram is kept only if its count is strictly
            greater than this value.

    Returns:
        ``(unigrams, bigrams)``: a mapping token -> count, and a dict
        ``(previous_token, token) -> count`` restricted to frequent pairs.
    """
    unigrams = defaultdict(int)
    bigrams = defaultdict(int)
    previous = None  # the first token has no predecessor, so no pair is recorded for it
    for token in tokens:
        unigrams[token] += 1
        if previous is not None:
            bigrams[(previous, token)] += 1
        previous = token
    frequent = {pair: count for pair, count in bigrams.items() if count > bigram_threshold}
    return unigrams, frequent


def _iter_text_mnemonics(lines, segment_counts):
    """Yield the instruction mnemonic of each parsable ``.text`` line.

    Side effect: every line that contains a ':' is tallied in
    ``segment_counts`` under the text preceding its first ':'.  Lines with
    no ':' (e.g. blank lines) are not tallied, so they cannot show up as
    spurious segment names.
    """
    data_markers = (' db ', ' dd ', ' dw ', 'align ')
    for line in lines:
        segment, colon, _ = line.partition(':')
        if colon:
            segment_counts[segment] += 1

        # Only code lines are of interest: skip other segments and lines
        # containing data-definition / alignment markers.
        if not line.startswith('.text'):
            continue
        if any(marker in line for marker in data_markers):
            continue

        try:
            parsed = instrn_line_format.parseString(line)
        except ParseException:
            # Line does not look like "<address> <bytes> <mnemonic>"; ignore it.
            continue
        yield parsed.instruction


def _iter_bytes(lines):
    """Yield, in order, every byte token of each parsable byte-dump line."""
    for line in lines:
        try:
            parsed = byte_line_format.parseString(line)
        except ParseException:
            # Line does not start with "<address> <byte>"; ignore it.
            continue
        yield from parsed.bytes


def get_features(filename, verbose=True):
    """Extract count features for one sample.

    Reads ``SAMPLES_BASE_DIR + filename + ".asm"`` and
    ``SAMPLES_BASE_DIR + filename + ".bytes"`` (both decoded as Latin-1) and
    collects, in this order:

    * number of lines per segment prefix in the ``.asm`` file,
    * instruction unigram counts from parsable ``.text`` lines,
    * instruction bigram counts above ``INSTRN_BIGRAM_THRESHOLD``,
    * byte unigram counts from the ``.bytes`` file,
    * byte bigram counts above ``BYTE_BIGRAM_THRESHOLD``.

    Bigrams are formed between consecutive *accepted* tokens, so they span
    skipped lines and line boundaries.

    Args:
        filename: sample identifier without extension.
        verbose: when true (the default) the resulting frame is printed.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by ``filename`` with one
        column per feature (string keys for counts, tuple keys for bigrams).

    Raises:
        OSError: if either input file cannot be opened.
    """
    segments = defaultdict(int)

    # The generators read lazily, so they must be fully consumed while the
    # file is still open -- _count_ngrams does that inside the with-block.
    with open(SAMPLES_BASE_DIR + filename + ".asm", 'r', encoding='Latin-1') as asm_file:
        instrn_unigram, instrn_bigram = _count_ngrams(
            _iter_text_mnemonics(asm_file, segments), INSTRN_BIGRAM_THRESHOLD
        )

    with open(SAMPLES_BASE_DIR + filename + ".bytes", 'r', encoding='Latin-1') as bytes_file:
        byte_unigram, byte_bigram = _count_ngrams(
            _iter_bytes(bytes_file), BYTE_BIGRAM_THRESHOLD
        )

    # Merge order matters: on a key collision a later group overwrites an
    # earlier one.
    all_features = dict(segments)
    for group in (instrn_unigram, instrn_bigram, byte_unigram, byte_bigram):
        all_features.update(group)

    p = pd.DataFrame(all_features, index=[filename])
    if verbose:
        print(p)
    return p

In [145]:
# Collect the per-sample rows first and concatenate once.  X is rebuilt from
# scratch here, so re-running this cell does not append duplicate rows, and
# the frame is not re-copied on every iteration.
feature_rows = []
for filename in TEST_FILES:
    features = get_features(filename)
    feature_rows.append(features)

# pd.concat raises on an empty list, so fall back to an empty frame.
X = pd.concat(feature_rows, axis=0) if feature_rows else pd.DataFrame()

                      .text  .rdata   .data  .idata  push  lea   mov  call  \
0A32eTdBKayjCWhZqDOQ  13801   39622  842632     455   971  367  1923   411   

                      pop  retn    ...     (4A, 23)  (20, 4A)  (13, 1F)  \
0A32eTdBKayjCWhZqDOQ  471   299    ...          105       114       116   

                      (1F, 12)  (4E, 22)  (11, 90)  (90, 10)  (12, 88)  \
0A32eTdBKayjCWhZqDOQ       116       101       102       102       102   

                      (16, 13)  (33, E9)  
0A32eTdBKayjCWhZqDOQ       102       101  

[1 rows x 3025 columns]
                      HEADER  .text  .idata  .rdata  .data  .rsrc  .reloc  \
0ACDbR5M3ZhBJajygTuf      17  23917     241  250376    417      3       3   

                      push  mov  add    ...     (11, 0C)  (10, CA)  (11, C4)  \
0ACDbR5M3ZhBJajygTuf    25  818    9    ...         1434      1472      1241   

                      (48, 00)  (01, 84)  (68, 10)  (10, 4B)  (4B, 00)  \
0ACDbR5M3ZhBJajygTuf      1861      1578  

In [148]:
# A feature that was not observed in a sample is missing (NaN) in that row;
# replace those gaps with a count of zero.
X = X.fillna(0)
X

Unnamed: 0,.data,.idata,.rdata,.text,add,and,call,cdq,cmp,dec,...,"(11, 8A)","(01, CC)","(A4, 11)","(01, A2)","(A2, 10)","(01, EC)","(11, C4)","(68, 10)","(10, 4B)","(4B, 00)"
0A32eTdBKayjCWhZqDOQ,842632,455,39622,13801,194,232,411,29.0,483,75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0ACDbR5M3ZhBJajygTuf,417,241,250376,23917,9,48,9,0.0,355,2,...,1781.0,1399.0,1444.0,1543.0,1687.0,1474.0,1241.0,1584.0,114.0,128.0
