In [16]:
%reset
# All imports
import csv
from collections import defaultdict
from copy import copy

import pandas as pd
from pyparsing import Optional, ParseException, Word, WordEnd, alphanums, alphas, hexnums

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [137]:
# pyparsing grammar shared by the .asm and .bytes parsers in get_features.
#
# An address is exactly 8 hex digits. WordEnd() makes the match fail unless the
# token ends right there, so the leading a-f/0-9 characters of a longer,
# non-hex word are not mistaken for a hex number.
address_format = Word(hexnums, exact=8) + WordEnd() # use WordEnd to avoid parsing leading a-f of non-hex numbers as a hex
# A single byte: exactly 2 hex digits, again required to end on a word boundary.
byte_format = Word(hexnums, exact=2) + WordEnd()
# Disassembly line: the literal ".text:" prefix, an address, one or more bytes
# (collected under the results name "bytes"), then a token that starts with a
# letter and continues with letters/digits (results name "instruction").
instrn_line_format = ".text:" + address_format + (byte_format*(1,))("bytes") + Word(alphas,alphanums)("instruction")
# Hex-dump line: an address followed by one or more bytes (results name "bytes").
byte_line_format = address_format + (byte_format*(1,))("bytes")

In [142]:
# Globals
# Prefix prepended verbatim to "<sample id>.asm" / "<sample id>.bytes" by
# get_features (plain string concatenation, so the trailing slash matters).
SAMPLES_BASE_DIR = 'samples/'
# Sample ids whose features get_train_data extracts.
TRAIN_FILES = ['0A32eTdBKayjCWhZqDOQ', '0ACDbR5M3ZhBJajygTuf']
# TODO: validation split is not wired up yet.
# VALIDATE_FILES = []
# An instruction bigram is kept as a feature only if its count in a sample is
# strictly greater than this value.
INSTRN_BIGRAM_THRESHOLD = 20
# Same rule for byte bigrams.
BYTE_BIGRAM_THRESHOLD = 100

In [143]:
def _frequent_bigrams(counts, threshold):
    """Return the bigrams counted more than `threshold` times.

    Pairs whose first element is the integer 0 are dropped: 0 is the
    "nothing seen yet" sentinel used at the start of each file, not a real token.
    """
    return defaultdict(int, {pair: count for pair, count in counts.items()
                             if count > threshold and pair[0] != 0})


def get_features(filename):
    """Extract segment, instruction and byte n-gram counts for one sample.

    Reads ``SAMPLES_BASE_DIR + filename + ".asm"`` and
    ``SAMPLES_BASE_DIR + filename + ".bytes"`` (both decoded as Latin-1).

    Features collected:
      * segments          -- number of lines per prefix before the first ':'
                             in the .asm file (the whole line if it has no ':').
      * instruction 1-grams -- count of each mnemonic parsed from ``.text`` lines.
      * instruction 2-grams -- consecutive mnemonic pairs seen more than
                             INSTRN_BIGRAM_THRESHOLD times.
      * byte 1-grams      -- count of each byte token in the .bytes file.
      * byte 2-grams      -- consecutive byte pairs seen more than
                             BYTE_BIGRAM_THRESHOLD times.

    Parameters
    ----------
    filename : str
        Sample id, without directory or extension.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``filename`` with one column per feature.

    Notes
    -----
    Lines that do not match the grammar are skipped. The frame is also printed
    as a side effect, which the notebook's recorded output relies on.
    """
    instrn_unigram = defaultdict(int)
    instrn_bigram = defaultdict(int)
    byte_unigram = defaultdict(int)
    byte_bigram = defaultdict(int)
    segments = defaultdict(int)

    with open(SAMPLES_BASE_DIR + filename + ".asm", 'r', encoding='Latin-1') as file:
        prev, now = 0, 0  # 0 marks "no instruction seen yet"
        for line in file:
            # Every line contributes to the segment histogram, parsed or not.
            segments[line.split(':')[0]] += 1
            if not line.startswith('.text'):
                continue
            # Skip data definitions and alignment padding inside .text.
            if any(marker in line for marker in (' db ', ' dd ', ' dw ', 'align ')):
                continue

            try:
                result = instrn_line_format.parseString(line)
            except ParseException:
                # Not an instruction line. Only parse failures are ignored, so a
                # kernel interrupt or a genuine bug is no longer swallowed.
                continue

            # The bigram chain deliberately carries across skipped lines.
            prev = now
            now = result.instruction
            instrn_bigram[(prev, now)] += 1
            instrn_unigram[now] += 1
    instrn_bigram = _frequent_bigrams(instrn_bigram, INSTRN_BIGRAM_THRESHOLD)

    with open(SAMPLES_BASE_DIR + filename + ".bytes", 'r', encoding='Latin-1') as file:
        prev, now = 0, 0  # 0 marks "no byte seen yet"
        for line in file:
            try:
                result = byte_line_format.parseString(line)
            except ParseException:
                continue

            # prev/now persist across lines, so pairs span line boundaries.
            for byte in list(result.bytes):
                prev = now
                now = byte
                byte_bigram[(prev, now)] += 1
                byte_unigram[now] += 1
    byte_bigram = _frequent_bigrams(byte_bigram, BYTE_BIGRAM_THRESHOLD)

    # Merge everything into one mapping; on a key collision the later update wins.
    all_features = copy(segments)
    all_features.update(instrn_unigram)
    all_features.update(instrn_bigram)
    all_features.update(byte_unigram)
    all_features.update(byte_bigram)
    p = pd.DataFrame(all_features, index=[filename,])
    print(p)
    return p

In [15]:
def get_labels(path="./trainLabels.csv"):
    """Load the sample-id -> class mapping from the labels CSV.

    Parameters
    ----------
    path : str, optional
        Comma-separated file to read. The first row is treated as a header and
        skipped; in each remaining row column 0 is the key and column 1 the value.
        Defaults to ``"./trainLabels.csv"``.

    Returns
    -------
    collections.defaultdict
        Maps column 0 to column 1 exactly as read (strings). The default
        factory is ``int``, so looking up an unknown id yields 0 (and inserts
        it) instead of raising ``KeyError``.
    """
    label = defaultdict(int)
    # newline='' is what the csv module asks for so it can handle line endings itself.
    with open(path, 'r', newline='') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # skip the header row; harmless on an empty file
        for row in reader:
            if len(row) < 2:
                # Blank or truncated line: nothing to record, and indexing
                # row[1] would raise IndexError.
                continue
            label[row[0]] = row[1]
    return label

['01kcPWA9K2BOxQeS5Rju', '1']
['04EjIdbPV5e1XroFOpiN', '1']
['05EeG39MTRrI6VY21DPd', '1']
['05rJTUWYAKNegBk2wE8X', '1']
['0AnoOZDNbPXIr2MRBSCJ', '1']
['0AwWs42SUQ19mI7eDcTC', '1']
['0cH8YeO15ZywEhPrJvmj', '1']
['0DNVFKwYlcjO7bTfJ5p1', '1']
['0DqUX5rkg3IbMY6BLGCE', '1']
['0eaNKwluUmkYdIvZ923c', '1']
['0fHVZKeTE6iRb1PIQ4au', '1']
['0G4hwobLuAzvl1PWYfmd', '1']
['0gkj92oIleU4SYiCWpaM', '1']
['0H63jydvIahOVqgx5Kfo', '1']
['0Hrfce4X5YGESJPjl9uL', '1']
['0I4ZVvngsAatm8fzD3pk', '1']
['0iBaz3krsQ8HuA7cGDSt', '1']
['0iS3pwlgJco8XORD4TLq', '1']
['0jkmvR43UQ9yKxqXei61', '1']
['0KigmP9TLwJXNGz26tfO', '1']
['0l6fhCty3aSLDOgAjYQi', '1']
['0MOorvEIRmZGhqQdc3TA', '1']
['0NEsQlDGnUMg3Bew7R1A', '1']
['0NyfGXt8nmlK72Q9Irhs', '1']
['0odUVkrjp2B1n8NDS6bR', '1']
['0pTO4SVnWDehgUlYBAvq', '1']
['0qpMX5CZU8mgD2kaRNVv', '1']
['0Rgetc1wAfxlzHTGBOa7', '1']
['0rgudc7PpbexCtBjNqWF', '1']
['0S7z4qxYTHPUDO8fgIyA', '1']
['0SIqXMkDBWKE8s9pQaOv', '1']
['0uNkt6sirCnUWw175pjl', '1']
['0WdoYq78xDkFMcIwRpmJ', '1']
['0X6U3Sfc

['2UcBWPt6mf8dohDsYxOR', '2']
['2UCjXPzG97TAEsVghnlu', '2']
['2UfwYHR03ZMokQgFAzqt', '2']
['2uRxGAsyiWeOE7apNgvh', '2']
['2vfzAkDRd5PYmSwVqWx7', '2']
['2VHnal41dt9Ar0EhQSjZ', '2']
['2vQgFHCYq9wfb8GOV70B', '2']
['2wCPm7giSGuRejqFU0BN', '2']
['2WHevmqLEZAb5ug3I7DN', '2']
['2woOYK65vEk9LXrjDxmS', '2']
['2wxRyP0giXLMC4daDunr', '2']
['2xCQWIzZGDP9fTbidlqc', '2']
['2Y8z5kBNOFot4rAVn3cK', '2']
['2yI8fF9vgTcO0Dr3PdsQ', '2']
['2z8s73BXN0wqjOUdtJFD', '2']
['2Zlt5IE9yexj3QM7N6cP', '2']
['2ZNUOyAMv67lcYPoetkR', '2']
['2ZoTE3iUtwFfuNAYlyOk', '2']
['2zUdQEyv5oKARaYGxhXk', '2']
['30bZwzUydP1hQD8WJcrL', '2']
['30Sj9rlHUZveV8FN1k54', '2']
['31q6gcPKtCp8EQGj7yTL', '2']
['32kdRqV7gDCjrpW1L4xS', '2']
['32WwkMHD8dafQApiKmlv', '2']
['34JhfVHMQArU8iNpvKC2', '2']
['35rbwhMGTxqAWjuFt20m', '2']
['369xrHyDKegPqFMYWQJu', '2']
['36pPng0NYsTMyCd2jf87', '2']
['36zPKsYmJc8wOiEqStFN', '2']
['38IdJg2Se7po9rGZaNAw', '2']
['38MtDCdfSEF7BWhizVyL', '2']
['38nAlIJyjGuc5PpS1ZHw', '2']
['39VK5R6xNtwkBZqDY7Lr', '2']
['39x27Lvz

['cyTsg9fViFZ8uLjUXGSv', '2']
['cyuBTQ0SUwpF3jfPeLiI', '2']
['CzgoyP4jQWJvYMn25EXS', '2']
['CziF6LUN1YaQOHmsX0T7', '2']
['CzTXgxNKqlcbMUOoSGkE', '2']
['czxjYgBb4TeOD38AhNvi', '2']
['czy0QlI3ebmEYXWjaGkr', '2']
['d0QN7fJslMFL6qkKgbWS', '2']
['D14MK7pSBhfeR3WxHztw', '2']
['d1EnrNW57S40FeVhfzPu', '2']
['d1fXGRAg2psKqVW93zTk', '2']
['D1jnAYav7uQ6GZ9cU5zP', '2']
['d1NogDxXBb0jsyQnSUZE', '2']
['D2YdSge0M35tWfRUxFJc', '2']
['D2ZKSx9I7MarJWeVtjls', '2']
['d3fFztVhAlOUSc5obTqC', '2']
['D3WtlJNAupzn0vxXiIce', '2']
['d3zMqD8Fehx6EvRUmBuZ', '2']
['d4lM2k7aq1RwJuTpgO6X', '2']
['D4Lv6uRQo2cFgwj7NnpU', '2']
['d4MZmSTpJQuP5yh7cl8W', '2']
['D5ecMC0KmzORPVrwgLT1', '2']
['D5wceyp2lHBF8JP3qz0S', '2']
['d603ro8OlAQxKC9wGib1', '2']
['d6ACYSuPQ4wcjJynZhNE', '2']
['d7oLya1XJFWjEpRzmGVI', '2']
['d7VWOQbLxZIENHrwcXKe', '2']
['d7Zghj1V46fQUYuSc0kl', '2']
['d8lFkrfZsQom9zgRqpLi', '2']
['d8NE6mrFWU9t2nb40f3G', '2']
['D95QEl8vuaTqdIAszekS', '2']
['d98DIzHL6lsYQS2X0URe', '2']
['D9QULlKPYijhJezwSOFT', '2']
['d9YpONFc

['Jd1mD8obKMOwqBgXjRut', '2']
['jD6G4s3pinYJQTMxmgfZ', '2']
['JDC9zOMvfUxWby4HVFk3', '2']
['jDdg0PT2HAlk6qrbJKMi', '2']
['jDfyvw6lL31RQYMBHGzq', '2']
['jDlW6ZoIcO0Nh5L4sqa2', '2']
['JDpx3T254Vv1UbM9OQWc', '2']
['je3Im0RAozk8M7gnlXiO', '2']
['JEdVbXyCro7SBKHZRPOe', '2']
['jelaIp5XS4JU7MzrmnoP', '2']
['jExBgXmKlhA4URac86kQ', '2']
['JEzvM43OSnZQcV901IBR', '2']
['JF2417UDdQBcfryqLz3g', '2']
['jfcC8WhHuIJUAlLTZpNs', '2']
['JFGA1Ypxt56ElPa3WUOQ', '2']
['JfgmjCtoTKyM6pcIrBY7', '2']
['JFiWOaubTAKCyRoBqzfn', '2']
['JfUstdXDHbQZOxuBcG60', '2']
['JFuXwQeIqjPgyrWtcmUk', '2']
['JFW8DGfRUPbNo3YHz2xq', '2']
['jG4TKyJkzD97Vn1lrscI', '2']
['Jgc17BNL9qFovTjD65hY', '2']
['jgeLHoNOsITmWJ8bpnZi', '2']
['JgicF4aOhmrZBK3HYM8V', '2']
['jgNc6pwRCBPetnT2yXQH', '2']
['jgO3mkayeMX8CuLbxFpV', '2']
['jgOs7KiB0aTEzvSUJVPp', '2']
['JGqMAsU8BNzPpT1cIF2n', '2']
['JGxqvbB0l8hEgki6cOef', '2']
['jGXxl65NzmIQfF802LRC', '2']
['jh5EgYDdqxsan190RW8U', '2']
['JH9ls1UkVQ3qbX2tRFx6', '2']
['JhBYHQsTMGWqaf24Ou0p', '2']
['jHfu9AMl

['7hqLNuxI1Hn5BXrCmt6k', '3']
['7hsb8evACNndQgJPSR61', '3']
['7HXWPhqBE6cnVN8tDMTO', '3']
['7HYhXpwdBAxL6cGn3Ov0', '3']
['7iL5gJyPeOdKruGDn82s', '3']
['7IqKtSRB0mUzuDs6G4OL', '3']
['7KNOxVBrs8pf4u35Jdk2', '3']
['7kUpheD4FqzZjQXcLO63', '3']
['7kxCNEI8zBDM1KeW56u3', '3']
['7ljvb89q14fGINmaZhrP', '3']
['7lYJNX4TjapibMyvd9sz', '3']
['7lz9eQuHKTdVIfGYMrUn', '3']
['7MgbPSoCVYwTWBiF2lNx', '3']
['7mGrwqj2tdfEuNDn4gVR', '3']
['7MWNnkoDrl2aRZAIPKOy', '3']
['7NCqsB8KRtWYQ0LVaPAH', '3']
['7NFwbJ9Ky8P3Iz06hU1Z', '3']
['7nL2QiAIcsRxfMHwyTeB', '3']
['7nN0mHVK2B6v5dsMUiQh', '3']
['7NvXmzKWC3VeUZaBFnSh', '3']
['7obPSDvanqmeT4Wiuw9Z', '3']
['7oKB6OiGX8LStVwnqlQI', '3']
['7OoFgDGrsN1hqkueAmTC', '3']
['7P5gA4ZdSNWuYCi8lqpa', '3']
['7PtsSV1FrYgEfGpaqOn8', '3']
['7PZmOnTke1EJh9I5yijz', '3']
['7qBpIPJl0DvXnuWAMr6i', '3']
['7qgZAJlOevNdjHtbpfuR', '3']
['7r3gQfJxjyIdX1YloWuR', '3']
['7r5DCxQ2nJKN0GOYPmcH', '3']
['7rcIYafVLBJQ1Hwy2gDO', '3']
['7RfoFlxTSDhLmqreOzpg', '3']
['7ri8e1uFxo9B3hD4p2Jk', '3']
['7rLEfBjb

['dV3jJqMW64Z27mEHNXGU', '3']
['dV8TQtBk3AK6nFq4zihw', '3']
['dv9DhwK6ClXYcZkg1rMF', '3']
['dVmW97NCRysfpkMxSuBi', '3']
['dVo7TPLcFQvOBx5kn1bR', '3']
['DVsfP6pU89nBdhZHqxI5', '3']
['dw1szJLjHfcmE49uZKvW', '3']
['DW2p936zbvPKIECerA70', '3']
['Dw2sXhq6imtArjTZo3YC', '3']
['Dw4btEPMk3reBuNsZ15S', '3']
['dW4sDcJMQCVOEgFjxXU0', '3']
['dwBAfDK5Y7SQoO63qxMP', '3']
['dWF5Zcz6wUv2gxOrMtK8', '3']
['DWFmpGk9T0cqB1f3YCjy', '3']
['DWixC3uFhlE6J4K0rfsZ', '3']
['dwIXmJU36HYyvpC8hj2n', '3']
['DwNQdfBUbji2WRGn1vLE', '3']
['dWR0pTQsxDn2aVoM37CB', '3']
['dwSDEfQ57Aye46YmCNOG', '3']
['dXB9U7M1pgt8lPGhyNuo', '3']
['dxq8SWO4gJvZH2NBfPpD', '3']
['DxtXmvJyZMRjka8Wg07B', '3']
['DxXaHhzNWCijYObQsPry', '3']
['dxXyVcRlfr3t6skMUPCK', '3']
['DY3QpwMcCOHsSWuVe89P', '3']
['dyTqfGCNl7Hv21oZLMK5', '3']
['DYzoP85CvmEU6BHWwVT0', '3']
['DZ9asqbIBo1XLmUvRy7k', '3']
['dZB602DGWj5pkLCtzgvO', '3']
['DZcx05fYb82BoIs3XrAR', '3']
['DZGpVNe2idLjnY07IcyF', '3']
['dZQBi4t7Ws5yDImRvSUj', '3']
['DZSwtHBVTqhivJscaoWA', '3']
['dzV9JlQs

['GmPLUX1Y2DQhan563Etf', '3']
['Gnc8AEuT1KtvpZMCh5mU', '3']
['GneEPpmtLjvhTW0HIsNc', '3']
['gNKpoY74nrmyQLZJaIkh', '3']
['gNOGKEjzJi5phY0eyx8m', '3']
['gnQaT4Gdq81OLlsR06tA', '3']
['gNtLHRhP5depSUoqJ9mk', '3']
['gnuITJ1G6aZPskYvm3FX', '3']
['GoAF7XTjK2HmwJzDhdkQ', '3']
['GoYCdUnVlSbXwk8I9r1O', '3']
['GOZy7VoT8K5AWci6DIu3', '3']
['gp7MxT3EYbicRlBKty90', '3']
['gPA06qVDSM1XILOKoG8R', '3']
['GPEC53g1KvWksBcYhuef', '3']
['gPICkhiGxzB9fea5wUWM', '3']
['gpOUTtWIXsvLdewJmfY9', '3']
['gPUaRd12vzpH7lkV5DhG', '3']
['gq0Y8ZH1eU2NuIo5tOA6', '3']
['gQcxnGXHDVTCvz8hyaO9', '3']
['GqjF9NARSywHgC6bUBte', '3']
['gQTjhF1n4ASvDV56ZpWP', '3']
['Gr8pcIbXheNmzaBiHgFR', '3']
['grbAvIxKkFeLica6D2QJ', '3']
['gRNQ2Kw6dHlOXrhBa0Vb', '3']
['gROmPjLGQ34WNVr8ZtiX', '3']
['grOUwuRlHBySvhJF0nLP', '3']
['GSBVYnK0U8HMDLyfeEjF', '3']
['GseuqVvdHoYCySXkJpEL', '3']
['gsHjJZv1AGPzeofaBqVY', '3']
['gSHu3cdNKLOwke7hDipC', '3']
['GSoMecQhOTdXZy7D1Cjg', '3']
['GsPr60aJXyNVFQA5eTHK', '3']
['GSQeltaBqZPDTpHEoxW7', '3']
['gsqykQJm

['j9ViIqQ4RXTm7hSFwblW', '3']
['JahiBscoyeldwpEV0mxW', '3']
['jaHTAc6MVElKrygs1CzO', '3']
['jan6Lld9PSf5ZG4iOuFq', '3']
['JaO8p2qFXjiNARcbIvGH', '3']
['jAPy1K0O2qwrC4VDgUGt', '3']
['JaQydTjesbikFpx6m9Zv', '3']
['JARHFzZD26e3iQIaCEoV', '3']
['jaUTIBe2KchQo3AlCWwE', '3']
['jav8x0cXCqsOofl9MPyw', '3']
['JbaZC7Fdx0rzD83LEghm', '3']
['jBIefpw1x6UGi2WPAOFM', '3']
['jBqmxJCU0dZTO3kSyhMI', '3']
['jBuKaDNdCeyWLkElmpSo', '3']
['Jbv0zUjcOhsALd5waN8F', '3']
['JCA6mY29clo8LOGxaMnT', '3']
['JcaYWHPMxb4jSBLiXZNC', '3']
['JcFgaI0O58Am1EVtPxY4', '3']
['jcxNbYug1rMTmGlQz9O4', '3']
['JCzoBIAiqnR09N3Pumvb', '3']
['jCzu57r2voR9A86hFgHp', '3']
['JCzVMIhYuK13LZgo6Nct', '3']
['jdcAqB7IF9bflzOVQMLY', '3']
['jDO07nLsp3BTQSIuaYAF', '3']
['jdqclF2YAsm4LJwZUxPN', '3']
['JdqoYzZN0X7PUBCyvspt', '3']
['JDWPrC4Vh2SevtkYimaz', '3']
['JdxPDreW3hNGZljws1yL', '3']
['je0Ds6Gax5oiTCmPAcBR', '3']
['JEbogFC9kPnGSdi2KDaZ', '3']
['JecsGrMgn5xWXVkyj2Io', '3']
['jepVLK14FMHN90rgJCzs', '3']
['JeuzM8SRYWGK1navfErh', '3']
['jEYIT73v

['avEsqegSVNdCUIAX53cY', '6']
['avKCprtmoRuXSeDNi5HU', '6']
['Ax8CPSOsnjuY1cDyNbqH', '6']
['AY0G56vi1jp72ErLmkJb', '6']
['aylR8xBNVPfJ4eICGLDM', '6']
['azZYUgqQNfKAdm5nsutG', '6']
['B0vrgthbJyqlNks3UXIx', '6']
['b2CAKLd8wJBEgHry3Qhk', '6']
['b2nCMhEW137KuGiLXkf9', '6']
['b3EWAzP4jaUevDKZrfqt', '6']
['b7gT9EzYN5fs40cJX2FQ', '6']
['BASrsZoQNCtedvFkT5Xz', '6']
['BC3It0Ak7YM9UQXVZnes', '6']
['BDpyWqHITz950E3Q8Mf6', '6']
['BEcVolryIXTuDYLKbgGj', '6']
['bfcBD4vaOtUJG6K5imqx', '6']
['BFmbPXnMtl9oVyI1SYvT', '6']
['BK2Osv0kr1lnCpSDUaIf', '6']
['BkI0DfGqsHhejTp3v1RP', '6']
['BLUqHvpdaytR0E3rZMu6', '6']
['blUxaBNCoSJ3veXVIKr4', '6']
['Bm1lbzD7fUOh8soCgRjV', '6']
['BM4bG5xEwohipXWTKt3U', '6']
['BnRgI07aqODWw3FcKL6G', '6']
['boOznJjLCqDKkAT9RB0m', '6']
['BoQniFqe8vAE19rXDy6L', '6']
['bQVJ7HpnSC01EI4PNzwy', '6']
['br3oAnTeGyDp97XkzCOS', '6']
['Br5kJPaoUAEHjGcINVtS', '6']
['bRKkovBDCAzcZfNJ7LWH', '6']
['bRleFLPfA672tpnWVsdK', '6']
['bSeMuYvEqX4a9J02VWl8', '6']
['BShUIfCj1Z8recsNWYT6', '6']
['bSMwUTCc

['dKSp52B8YbLTfUc1VNOz', '8']
['dlDuzi87p9eAZtUE4w2G', '8']
['dlKCRHePG31OFg29TXbo', '8']
['dLUoZ03Xqh5PKMyflOBj', '8']
['dLuRwUfn04V1ZsDpbvKP', '8']
['dlWuKQ8saoCgw0ISYPjB', '8']
['dmHnoVODQABpur6sx7ql', '8']
['dMVkBwLu3SRhqrIy7Nop', '8']
['DMYi0TwJsZ5pt31IaxzO', '8']
['Dnk0O61Ic2hMxiAlY4bE', '8']
['DNmcvMG3LHIbr6zRed4h', '8']
['DNPAd4wCIHh59p2nbqJT', '8']
['DnSqtl6oksRxOwP8g0AM', '8']
['Do9QfaXw52dYzcFipUeq', '8']
['dOA4mHxVR8JsDoa2YlkB', '8']
['dONo5qIgbC4PZ9yW6QSl', '8']
['DpRqhyeIswlXrdF7YVTv', '8']
['DPVTHpkfLrABWaqgoyjw', '8']
['dQ5kFDwP4ByqU3tnLTZf', '8']
['dqBs38ourIHZWme1CTOp', '8']
['dr5FIgB8cMHx6WETnmOe', '8']
['DReqVt7Hxa5plh8v0QP9', '8']
['drj8265K4UhucxtI7ERX', '8']
['DRk5zvgt4bPBjoy6l1w2', '8']
['drkjQq9sGnzNb0X3CFpJ', '8']
['DS3nJylYKdXQp7wbghk8', '8']
['Ds9pnm1hBNrOxVlza3KT', '8']
['dT6n5gVkjxi3oDqUB4Zy', '8']
['Dtfl1zhqCkGNSbmcaXW2', '8']
['dTPeEC5gShQ1YVcI94N0', '8']
['DTWtmMOnSVQ6L8KxEwba', '8']
['duYKP0sJ6pINyZRmOCWF', '8']
['DvCqbn37xi5YRwaP19NJ', '8']
['DW4JTXzd

['IRvabWSMkDe8pxXVLOrh', '9']
['IsvGOaEpYKnrlqAbkRZz', '9']
['itDk014ZYvsbGMfdR2cH', '9']
['ItHfy5k2V6T9MNLs83ae', '9']
['itPKjAZNemORVxF5ul9c', '9']
['iugDTFftkUxZAWcBwCYI', '9']
['IUVTHk0fbcLtrx91MeRh', '9']
['iveb4LBTZR0ofd7SqVlP', '9']
['iVR4woCP02d89KLaXTm3', '9']
['iVXI84d0vYqOcLArnTtm', '9']
['IWwBaxyoTQbklN6CDG3h', '9']
['ixblgAhzj1nEso30pr5V', '9']
['iXZ3voSbundaAjCJl9Ue', '9']
['Iy6lQgsbZ9q0tfrT1hMe', '9']
['IyC7QefoEahdAmqDN5b9', '9']
['izAB3JHdROIwy8UmKbY1', '9']
['izS2yBV3Z4nKch5OmAuo', '9']
['j0pIG349TtMiXgkH2bnF', '9']
['J1LCS4Foa2jbAOYGWQhz', '9']
['j2Ovf5KQNuceaI0yliHs', '9']
['j514c8CPoS9fIyKmnOaH', '9']
['j5q9Z8uCLzcFGHpA7l4x', '9']
['J8BKYxOaonACvqH5GT7m', '9']
['J9EgXeHbB5VaD23i7rhN', '9']
['JaQjr806qALIHS2MDZyf', '9']
['jASo6C54kaZUxit0egTL', '9']
['JAvZf2EF7UHC6OKL81Gm', '9']
['JaxoikOjnRvIHFAzBMUD', '9']
['jBbO9vIsVMT21nl7megH', '9']
['JBGVawO7uqtxM103PbNy', '9']
['jbkwtqSZ6HavpN7Rr8el', '9']
['JBVExGsfH4oybvK92pPg', '9']
['JBXrCHbwSvZ9c5y067Ol', '9']
['jCXhwVO4

In [145]:
def get_train_data():
    """Build the training feature matrix and load the labels.

    Calls ``get_features`` once per id in ``TRAIN_FILES`` and stacks the
    resulting one-row frames. Features missing from a sample become 0.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is a DataFrame with one row
        per training sample (empty if ``TRAIN_FILES`` is empty) and ``labels``
        is whatever ``get_labels()`` returns.
    """
    # Collect first and concatenate once: growing a frame with pd.concat inside
    # the loop re-copies all accumulated rows on every iteration.
    feature_frames = [get_features(filename) for filename in TRAIN_FILES]
    if feature_frames:
        train_data_points_ = pd.concat(feature_frames, axis=0).fillna(0)
    else:
        # pd.concat raises on an empty list; keep returning an empty frame.
        train_data_points_ = pd.DataFrame()
    train_data_labels_ = get_labels()
    return (train_data_points_, train_data_labels_)

                      .text  .rdata   .data  .idata  push  lea   mov  call  \
0A32eTdBKayjCWhZqDOQ  13801   39622  842632     455   971  367  1923   411   

                      pop  retn    ...     (4A, 23)  (20, 4A)  (13, 1F)  \
0A32eTdBKayjCWhZqDOQ  471   299    ...          105       114       116   

                      (1F, 12)  (4E, 22)  (11, 90)  (90, 10)  (12, 88)  \
0A32eTdBKayjCWhZqDOQ       116       101       102       102       102   

                      (16, 13)  (33, E9)  
0A32eTdBKayjCWhZqDOQ       102       101  

[1 rows x 3025 columns]
                      HEADER  .text  .idata  .rdata  .data  .rsrc  .reloc  \
0ACDbR5M3ZhBJajygTuf      17  23917     241  250376    417      3       3   

                      push  mov  add    ...     (11, 0C)  (10, CA)  (11, C4)  \
0ACDbR5M3ZhBJajygTuf    25  818    9    ...         1434      1472      1241   

                      (48, 00)  (01, 84)  (68, 10)  (10, 4B)  (4B, 00)  \
0ACDbR5M3ZhBJajygTuf      1861      1578  

In [148]:
# NOTE(review): `X` is not defined in any cell visible here -- presumably the
# feature matrix loaded elsewhere; confirm it exists on a fresh kernel.
# Replace missing values with 0 in place, then display the frame.
X.fillna(0, inplace=True)
X

Unnamed: 0,.data,.idata,.rdata,.text,add,and,call,cdq,cmp,dec,...,"(11, 8A)","(01, CC)","(A4, 11)","(01, A2)","(A2, 10)","(01, EC)","(11, C4)","(68, 10)","(10, 4B)","(4B, 00)"
0A32eTdBKayjCWhZqDOQ,842632,455,39622,13801,194,232,411,29.0,483,75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0ACDbR5M3ZhBJajygTuf,417,241,250376,23917,9,48,9,0.0,355,2,...,1781.0,1399.0,1444.0,1543.0,1687.0,1474.0,1241.0,1584.0,114.0,128.0
