# How to extract Benign(pefile) opcode
- Dataset Link: https://github.com/iosifache/DikeDataset
- 해당 파일은 양성코드 파일들에서 opcode를 추출하여 딕셔너리 형태의 피클 파일로 저장하는 방법을 보여줍니다.
- 해당 방식을 통해 opcode를 추출하여 각 기존 기법 머신러닝 모델에 Input으로 넣는 것을 추천드립니다.

# Python Package load

In [1]:
from capstone import *
from capstone.x86 import *
import pefile
import time
import os
import pickle
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

# BenginFile directory 내 파일명 리스트 생성

In [2]:
path_dir = 'D:/Consistency Training for Malware Static detection/data2/benign'
file_list = os.listdir(path_dir)
file_list[:10]

['002ce0d28ec990aadbbc89df457189de37d8adaadc9c084b78eb7be9a9820c81.exe',
 '003851675800dc05cdac1baa84cab8f68534b244906d97ced0c40f50da27df0a.exe',
 '00eea85752664955047caad7d6280bc7bf1ab91c61eb9a2542c26b747a12e963.exe',
 '0111bddac92a792c7b2ee3ab77642c33df0e01afe737b0d1fa0cbbf331d9572c.exe',
 '016584e586de67b725ac1e3974fcca320bf81c8c489ebb17d9909735d517e7ae.exe',
 '01a38ae91ae28ca9dcc89790292ad106a3f5bf8c16318b7ac9c1e7ab4ca628d4.exe',
 '0222aaf048e5bc28f88d03862f133bc444d358f06201e00dcc93422c81e5bcca.exe',
 '02253301617f5201605443ec0c4ab9e3bf8667caa3ea57a0be4d0641e2394ef6.exe',
 '02431aa60089b968bc59acc69796ed9418546894752d0c9766fbe3aae0a85031.exe',
 '03aed529543ac4ce2d9bb6b325f59686a06f1e9df112cf719a8e5222fffda801.exe']

# opcode 리스트 로드
- file 폴더에 있는 opcodesList.txt의 파일 경로로 변경해주시면 됩니다.

In [3]:
filePath = 'opcodesList.txt'
with open(filePath, 'rb') as lf:
    opcodes = pickle.load(lf)
    print(opcodes)

['aaa', 'aad', 'aam', 'aas', 'adc', 'add', 'and', 'call', 'cbw', 'clc', 'cld', 'cli', 'cmc', 'cmp', 'cmpsb', 'cmpsw', 'cwd', 'daa', 'das', 'dec', 'div', 'esc', 'hlt', 'idiv', 'imul', 'in', 'inc', 'int', 'into', 'iret', 'jcc', 'ja', 'jae', 'jb', 'jbe', 'jc', 'je', 'jg', 'jge', 'jl', 'jle', 'jnle', 'jno', 'jnp', 'jns', 'jnz', 'jo', 'jp', 'jpe', 'jpo', 'js', 'jz', 'jcxz', 'jmp', 'lahf', 'lds', 'lea', 'les', 'lock', 'lodsb', 'lodsw', 'loop', 'loope', 'loopne', 'loopnz', 'loopz', 'loopx', 'mov', 'movsb', 'movsw', 'mul', 'neg', 'nop', 'not', 'or', 'out', 'pop', 'popf', 'push', 'pushf', 'rcl', 'rcr', 'repxx', 'rep', 'repe', 'repne', 'repnz', 'repz', 'ret', 'retn', 'retf', 'rol', 'ror', 'sahf', 'sal', 'sar', 'sbb', 'scasb', 'scasw', 'shl', 'shr', 'stc', 'std', 'sti', 'stosb', 'stosw', 'sub', 'test', 'wait', 'xchg', 'xlat', 'xor']


# Vocab Load

In [18]:
opcodeDict = {}
for i in range(len(opcodes)):
    opcodeDict.setdefault(opcodes[i], i)

print(opcodeDict)

{'aaa': 0, 'aad': 1, 'aam': 2, 'aas': 3, 'adc': 4, 'add': 5, 'and': 6, 'call': 7, 'cbw': 8, 'clc': 9, 'cld': 10, 'cli': 11, 'cmc': 12, 'cmp': 13, 'cmpsb': 14, 'cmpsw': 15, 'cwd': 16, 'daa': 17, 'das': 18, 'dec': 19, 'div': 20, 'esc': 21, 'hlt': 22, 'idiv': 23, 'imul': 24, 'in': 25, 'inc': 26, 'int': 27, 'into': 28, 'iret': 29, 'jcc': 30, 'ja': 31, 'jae': 32, 'jb': 33, 'jbe': 34, 'jc': 35, 'je': 36, 'jg': 37, 'jge': 38, 'jl': 39, 'jle': 40, 'jnle': 41, 'jno': 42, 'jnp': 43, 'jns': 44, 'jnz': 45, 'jo': 46, 'jp': 47, 'jpe': 48, 'jpo': 49, 'js': 50, 'jz': 51, 'jcxz': 52, 'jmp': 53, 'lahf': 54, 'lds': 55, 'lea': 56, 'les': 57, 'lock': 58, 'lodsb': 59, 'lodsw': 60, 'loop': 61, 'loope': 62, 'loopne': 63, 'loopnz': 64, 'loopz': 65, 'loopx': 66, 'mov': 67, 'movsb': 68, 'movsw': 69, 'mul': 70, 'neg': 71, 'nop': 72, 'not': 73, 'or': 74, 'out': 75, 'pop': 76, 'popf': 77, 'push': 78, 'pushf': 79, 'rcl': 80, 'rcr': 81, 'repxx': 82, 'rep': 83, 'repe': 84, 'repne': 85, 'repnz': 86, 'repz': 87, 'ret': 

# 필요 함수

## Disassembling 함수

In [4]:
#the function takes two arguments, both are fetched from the exe file using
#pefile. the first one is the list of all sections. The second one is the
#address of the first instruction in the program
def get_main_code_section(sections, base_of_code):
    addresses = []
    #get addresses of all sections
    for section in sections: 
        addresses.append(section.VirtualAddress)
        
    #if the address of section corresponds to the first instruction then
    #this section should be the main code section
    if base_of_code in addresses:    
        return sections[addresses.index(base_of_code)]
    #otherwise, sort addresses and look for the interval to which the base of code
    #belongs
    else:
        addresses.append(base_of_code)
        addresses.sort()
        if addresses.index(base_of_code)!= 0:
            return sections[addresses.index(base_of_code)-1]
        else:
            #this means we failed to locate it
            return None

In [5]:
def fine_disassemble(exe):
    #get main code section
    main_code = get_main_code_section(exe.sections, exe.OPTIONAL_HEADER.BaseOfCode)
    #define architecutre of the machine 
    md = Cs(CS_ARCH_X86, CS_MODE_32)
    md.detail = True
    last_address = 0
    last_size = 0
    #Beginning of code section
    begin = main_code.PointerToRawData
    #the end of the first continuous bloc of code
    end = begin+main_code.SizeOfRawData
    while True:
        #parse code section and disassemble it
        data = exe.get_memory_mapped_image()[begin:end]
        for i in md.disasm(data, begin):
            print(i)
            last_address = int(i.address)
            last_size = i.size
        #sometimes you need to skip some bytes
        begin = max(int(last_address),begin)+last_size+1
        if begin >= end:
            print("out")
            break

## Opcode 추출 함수

In [6]:
def ExtractPefileOpcodes(exe):
    #Opcode LIst
    opcodeList = []
    #get main code section
    main_code = get_main_code_section(exe.sections, exe.OPTIONAL_HEADER.BaseOfCode)
    #define architecutre of the machine 
    md = Cs(CS_ARCH_X86, CS_MODE_32)
    md.detail = True
    last_address = 0
    last_size = 0
    #Beginning of code section
    begin = main_code.PointerToRawData
    #the end of the first continuous bloc of code
    end = begin+main_code.SizeOfRawData
    while True:
        #parse code section and disassemble it
        data = exe.get_memory_mapped_image()[begin:end]
        for i in md.disasm(data, begin):
            # print(i)
            line = str(i).rstrip().split()

            for opcode in opcodes:
                if opcode in line:
                    opcodeList.append(opcode)
                    break

            last_address = int(i.address)
            last_size = i.size
        #sometimes you need to skip some bytes
        begin = max(int(last_address),begin)+last_size+1
        if begin >= end:
            break
    
    return opcodeList

# benignFIle sample opcode 추출

In [12]:
sample = path_dir + '/' + file_list[0]
print(f'sample: {sample}')

exe = pefile.PE(sample)

sampleOpcodeList = ExtractPefileOpcodes(exe)

print(f'sample opcode 개수: {len(sampleOpcodeList)}')
print(sampleOpcodeList[:20])

sample: D:/Consistency Training for Malware Static detection/data2/benign/002ce0d28ec990aadbbc89df457189de37d8adaadc9c084b78eb7be9a9820c81.exe
sample opcode 개수: 99752
['ret', 'dec', 'mov', 'xor', 'dec', 'lea', 'inc', 'mov', 'dec', 'cmp', 'inc', 'mov', 'inc', 'mov', 'inc', 'inc', 'test', 'js', 'inc', 'mov']


In [48]:
sample = path_dir + '/' + BenignDf['Name'][961]
print(f'sample: {sample}')

exe = pefile.PE(sample)

sampleOpcodeList = ExtractPefileOpcodes(exe)

print(f'sample opcode 개수: {len(sampleOpcodeList)}')
print(sampleOpcodeList[:20])

sample: D:/Consistency Training for Malware Static detection/data2/benign/fffadeda975e01bb25fddcc63670cb1b73082db4addc58e782f0b4aa2af976e4.exe
sample opcode 개수: 0
[]


# benignFIle Directory 내 파일 opcode 추출

In [40]:
outputDict = {}
for fileName in tqdm_notebook(file_list):
    filePath = path_dir + '/' + fileName
    try:
        #parse exe file
        exe = pefile.PE(filePath)
        try:
            #call the function we created earlier
            sampleOpcodeList = ExtractPefileOpcodes(exe)
        except:
            continue
    except:
        continue
        
    outputDict.setdefault(fileName, sampleOpcodeList)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for fileName in tqdm_notebook(file_list):


  0%|          | 0/1082 [00:00<?, ?it/s]

In [41]:
with open('D:/Consistency Training for Malware Static detection/data2/BenignOpcodes.p', 'wb') as file:
    pickle.dump(outputDict, file)

In [42]:
with open('D:/Consistency Training for Malware Static detection/data2/BenignOpcodes.p', 'rb') as file:
    outputDict2 = pickle.load(file)

In [43]:
len(outputDict2)

962

In [49]:
nameList = list(outputDict2.keys())
opcodeList = list(outputDict2.values())
opcodeFrequencyList = []
LenList = []

for opList in opcodeList:
    tempList = [0 for i in range(len(opcodes))]
    tempLen = 0
    for op in opList:
        tempList[opcodeDict[op]] += 1
        tempLen += 1
    
    opcodeFrequencyList.append(tempList)
    LenList.append(tempLen)

BenignDict = {'Name' : nameList,
              'opocodes' : opcodeList,# opcodeFrequencyList,
              'Len' : LenList,
              'class' : [0 for i in range(len(opcodeList))]}

BenignDf = pd.DataFrame.from_dict(BenignDict)

In [50]:
BenignDf

Unnamed: 0,Name,opocodes,Len,class
0,002ce0d28ec990aadbbc89df457189de37d8adaadc9c08...,"[ret, dec, mov, xor, dec, lea, inc, mov, dec, ...",99752,0
1,003851675800dc05cdac1baa84cab8f68534b244906d97...,"[ret, nop, nop, dec, sub, dec, mov, xor, mov, ...",4074,0
2,00eea85752664955047caad7d6280bc7bf1ab91c61eb9a...,"[ret, nop, nop, dec, sub, dec, mov, xor, mov, ...",7549,0
3,0111bddac92a792c7b2ee3ab77642c33df0e01afe737b0...,"[push, mov, sub, mov, test, je, and, mov, or, ...",11917,0
4,016584e586de67b725ac1e3974fcca320bf81c8c489ebb...,"[mov, shl, mov, push, push, xor, mov, mov, cmp...",15636,0
...,...,...,...,...
957,ff3a27c79a9938205bb158f87c020fa24a42612c8b6b0c...,"[push, mov, push, push, mov, xor, push, test, ...",7564,0
958,ff6d6d846bb0ef538a95836a52e6187c855cbf93e2fce3...,"[ret, nop, nop, dec, sub, dec, mov, xor, mov, ...",10381,0
959,ff9ef3f71807789ab7387c00020b61f7c7ffdcbd7f55d5...,"[add, add, add, add, push, add, add, add, add,...",15521,0
960,ffb860e143e71b639ce6e78afc0a4e120714fcffbbc811...,"[push, push, mov, push, push, mov, xor, push, ...",382920,0


In [54]:
BenignDf[BenignDf['Len'] == 0]

Unnamed: 0,Name,opocodes,Len,class
961,fffadeda975e01bb25fddcc63670cb1b73082db4addc58...,[],0,0
