In [10]:
import pefile, fnmatch, os, time, pickle
import numpy as np
import pandas as pd
from multiprocessing import Pool
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
import xgboost as xgb
from xgboost import cv, DMatrix

In [52]:
def parse_headers(pe):
    headers = {
        'OH.SizeOfCode'                 :pe.OPTIONAL_HEADER.SizeOfCode,
        'OH.SizeOfInitializedData'      :pe.OPTIONAL_HEADER.SizeOfInitializedData,
        'OH.AddressOfEntryPoint'        :pe.OPTIONAL_HEADER.AddressOfEntryPoint,
        'OH.BaseOfCode'                 :pe.OPTIONAL_HEADER.BaseOfCode,
        'OH.BaseOfData'                 :pe.OPTIONAL_HEADER.BaseOfData,
        'OH.ImageBase'                  :pe.OPTIONAL_HEADER.ImageBase,
        'OH.MajorOperatingSystemVersion':pe.OPTIONAL_HEADER.MajorOperatingSystemVersion,
        'OH.MajorSubsystemVersion'      :pe.OPTIONAL_HEADER.MajorSubsystemVersion,
        'OH.SizeOfImage'                :pe.OPTIONAL_HEADER.SizeOfImage,
        'OH.SizeOfHeaders'              :pe.OPTIONAL_HEADER.SizeOfHeaders,
        'OH.CheckSum'                   :pe.OPTIONAL_HEADER.CheckSum,
        'OH.Subsystem'                  :pe.OPTIONAL_HEADER.Subsystem,
        'OH.DllCharacteristics'         :pe.OPTIONAL_HEADER.DllCharacteristics,
        'OH.SizeOfStackReserve'         :pe.OPTIONAL_HEADER.SizeOfStackReserve,
        'OH.SizeOfStackCommit'          :pe.OPTIONAL_HEADER.SizeOfStackCommit,
        'OH.SizeOfHeapReserve'          :pe.OPTIONAL_HEADER.SizeOfHeapReserve,
        'OH.SizeOfHeapCommit'           :pe.OPTIONAL_HEADER.SizeOfHeapCommit,

        'FH.NumberOfSections'           :pe.FILE_HEADER.NumberOfSections,
        'FH.TimeDateStamp'              :pe.FILE_HEADER.TimeDateStamp,
        'FH.Characteristics'            :pe.FILE_HEADER.Characteristics,
    }
    return headers

def parse_sections(pe):
    sections = []
    for entry in pe.sections:
        sect = {
            'SectionName'   :str(entry.Name),
            'SectionSize'   :hex(entry.SizeOfRawData),
            'SectionEntropy':entry.get_entropy()
            }
        sections.append(sect)
    return sections

def parse_import(pe):
    import_table = list()
    import_num = 0
    dll_num = 0
    if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
        dll_num = len(pe.DIRECTORY_ENTRY_IMPORT)
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            import_entry = dict()
            import_entry['dll'] = entry.dll
            import_entry['symbols'] = list()
            import_num = import_num + len(entry.imports)
            for imp in entry.imports:
                import_entry['symbols'].append(imp.name)
            import_table.append(import_entry)
    return import_table, import_num, dll_num

def parse_export(pe):
    if hasattr(pe, 'DIRECTORY_ENTRY_EXPORT'):
        return len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    return 0

def size(name): return os.path.getsize(name)

def parse_pe(name):
    try:
        pe = pefile.PE(name)
        info = parse_headers(pe)
        info['sections'] = parse_sections(pe)
        info['export_num'] = parse_export(pe)
        info['import'], info['import_num'], info['dll_num'] = parse_import(pe)
        pe.close()
        info['sha256'] = name.split('\\')[-1]
        info['size'] = size(name)
        return info
    except Exception:
        return None
    return None

def find_files(directory, pattern):
    for root, dirs, files in os.walk(directory):
        for basename in files:
            if fnmatch.fnmatch(basename, pattern):
                filename = os.path.join(root, basename)
                yield filename

def enum_pe(type, path):
    pool = Pool()
    results = [pool.apply_async(parse_pe, (filename,)) for filename in find_files(path, '*')]
    files = list()
    for result in results:
        result.wait()
        try:
            info = result.get()
        except:
            continue
        if info != None:
            info['type'] = type
            files.append(info)
    pool.terminate()
    return files

In [65]:
files = enum_pe('malware', 'malware') + enum_pe('normal', 'normal')

df = pd.DataFrame(files)

files.clear()

print(df.groupby('type')['sha256'].count())
print()

df.to_csv('features.csv', sep=';', index=False)
df = df.drop(['sha256', 'size'], axis=1)

sections = df['sections'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)

imports = df['import'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)
imports = imports.reset_index().set_index(['index', 'dll'])
imports = imports['symbols'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('import').reset_index().set_index('index')

join = sections.join(imports).fillna(0)

join['SectionName'] = join['SectionName'].astype('str')
join['dll'] = join['dll'].astype('str')
join['import'] = join['import'].astype('str')

type
malware    198
normal     422
Name: sha256, dtype: int64



In [66]:
string_columns = ['SectionName', 'dll', 'import']
ohe = OneHotEncoder(handle_unknown='ignore')
matrix = ohe.fit_transform(join[string_columns])
index = join.index
rows = []
for i in index.unique():
    select = index.slice_indexer(start=i, end=i)
    rows.append(csr_matrix(matrix[select].sum(axis=0)))

join_encoded = pd.DataFrame(data={'matrix':rows})

In [67]:
df = df.drop(['sections', 'import'], axis=1)
df = df.join(join_encoded)

y = df['type'] == 'malware'

X = df.drop('type', axis=1)
X = X.apply(lambda x: hstack((x.drop('matrix').astype('int64').values, x['matrix'])).T, axis=1)
X = hstack(X.values).T
X = X.todok().toarray()

In [68]:
dtrain = DMatrix(X, y)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [69]:
params = {
    'max_depth': 6
}

In [70]:
cv(params=params, dtrain=dtrain, metrics="auc")

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.994777,0.003703,0.982244,0.01375
1,1.0,0.0,0.992187,0.007615
2,1.0,0.0,0.992187,0.007615
3,1.0,0.0,0.993258,0.008153
4,1.0,0.0,0.993311,0.008189
5,1.0,0.0,0.993273,0.008241
6,1.0,0.0,0.993348,0.008136
7,1.0,0.0,0.993365,0.008147
8,1.0,0.0,0.993366,0.008111
9,1.0,0.0,0.993366,0.008111


In [71]:
booster = xgb.train(params=params, dtrain=dtrain)

In [72]:
with open("model.pickle", "wb") as f:
    pickle.dump(booster, f)

In [73]:
with open("ohe.pickle", "wb") as f:
    pickle.dump(ohe, f)

In [72]:
files = [parse_pe('malware/0cf176827f89ff5167319c15aba7c74d176b0f93dd42f383567faa6c5af2ef21')]
df = pd.DataFrame(files)
df = df.drop(['sha256', 'size'], axis=1)
sections = df['sections'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)

imports = df['import'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)
imports = imports.reset_index().set_index(['index', 'dll'])
imports = imports['symbols'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('import').reset_index().set_index('index')

join = sections.join(imports).fillna(0)

join['SectionName'] = join['SectionName'].astype('str')
join['dll'] = join['dll'].astype('str')
join['import'] = join['import'].astype('str')

string_columns = ['SectionName', 'dll', 'import']
encoders = {}
for col in string_columns:
    le = LabelEncoder()
    le.fit(join[col])
    encoders[col] = le
    join[col] = le.transform(join[col])

matrix = ohe.transform(join)

index = join.index
rows = []
for i in index.unique():
    select = index.slice_indexer(start=i, end=i)
    rows.append(csr_matrix(matrix[select].sum(axis=0)))

join_encoded = pd.DataFrame(data={'matrix':rows})

df = df.drop(['sections', 'import'], axis=1)
df = df.join(join_encoded)

X = df.apply(lambda x: hstack((x.drop('matrix').astype('int64').values, x['matrix'])).T, axis=1)
X = hstack(X.values).T
X = X.todok().toarray()
booster.predict(DMatrix(X))[0] > 0.5