In [1]:
import glob
import os
import msgpack
from tqdm.auto import tqdm

from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import tensorflow as tf
import tensorflow.keras as keras
import keras.models as models
import keras.layers as layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import numpy as np
import pandas as pd
import parmap

In [2]:
truncate_len = 128
truncate = lambda x: x[:truncate_len] # String truncation function for the tokenizer (length: 128)

In [3]:
# Load the tokenizer with the vocabulary
def load_tokenizer(filename: str) -> Tokenizer:
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Could not find file {filename}")
    tokenizer = Tokenizer()
    tokenizer.word_index = msgpack.load(open(filename, 'rb'))
    return tokenizer

In [4]:
# Padding the sequences
def get_padded_seqence(data, maxlen: int):
    return keras.preprocessing.sequence.pad_sequences(data, maxlen=maxlen, padding='post', truncating='post')

In [5]:
# Tokenize the text using the tokenizer
def get_padded_seqence_tokenizer(tokenizer: Tokenizer, texts: str, maxlen: int):
    _sequence = tokenizer.texts_to_sequences([texts])
    return get_padded_seqence(_sequence, maxlen=maxlen)

In [6]:
def load_datas(sha256sum: str, feature_dir: str, ascii_tokenizer: Tokenizer, utf16le_tokenizer: Tokenizer, opcode_tokenizer: Tokenizer):
    ascii_path = os.path.join(feature_dir, 'strings', f'{sha256sum}.ascii.msgpack')
    utf16le_path = os.path.join(feature_dir, 'strings', f'{sha256sum}.utf16le.msgpack')
    opcode_path = os.path.join(feature_dir, 'opcode', f'{sha256sum}.msgpack')
    pe_header_path = os.path.join(feature_dir, 'pe_header', f'{sha256sum}.bin')
    rich_header_path = os.path.join(feature_dir, 'rich_header', f'{sha256sum}.bin')

    # Load data from file
    ascii_data = msgpack.load(open(ascii_path, 'rb'))[0]
    utf16le_data = msgpack.load(open(utf16le_path, 'rb'))[0]
    opcode_data = msgpack.load(open(opcode_path, 'rb'))[0]
    pe_header_data = np.fromfile(pe_header_path, dtype=np.uint8)
    rich_header_data = np.fromfile(rich_header_path, dtype=np.uint8)

    # ASCII Data
    # 1. Truncate the data by 128
    # 2. Tokenize the data
    # 3. Padding the data
    ascii_data = ' '.join([truncate(x) for x in ascii_data])
    ascii_data = get_padded_seqence_tokenizer(ascii_tokenizer, ascii_data, 100)

    # UTF16LE Data
    # 1. Truncate the data by 128
    # 2. Tokenize the data
    # 3. Padding the data
    utf16le_data = truncate(utf16le_data)
    utf16le_data = get_padded_seqence_tokenizer(utf16le_tokenizer, utf16le_data, 100)

    # Opcode Data
    # 1. Tokenize the data
    # 2. Padding the data
    opcode_data = get_padded_seqence_tokenizer(opcode_tokenizer, opcode_data, 100)

    # PE Header Data
    # 1. Padding the data
    # 2. Expand the dimension
    pe_header_data = get_padded_seqence([pe_header_data], 4096)
    pe_header_data = np.expand_dims(pe_header_data, axis=2)

    # Rich Header Data
    # 1. Padding the data
    # 2. Expand the dimension
    rich_header_data = get_padded_seqence([rich_header_data], 512)
    rich_header_data = np.expand_dims(rich_header_data, axis=2)
    
    return ascii_data, utf16le_data, opcode_data, pe_header_data, rich_header_data
    

In [7]:
def predict(data, model) -> float:
    return model.predict(data, verbose=0)[0][0]

In [8]:
# Load word_index for tokenizer
ascii_tokenizer = load_tokenizer('word_index.ascii.msgpack')
utf16le_tokenizer = load_tokenizer('word_index.utf16le.msgpack')
opcode_tokenizer = load_tokenizer('word_index.opcode.msgpack')

# Load Models
ascii_model = models.load_model('models/ascii.h5')
utf16le_model = models.load_model('models/utf16le.h5')
opcode_model = models.load_model('models/opcode.h5')
pe_header_model = models.load_model('models/pe_header.h5')
rich_header_model = models.load_model('models/rich_header.h5')

In [11]:
csv = pd.read_csv('test_set/label.csv', header=None, dtype={0: str, 1: int})
csv = csv.rename(columns={0: 'sha256sum', 1: 'label'})
result = []
for _, row in tqdm(list(csv.iterrows())):
    sha256sum = row['sha256sum']
    label = row['label']
    ascii_data, utf16le_data, opcode_data, pe_header_data, rich_header_data = load_datas(sha256sum, 'test_set', ascii_tokenizer, utf16le_tokenizer, opcode_tokenizer)

    result.append(pd.DataFrame({
        'sha256sum': sha256sum,
        'ascii': predict(ascii_data, ascii_model),
        'utf16le': predict(utf16le_data, utf16le_model),
        'opcode': predict(opcode_data, opcode_model),
        'pe_header': predict(pe_header_data, pe_header_model),
        'rich_header': predict(rich_header_data, rich_header_model),
        'label': label}, index=[0]))
# result = parmap.map(to_df, list(csv.iterrows()), pm_pbar=True, pm_chunksize=1)

  0%|          | 0/50000 [00:00<?, ?it/s]

In [13]:
result.to_csv('test_dataset.csv', index=False)

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"