In [22]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import sys
import os
from pathlib import Path

In [23]:
# --- Notebook configuration ---

# Directory that contains one folder per malware type.
MALWARE_DIR = Path('../malware_data/v077_clean/')

# Directory for saved ELMo models; created here if it does not exist yet.
SAVED_MODELS_DIR = Path('../saved_models/elmo/')
os.makedirs(SAVED_MODELS_DIR, exist_ok=True)

# Folder names of the malware types to process.
MALWARE_TYPES = ['Winwebsec', 'Small', 'Zbot']

# Maximum number of files to read per folder, one entry per malware type.
# Set an entry to -1 to read all files in that folder.
MAX_SAMPLES_PER_TYPE = [500 for _ in MALWARE_TYPES]

In [24]:
def load_opcodes(file_path):
    """Load opcodes from a text file, one opcode per line.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.

    Args:
        file_path: Path (str or path-like) of the opcode file to read.

    Returns:
        list[str]: The stripped, non-empty lines in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Pin the encoding so the result does not depend on the platform's
    # locale default (which is not UTF-8 everywhere).
    with open(file_path, 'r', encoding='utf-8') as f:
        return [opcode for line in f if (opcode := line.strip())]

# Cache for the loaded ELMo module so repeated calls reuse a single instance.
# Re-running this cell resets the cache and forces one reload.
_ELMO_MODEL = None


def _load_elmo():
    """Return the TF-Hub ELMo module, loading it only on first use."""
    global _ELMO_MODEL
    if _ELMO_MODEL is None:
        _ELMO_MODEL = hub.load("https://tfhub.dev/google/elmo/3")
    return _ELMO_MODEL


def get_elmo_embeddings(opcodes):
    """Get a single embedding for the entire opcode sequence.

    The whole sequence is fed to ELMo as one tokenized sentence and the
    per-token embeddings are averaged into one vector.

    Args:
        opcodes: Non-empty list of opcode strings.

    Returns:
        numpy.ndarray: Mean of the per-token ELMo embeddings, shape (1024,).

    Raises:
        ValueError: If ``opcodes`` is empty (there is nothing to average).
    """
    # Fail early with a clear message: an empty token list cannot be turned
    # into a string tensor and has no meaningful mean embedding.
    if len(opcodes) == 0:
        raise ValueError("opcodes must contain at least one token")

    # Reuse the cached module instead of reloading the model on every call.
    elmo = _load_elmo()

    # Treat the entire sequence as a single sentence (batch of size 1).
    tokens_input = tf.constant([opcodes])
    lengths = tf.constant([len(opcodes)])

    outputs = elmo.signatures["tokens"](
        tokens=tokens_input,
        sequence_len=lengths
    )

    # Take the only batch entry, then average over the token axis.
    token_embeddings = outputs["elmo"].numpy()[0]  # Shape: (num_opcodes, 1024)
    return np.mean(token_embeddings, axis=0)  # Shape: (1024,)

In [25]:
# Embed a single sample file and save the resulting vector to disk.
sample_path = 'malware_samples/winwebsec/0a4da66b67ee14db74aa982fb86d495ecb1ad229.asm.txt'
opcodes = load_opcodes(sample_path)
if not opcodes:
    # Raise instead of calling exit(): exit() is a helper for interactive
    # shells and, in a notebook, acts on the kernel rather than just stopping
    # this cell. An exception stops the cell (and a "Run All") with a
    # readable message.
    raise ValueError(f"No valid opcodes found in file: {sample_path}")

embedding = get_elmo_embeddings(opcodes)
print(f"Generated single embedding for {len(opcodes)} opcodes")
print(f"Embedding shape: {embedding.shape}")
np.save("opcode_embedding.npy", embedding)
print("Saved to opcode_embedding.npy")

Generated single embedding for 1060 opcodes
Embedding shape: (1024,)


In [26]:
# Reference link for extracting ELMo features with TensorFlow and converting
# them to NumPy arrays.
# NOTE(review): this is a bare string literal used as a comment; because it is
# the cell's last expression, Jupyter echoes its repr as the cell output.
# Consider moving it to a markdown cell.
'''
Resources
https://stackoverflow.com/questions/67298869/extracting-elmo-features-using-tensorflow-and-convert-them-to-numpy
'''

'\nResources\nhttps://stackoverflow.com/questions/67298869/extracting-elmo-features-using-tensorflow-and-convert-them-to-numpy\n'