In [1]:
%pip install pefile python-magic

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pefile
import hashlib
import string
import magic
import re
from statistics import mean

In [4]:
def extract_features_from_file(filepath):
    import os
    import pefile
    import hashlib
    import magic
    import string
    import re
    from statistics import mean

    def extract_printable_strings(file_bytes, min_len=5):
        result = []
        current = ''
        for byte in file_bytes:
            try:
                char = chr(byte)
                if char in string.printable and char not in '\n\r\t':
                    current += char
                else:
                    if len(current) >= min_len:
                        result.append(current)
                    current = ''
            except:
                continue
        if len(current) >= min_len:
            result.append(current)
        return result

    features = {}

    # Load file bytes first
    try:
        with open(filepath, 'rb') as f:
            file_bytes = f.read()
        features["sha256"] = hashlib.sha256(file_bytes).hexdigest()
        features["file_size"] = len(file_bytes)
    except:
        # Total failure
        return {
            "vsize": None, "imports": 0, "exports": 0,
            "has_debug": 0, "has_tls": 0, "has_resources": 0,
            "has_relocations": 0, "has_signature": 0, "symbols": 0,
            "sha256": "", "file_size": 0, "numstrings": 0, "avlength": 0,
            "printables": 0, "paths": 0, "urls": 0, "registry": 0, "MZ": 0,
            "file_type_trid": "Unknown", "file_type_prob_trid": None
        }

    # Try PE parsing
    try:
        pe = pefile.PE(data=file_bytes, fast_load=True)

        features["vsize"] = pe.OPTIONAL_HEADER.SizeOfImage
        try:
            features["imports"] = sum(len(entry.imports) for entry in pe.DIRECTORY_ENTRY_IMPORT)
        except:
            features["imports"] = 0

        try:
            features["exports"] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
        except:
            features["exports"] = 0

        features["has_debug"] = int(hasattr(pe, 'DIRECTORY_ENTRY_DEBUG'))
        features["has_tls"] = int(hasattr(pe, 'DIRECTORY_ENTRY_TLS'))
        features["has_resources"] = int(hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'))
        features["has_relocations"] = int(hasattr(pe, 'DIRECTORY_ENTRY_BASERELOC'))
        features["has_signature"] = int(pe.OPTIONAL_HEADER.DATA_DIRECTORY[4].Size > 0)
        features["symbols"] = 0  # Could be improved
    except:
        # Not a valid PE file
        features.update({
            "vsize": None,
            "imports": 0,
            "exports": 0,
            "has_debug": 0,
            "has_tls": 0,
            "has_resources": 0,
            "has_relocations": 0,
            "has_signature": 0,
            "symbols": 0
        })

    # String-based features
    strings_found = extract_printable_strings(file_bytes)
    features["numstrings"] = len(strings_found)
    features["avlength"] = mean([len(s) for s in strings_found]) if strings_found else 0
    features["printables"] = sum(1 for b in file_bytes if chr(b) in string.printable)
    features["paths"] = sum(1 for s in strings_found if s.lower().startswith("c:\\"))
    features["urls"] = sum(1 for s in strings_found if "http://" in s.lower() or "https://" in s.lower())
    features["registry"] = sum(1 for s in strings_found if "HKEY_" in s)
    features["MZ"] = file_bytes.count(b"MZ")

    # File type (via magic)
    try:
        mime = magic.Magic(mime=False)
        features["file_type_trid"] = mime.from_file(filepath)
        features["file_type_prob_trid"] = None
    except:
        features["file_type_trid"] = "Unknown"
        features["file_type_prob_trid"] = None

    return features


In [6]:
file_path = "demo-files/notepad_copy.exe"
features = extract_features_from_file(file_path)
from pprint import pprint
pprint(features)


{'MZ': 7,
 'avlength': 11.117036011080332,
 'exports': 0,
 'file_size': 360448,
 'file_type_prob_trid': None,
 'file_type_trid': 'PE32+ executable (GUI) x86-64, for MS Windows',
 'has_debug': 0,
 'has_relocations': 0,
 'has_resources': 0,
 'has_signature': 0,
 'has_tls': 0,
 'imports': 0,
 'numstrings': 1444,
 'paths': 0,
 'printables': 106293,
 'registry': 0,
 'sha256': 'b862fd21ab3c38f7aabb3f41b8b6845d14692cd4273edc9dfec7b555e2c6b505',
 'symbols': 0,
 'urls': 1,
 'vsize': 368640}
