# Laboratorio 4
Universidad del Valle de Guatemala <br>
Security Data Science <br>

Diego Andrés Morales Aquino - 21762 <br>
Pablo Andrés Zamora Vásquez - 21780

In [1]:
import os
import pefile
from capstone import *
import shutil
import subprocess
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer

## Creación de dataset

In [2]:
# Folder with the malware samples, and the sub-folder that receives unpacked copies
malware_directory = "MALWR"
unpacked_directory = malware_directory + "/UNPACKED"

In [3]:
# Reset the folder that holds the unpacked executables so every run starts clean
if os.path.exists(unpacked_directory):
    # Remove the folder together with everything inside it
    shutil.rmtree(unpacked_directory)

# Recreate the (now empty) folder
os.makedirs(unpacked_directory)

In [4]:
# List only regular files (skipping subdirectories and the macOS .DS_Store file).
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the row order of the resulting dataset reproducible across machines and runs.
malware_ex = sorted(
    file for file in os.listdir(malware_directory)
    if os.path.isfile(os.path.join(malware_directory, file)) and file != ".DS_Store"
)

print("Contenido de la carpeta:", malware_ex)

Contenido de la carpeta: ['BVJ2D9FBF759F527AF373E34673DC3ACA462', 'DS22_A670D13D4D014169C4080328B8FEB86', 'EEE99EC8AA67B05407C01094184C33D2B5A44', 'F6655E39465C2FF5B016980D918EA028', 'F8437E44748D2C3FCF84019766F4E6DC', 'FGJKJJ1_2BA0D0083976A5C1E3315413CDCFFCD2', 'FGTR43_EF8E0FB20E7228C7492CCDC59D87C690', 'FHHH6576C196385407B0F7F4B1B537D88983', 'FTTR9EA3C16194CE354C244C1B74C46CD92E', 'GBV66_8F259BB36E00D124963CFA9B86F502E', 'GFT4_7DDD3D72EAD03C7518F5D47650C8572', 'HJGQDD892986B2249B5214639ECC8AC0223', 'JH78C0A33A1B472A8C16123FD696A5CE5EBB', 'JKK8CA6FE7A1315AF5AFEAC2961460A80569', 'K99_C3A9A7B026BFE0E55FF219FD6AA7D94', 'KLp90_6D5C8FC4B14559F73B6136D85B94198', 'L11_1415EB8519D13328091CC5C76A624E3D', 'NBV_8B75BCBFF174C25A0161F30758509A44', 'NV99_C9C9DBF388A8D81D8CFB4D3FC05F8E4', 'PL98_BD8B082B7711BC980252F988BB0CA936', 'POL55_A4F1ECC4D25B33395196B5D51A06790', 'QW2_4C6BDDCCA2695D6202DF38708E14FC7E', 'RTC_7F85D7F628CE62D1D8F7B39D8940472', 'SAM_B659D71AE168E774FAAF38DB30F4A84', 'TG78Z__727A68

In [5]:
def is_packed(pe):
    """Heuristically decide whether a PE file is packed, based on its section names.

    A file is considered packed when fewer than two of its distinct section
    names belong to the set of conventional names (.text, .data, ...).
    This also covers files with a single section or with no sections at all.

    Args:
        pe: Object exposing a ``sections`` iterable whose items have a bytes
            ``Name`` attribute (e.g. a ``pefile.PE`` instance).

    Returns:
        bool: True if the file looks packed, False otherwise.
    """
    common_sections = {".text", ".data", ".rdata", ".bss", ".rsrc", ".edata", ".idata", ".tls", ".reloc"}

    # Section names are raw, NUL-padded bytes and may contain bytes that are
    # not valid UTF-8; errors="replace" prevents a UnicodeDecodeError there.
    section_names = {
        section.Name.decode("utf-8", errors="replace").strip("\x00")
        for section in pe.sections
    }

    # Number of distinct section names that are conventional ones
    common_count = len(common_sections & section_names)

    # A file with exactly one distinct section name can have at most one
    # conventional name, so the single-section case is already covered here.
    return common_count < 2

In [6]:
def unpack_upx(file_path):
    """Unpack a UPX-packed executable into ``unpacked_directory``.

    Runs ``upx -d <file_path> -o <unpacked_path>`` where the output keeps the
    base name of the input file.

    Args:
        file_path: Path of the packed executable.

    Returns:
        str: Path of the unpacked copy.

    Raises:
        Exception: If the ``upx`` process exits with a non-zero return code.
    """
    unpacked_path = os.path.join(unpacked_directory, os.path.basename(file_path))

    # Make the call safe to repeat: ensure the output folder exists and remove
    # a stale output from a previous run (UPX errors out when the -o target
    # already exists).
    os.makedirs(unpacked_directory, exist_ok=True)
    if os.path.exists(unpacked_path):
        os.remove(unpacked_path)

    result = subprocess.run(["upx", "-d", file_path, "-o", unpacked_path], capture_output=True, text=True)

    # Check whether the process succeeded
    if result.returncode != 0:
        raise Exception(f"Error al desempaquetar {file_path}: {result.stderr}")

    return unpacked_path

In [7]:
def static_analysis(file_path, num_bytes=200):
    """Extract static features from a PE executable.

    If ``is_packed`` flags the file, it is unpacked with ``unpack_upx`` and the
    compilation timestamp and import table are read from the unpacked copy.
    The entry point and the disassembly are always taken from the original
    file.

    Args:
        file_path: Path of the executable to analyse.
        num_bytes: Number of bytes, starting at the entry point, that are
            disassembled. Defaults to 200.

    Returns:
        dict: One record with the file name, the packed flag (0/1), the
        compilation date/time components (UTC), the imported DLLs and
        functions, the entry point address (entry point RVA + image base) and
        the disassembled instructions. List-valued fields are joined into
        ", "-separated strings; ``assembly_code`` is '' when the entry point
        lies outside the mapped image.

    Raises:
        Exception: Propagated from ``unpack_upx`` when unpacking fails.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    original_file_path = file_path
    original_pe = pefile.PE(original_file_path)
    # PE object used for the header/import metadata; replaced by the unpacked
    # copy below when the file is packed.
    metadata_pe = original_pe

    try:
        packed = is_packed(original_pe)

        if packed:
            # Unpack the file (UPX) and read the metadata from the result
            metadata_pe = pefile.PE(unpack_upx(original_file_path))

        # Compilation timestamp, converted straight from the integer header
        # field (seconds since the epoch, interpreted as UTC) instead of
        # re-parsing pefile's human-readable dump string.
        time_date_stamp = datetime.fromtimestamp(
            metadata_pe.FILE_HEADER.TimeDateStamp, tz=timezone.utc
        )

        # Imported DLLs and functions. pefile only sets DIRECTORY_ENTRY_IMPORT
        # when an import table was parsed, hence the getattr default.
        dlls = []
        functions = []
        for entry in getattr(metadata_pe, "DIRECTORY_ENTRY_IMPORT", []):
            dlls.append(entry.dll.decode(errors="replace"))
            for function in entry.imports:
                # Imports by ordinal have no name
                functions.append(function.name.decode(errors="replace") if function.name else "")

        # Assembly code (reverse engineering) is taken from the original file
        assembly_code = None
        entrypoint = original_pe.OPTIONAL_HEADER.AddressOfEntryPoint
        entrypoint_address = entrypoint + original_pe.OPTIONAL_HEADER.ImageBase

        # Binary image as mapped in memory
        binary_image = original_pe.get_memory_mapped_image()

        # Make sure the entry point falls inside the mapped image
        if entrypoint < 0 or entrypoint >= len(binary_image):
            print(f"Warning en exe {original_file_path}: La dirección de entrada (0x{entrypoint:x}) está fuera de los límites del archivo.")
        else:
            binary_code = binary_image[entrypoint:entrypoint + num_bytes]

            # NOTE(review): the code is always decoded as 32-bit x86.
            disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
            assembly_code = [
                "%s\t%s" % (instruction.mnemonic, instruction.op_str)
                for instruction in disassembler.disasm(binary_code, entrypoint_address)
            ]
    finally:
        # Release the file handles held by pefile
        if metadata_pe is not original_pe:
            metadata_pe.close()
        original_pe.close()

    # Build the dataset record.
    # The DLL, function and assembly lists are converted to comma-separated strings.
    # The "compiltation_day_of_week" key (sic) is kept as-is because it is the
    # column name of the resulting dataset.
    return {
        "file": os.path.basename(original_file_path),
        'packed': int(packed),
        "compilation_day": time_date_stamp.day,
        "compilation_month": time_date_stamp.month,
        "compilation_year": time_date_stamp.year,
        "compilation_hour": time_date_stamp.hour,
        "compilation_minute": time_date_stamp.minute,
        "compiltation_day_of_week": time_date_stamp.weekday(),
        "dlls": ', '.join(dlls),
        "functions": ', '.join(functions),
        "entrypoint_address": int(entrypoint_address),
        "assembly_code": ', '.join(assembly_code) if assembly_code else ''
    }
    

In [8]:
# Run the static analysis on every sample and collect one record per file
data = [
    static_analysis(os.path.join(malware_directory, sample_name))
    for sample_name in malware_ex
]

# One row per executable
df = pd.DataFrame(data)
   



### Exploración y preprocesamiento de datos

In [9]:
# Preview the first rows of the extracted dataset
df.head(5)

Unnamed: 0,file,packed,compilation_day,compilation_month,compilation_year,compilation_hour,compilation_minute,compiltation_day_of_week,dlls,functions,entrypoint_address,assembly_code
0,BVJ2D9FBF759F527AF373E34673DC3ACA462,1,14,5,2009,17,12,3,"KERNEL32.DLL, MSVCRT.dll, SHELL32.dll, USER32....","CloseHandle, WaitForSingleObject, CreateEventA...",4222368,"pushal\t, mov\tesi, 0x406000, lea\tedi, [esi -..."
1,DS22_A670D13D4D014169C4080328B8FEB86,0,28,9,2010,8,9,1,"KERNEL32.dll, USER32.dll, ADVAPI32.dll, SHELL3...","CreateFileA, LocalAlloc, Sleep, CreateThread, ...",3426811886,
2,EEE99EC8AA67B05407C01094184C33D2B5A44,1,15,1,2010,17,20,4,"KERNEL32.DLL, MSVCRT.dll, USER32.dll, WS2_32.dll","CloseHandle, WaitForSingleObject, CreateEventA...",4221936,"pushal\t, mov\tesi, 0x406000, lea\tedi, [esi -..."
3,F6655E39465C2FF5B016980D918EA028,1,15,1,2010,17,20,4,"KERNEL32.DLL, MSVCRT.dll, USER32.dll, WS2_32.dll","CloseHandle, WaitForSingleObject, CreateEventA...",4221936,"pushal\t, mov\tesi, 0x406000, lea\tedi, [esi -..."
4,F8437E44748D2C3FCF84019766F4E6DC,1,14,5,2009,17,12,3,"KERNEL32.DLL, MSVCRT.dll, SHELL32.dll, USER32....","CloseHandle, WaitForSingleObject, CreateEventA...",4222368,"pushal\t, mov\tesi, 0x406000, lea\tedi, [esi -..."


In [10]:
# Inspect the dtype of each column
df.dtypes

file                        object
packed                       int64
compilation_day              int64
compilation_month            int64
compilation_year             int64
compilation_hour             int64
compilation_minute           int64
compiltation_day_of_week     int64
dlls                        object
functions                   object
entrypoint_address           int64
assembly_code               object
dtype: object

In [11]:
# Turn the text columns (dlls, functions, assembly_code) into numeric vectors with TF-IDF
def _tfidf_frame(text_column):
    """Fit a fresh TfidfVectorizer on `text_column`.

    Returns the fitted vectorizer and a dense DataFrame with one column per
    learned feature name.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(text_column)
    frame = pd.DataFrame(weights.toarray(), columns=vectorizer.get_feature_names_out())
    return vectorizer, frame


# One independent vectorizer (and feature frame) per text column
vectorizer_dlls, dlls_df = _tfidf_frame(df['dlls'])
vectorizer_functions, functions_df = _tfidf_frame(df['functions'])
vectorizer_assembly, assembly_df = _tfidf_frame(df['assembly_code'])

# Replace the original text columns with their TF-IDF features, appended on the right.
# NOTE(review): the three feature frames share one column namespace, so a token
# learned by more than one vectorizer would produce duplicate column names.
df = pd.concat(
    [df.drop(columns=['dlls', 'functions', 'assembly_code']), dlls_df, functions_df, assembly_df],
    axis=1,
)


In [12]:
# DataFrame with the text columns converted to numeric vectors
df.head()

Unnamed: 0,file,packed,compilation_day,compilation_month,compilation_year,compilation_hour,compilation_minute,compiltation_day_of_week,entrypoint_address,advapi32,...,mov,nop,or,ptr,push,pushal,sar,shl,sub,xor
0,BVJ2D9FBF759F527AF373E34673DC3ACA462,1,14,5,2009,17,12,3,4222368,0.0,...,0.300234,0.123626,0.017661,0.247252,0.017661,0.017661,0.0,0.017661,0.141287,0.035322
1,DS22_A670D13D4D014169C4080328B8FEB86,0,28,9,2010,8,9,1,3426811886,0.215999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EEE99EC8AA67B05407C01094184C33D2B5A44,1,15,1,2010,17,20,4,4221936,0.0,...,0.297781,0.122616,0.017517,0.245231,0.017517,0.017517,0.0,0.017517,0.140132,0.035033
3,F6655E39465C2FF5B016980D918EA028,1,15,1,2010,17,20,4,4221936,0.0,...,0.297781,0.122616,0.017517,0.245231,0.017517,0.017517,0.0,0.017517,0.140132,0.035033
4,F8437E44748D2C3FCF84019766F4E6DC,1,14,5,2009,17,12,3,4222368,0.0,...,0.300234,0.123626,0.017661,0.247252,0.017661,0.017661,0.0,0.017661,0.141287,0.035322


In [13]:
# Drop the executable name column (not relevant for the model).
# Reassigning with errors="ignore" instead of dropping in place keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the
# already-removed column.
df = df.drop(columns=["file"], errors="ignore")

In [14]:
# Final DataFrame
df.head()

Unnamed: 0,packed,compilation_day,compilation_month,compilation_year,compilation_hour,compilation_minute,compiltation_day_of_week,entrypoint_address,advapi32,dll,...,mov,nop,or,ptr,push,pushal,sar,shl,sub,xor
0,1,14,5,2009,17,12,3,4222368,0.0,0.89564,...,0.300234,0.123626,0.017661,0.247252,0.017661,0.017661,0.0,0.017661,0.141287,0.035322
1,0,28,9,2010,8,9,1,3426811886,0.215999,0.893009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,15,1,2010,17,20,4,4221936,0.0,0.889119,...,0.297781,0.122616,0.017517,0.245231,0.017517,0.017517,0.0,0.017517,0.140132,0.035033
3,1,15,1,2010,17,20,4,4221936,0.0,0.889119,...,0.297781,0.122616,0.017517,0.245231,0.017517,0.017517,0.0,0.017517,0.140132,0.035033
4,1,14,5,2009,17,12,3,4222368,0.0,0.89564,...,0.300234,0.123626,0.017661,0.247252,0.017661,0.017661,0.0,0.017661,0.141287,0.035322
