In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report


In [None]:
def generate_sample_dataset(repeats=200):
    """Build a small synthetic opcode dataset for demonstration purposes.

    Five hand-written "malware" opcode strings and five "benign" ones are each
    repeated ``repeats`` times. All malware rows (label 1) come first, followed
    by all benign rows (label 0).

    Note:
        The frame only ever contains ten distinct opcode strings, so any
        random train/test split of it puts identical sequences on both sides.
        Metrics computed on such a split say nothing about generalization.

    Args:
        repeats: How many times each template list is repeated. The default of
            200 yields 2000 rows. Must be a non-negative integer.

    Returns:
        pandas.DataFrame with two columns: ``"opcode"`` (space-separated
        opcode string) and ``"label"`` (1 = malware, 0 = benign).

    Raises:
        ValueError: If ``repeats`` is negative.
    """
    if repeats < 0:
        raise ValueError("repeats must be non-negative")

    malware = [
        "mov push pop call mov xor ret",
        "jmp call xor xor mov ret",
        "push mov xor add sub call jmp ret",
        "jmp xor xor xor xor xor ret",
        "push call mov add ret"
    ]

    benign = [
        "mov add sub cmp jne ret",
        "mov push call add ret",
        "add mov cmp jne jmp ret",
        "mov push add cmp jz ret",
        "sub add cmp je ret"
    ]

    data = malware * repeats + benign * repeats
    # One label per row, in the same malware-then-benign order as `data`.
    labels = [1] * (len(malware) * repeats) + [0] * (len(benign) * repeats)
    return pd.DataFrame({"opcode": data, "label": labels})

def preprocess_data(df, max_len=20):
    """Turn the opcode strings in ``df`` into fixed-length integer sequences.

    A fresh Keras ``Tokenizer`` is fitted on ``df["opcode"]``, the same texts
    are converted to integer sequences, and those are padded/truncated to
    ``max_len`` with ``pad_sequences`` (library defaults).

    Args:
        df: DataFrame with an ``"opcode"`` text column and a ``"label"`` column.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(X, y, vocab_size, tokenizer)``: the padded sequence array,
        the labels as a NumPy array, ``len(tokenizer.word_index) + 1``, and
        the fitted tokenizer.
    """
    opcode_texts = df["opcode"]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(opcode_texts)

    X = pad_sequences(tokenizer.texts_to_sequences(opcode_texts), maxlen=max_len)
    y = np.array(df["label"])

    # word_index is 1-based (index 0 is reserved by the Keras Tokenizer),
    # so the embedding needs one more row than there are words.
    vocab_size = len(tokenizer.word_index) + 1

    return X, y, vocab_size, tokenizer

# Build the LSTM classifier

def build_model(input_length, vocab_size):
    """Assemble and compile the binary LSTM classifier.

    The stack is: Embedding (64-dim) -> SpatialDropout1D(0.2) ->
    LSTM(64, dropout and recurrent dropout 0.2) -> Dense(32, relu) ->
    Dense(1, sigmoid). It is compiled with binary cross-entropy loss, the
    Adam optimizer and accuracy as the tracked metric.

    Args:
        input_length: Length of each input sequence, passed to the embedding.
        vocab_size: Number of rows in the embedding table.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(vocab_size, 64, input_length=input_length),
        SpatialDropout1D(0.2),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(32, activation='relu'),
        # Single sigmoid unit: one output value per sample.
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layer_stack)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
def train_model(X, y, vocab_size, model_path="models/malware_lstm_model.h5"):
    """Train the LSTM classifier, print a test-set report and save the model.

    The data is split 80/20 (fixed ``random_state=42``). The model from
    ``build_model`` is fitted for 10 epochs with a batch size of 32, holding
    out 10% of the training portion for validation. Test-set predictions are
    thresholded at 0.5 and summarized with ``classification_report``.

    Args:
        X: 2-D array of padded sequences; ``X.shape[1]`` is used as the
            model's input length.
        y: Array of binary labels aligned with ``X``.
        vocab_size: Vocabulary size forwarded to ``build_model``.
        model_path: Destination file for the saved model. Its parent
            directory is created if it does not exist yet.

    Returns:
        None. The report is printed and the model is written to ``model_path``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = build_model(X.shape[1], vocab_size)
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=1)

    # The model ends in Dense(1), so predict() yields an (n, 1) array;
    # flatten it so it has the same 1-D shape as y_test.
    y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # Saving fails if the target directory is missing, so create it first.
    out_dir = os.path.dirname(model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    model.save(model_path)
    print("Model saved.")

In [None]:
# End-to-end driver: generate the synthetic data, preprocess it, then train,
# evaluate and save the model. Guarded so it only runs when this code is
# executed as the main module.
if __name__ == "__main__":
    # Synthetic opcode strings with binary labels.
    df = generate_sample_dataset()
    # Padded integer sequences, labels, vocabulary size and the fitted
    # tokenizer (the tokenizer is not used again within this block).
    X, y, vocab_size, tokenizer = preprocess_data(df)
    # Fits the model, prints the classification report and saves the model.
    train_model(X, y, vocab_size)