In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score

In [7]:
import os


files = [
    'smote.csv',
    'adasyn.csv',
    'borderline_smote.csv',
    'tomek_links.csv',
    'smoteenn.csv',
    'smotetomek.csv'
]
target = 'techniques_mitre'

In [5]:
def lstm(file):
    folder = r'..\Dataset_simulazione'
    file_path = os.path.join(folder, file)
    new_df= pd.read_csv(file_path)


    # Initialize LabelEncoder for the target manually
    label_encoder = LabelEncoder()

    # Apply LabelEncoder to the target column
    new_df['techniques_mitre'] = label_encoder.fit_transform(new_df['techniques_mitre'])

    # Encoding the target variable to categorical
    y = new_df['techniques_mitre']
    y_categorical = to_categorical(y)  # Suitable for multi-class classification

    # Prepare features
    X = new_df.drop('techniques_mitre', axis=1).values

    # Reshape X to be suitable for LSTM [samples, timesteps, features]
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
    X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

    # Define the LSTM model
    model = Sequential([
        Input(shape=(X_train.shape[1], 1)),  # Input layer defining the shape
        LSTM(50),  # LSTM layer with 50 units
        Dense(y_categorical.shape[1], activation='softmax')  # Output layer for classification
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

    # Predict the test set
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)  # Convert probabilities to class labels
    y_true = np.argmax(y_test, axis=1)  # True labels

    # Evaluate the model
    accuracy = accuracy_score(y_true, y_pred)
    print(file, "Accuracy:", accuracy)

    # Classification Report - Ensure labels are strings
    target_names = label_encoder.inverse_transform(np.unique(new_df['techniques_mitre']))

    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=target_names.astype(str), zero_division=0))

In [8]:
for file in files:
    lstm(file)

Epoch 1/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3ms/step - accuracy: 0.5372 - loss: 1.3534 - val_accuracy: 0.7050 - val_loss: 0.8991
Epoch 2/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 3ms/step - accuracy: 0.7128 - loss: 0.8726 - val_accuracy: 0.7549 - val_loss: 0.7154
Epoch 3/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 3ms/step - accuracy: 0.7572 - loss: 0.7096 - val_accuracy: 0.7767 - val_loss: 0.6389
Epoch 4/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 3ms/step - accuracy: 0.7780 - loss: 0.6393 - val_accuracy: 0.7855 - val_loss: 0.6065
Epoch 5/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 3ms/step - accuracy: 0.7912 - loss: 0.5954 - val_accuracy: 0.7987 - val_loss: 0.5749
Epoch 6/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 3ms/step - accuracy: 0.7990 - loss: 0.5648 - val_accuracy: 0.8043 - val_loss: 0.540