In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score

In [2]:
import os


files = [
    'smote.csv',
    'adasyn.csv',
    'borderline_smote.csv',
    'tomek_links.csv',
    'smoteenn.csv',
    'smotetomek.csv'
]
target = 'techniques_mitre'

# RNN

In [5]:
def rnn(file):
    folder = r'..\Dataset_simulazione'
    file_path = os.path.join(folder, file)
    new_df= pd.read_csv(file_path)

    # Initialize LabelEncoder for the target manually
    label_encoder = LabelEncoder()

    # Apply LabelEncoder to the target column
    new_df['techniques_mitre'] = label_encoder.fit_transform(new_df['techniques_mitre'])

    # Store the label encoder in a dictionary for future use in inverse transforming
    label_encoders = {'techniques_mitre': label_encoder}

    # Encoding the target variable to categorical
    y = new_df['techniques_mitre']
    y_categorical = to_categorical(y)  # Suitable for multi-class classification

    # Prepare features
    X = new_df.drop('techniques_mitre', axis=1).values

    # Reshape X to be suitable for RNN [samples, timesteps, features]
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
    X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

    # Define the RNN model using an Input layer
    model = Sequential([
        Input(shape=(X_train.shape[1], 1)),  # Define the input shape explicitly here
        SimpleRNN(50),  # 50 units
        Dense(y_categorical.shape[1], activation='softmax')  # Output layer nodes = number of categories
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

    # Predict the test set
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)  # Convert probabilities to class labels
    y_true = np.argmax(y_test, axis=1)  # True labels

    # Evaluate the model
    accuracy = accuracy_score(y_true, y_pred)
    print(file, "Accuracy:", accuracy)

    # Classification Report - Ensure labels are strings
    target_names = label_encoder.inverse_transform(np.unique(new_df['techniques_mitre']))

    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=target_names.astype(str), zero_division=0))


In [6]:
for file in files:
    rnn(file)

Epoch 1/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2ms/step - accuracy: 0.5644 - loss: 1.3467 - val_accuracy: 0.6879 - val_loss: 1.0009
Epoch 2/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2ms/step - accuracy: 0.6507 - loss: 1.0947 - val_accuracy: 0.6754 - val_loss: 1.0293
Epoch 3/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 2ms/step - accuracy: 0.6569 - loss: 1.0901 - val_accuracy: 0.6617 - val_loss: 1.1033
Epoch 4/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 1ms/step - accuracy: 0.6896 - loss: 0.9817 - val_accuracy: 0.7276 - val_loss: 0.8529
Epoch 5/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 2ms/step - accuracy: 0.6889 - loss: 0.9620 - val_accuracy: 0.6445 - val_loss: 0.8995
Epoch 6/10
[1m23384/23384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 2ms/step - accuracy: 0.7252 - loss: 0.8240 - val_accuracy: 0.7421 - val_loss: 0.756