# Random Forest 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.inspection import permutation_importance

In [2]:
import os


# Name of the label column that is separated from the features.
target = 'techniques_mitre'

# CSV datasets to evaluate, in the order they are run.
files = [
    'smote.csv', 'adasyn.csv', 'borderline_smote.csv',
    'tomek_links.csv',
    'smoteenn.csv', 'smotetomek.csv',
]

In [3]:
def randomforest(file, folder=None):
    """Train and evaluate a Random Forest classifier on one CSV dataset.

    The CSV is read from ``folder``, the column named by the module-level
    ``target`` is separated from the features, object-typed feature columns
    are integer-encoded with ``LabelEncoder``, and the rows are split 80/20
    into train and test sets with a fixed seed. The accuracy and the
    classification report for the test split are printed.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``folder``.
    folder : str, optional
        Directory that holds the dataset. Defaults to
        ``../Dataset_simulazione`` relative to the working directory.

    Returns
    -------
    float
        Accuracy on the held-out test split.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    KeyError
        If the ``target`` column is missing from the CSV.
    """
    if folder is None:
        # Build the default with os.path.join rather than a literal
        # backslash so the path also resolves on non-Windows systems.
        folder = os.path.join('..', 'Dataset_simulazione')
    file_path = os.path.join(folder, file)
    new_df = pd.read_csv(file_path)

    # Separate the label column from the feature columns.
    X = new_df.drop(target, axis=1)
    y = new_df[target]

    # Replace every object-typed feature column with integer codes,
    # using a fresh encoder per column.
    categorical_columns = X.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        X[column] = LabelEncoder().fit_transform(X[column])

    # NOTE(review): the file names suggest the data was already resampled
    # (SMOTE, ADASYN, ...) before this split; if so, the test split also
    # contains resampled rows -- confirm this is the intended evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fixed random_state keeps the forest reproducible across runs.
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Predict on the held-out split and report the metrics.
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(file, "Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Returned so callers can collect and compare scores across datasets.
    return accuracy

In [4]:
# Run the evaluation once per dataset listed in `files`; each call prints
# that dataset's accuracy and classification report.
for file in files:
    randomforest(file)

smote.csv Accuracy: 0.833830477161854
Classification Report:
                                       precision    recall  f1-score   support

             account_discovery_domain       0.62      0.62      0.62     23003
                               benign       0.45      0.42      0.43     23309
               domain_trust_discovery       0.58      0.61      0.59     23141
               group_policy_discovery       0.97      0.97      0.97     23008
            network_service_discovery       0.97      0.96      0.97     23029
        reconnaissance_scan_ip_blocks       0.97      0.97      0.97     23036
reconnaissance_vulnerability_scanning       0.94      0.98      0.96     22971
     reconnaissance_wordlist_scanning       0.99      0.99      0.99     23201
              remote_system_discovery       1.00      1.00      1.00     23156

                             accuracy                           0.83    207854
                            macro avg       0.83      0.83      0.83