In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

In [2]:
import os


# File names of the dataset CSVs to evaluate. Judging by the names, each one
# is the output of a different resampling technique (TODO confirm).
files = [
    'smote.csv', 'adasyn.csv', 'borderline_smote.csv',
    'tomek_links.csv',
    'smoteenn.csv', 'smotetomek.csv',
]

# Name of the label column in the dataset CSVs.
target = 'techniques_mitre'

# KNN: k-nearest-neighbours classification on each dataset file

In [3]:
def knn(file, folder=None, target='techniques_mitre', n_neighbors=5,
        test_size=0.2, random_state=42):
    """Train and evaluate a k-nearest-neighbours classifier on one CSV file.

    The CSV is loaded, the label column is integer-encoded, the rows are split
    into train/test parts, the features are standardised (fit on the training
    part only) and a ``KNeighborsClassifier`` is fitted. The accuracy and a
    per-class classification report for the test part are printed.

    Parameters
    ----------
    file : str
        File name of the CSV, relative to ``folder``.
    folder : str, optional
        Directory holding the CSV. Defaults to ``../Dataset_simulazione``
        relative to the current working directory.
    target : str, optional
        Name of the label column. Every other column is used as a feature.
    n_neighbors : int, optional
        Number of neighbours used by the classifier.
    test_size : float, optional
        Fraction of rows held out for testing.
    random_state : int, optional
        Seed for the train/test split.

    Returns
    -------
    float
        Accuracy on the held-out test split (the same value that is printed).

    Raises
    ------
    KeyError
        If ``target`` is not a column of the loaded CSV.
    """
    if folder is None:
        # Build the default from path components instead of a literal with a
        # backslash, so the same relative location resolves on every OS.
        folder = os.path.join('..', 'Dataset_simulazione')
    file_path = os.path.join(folder, file)
    dataset = pd.read_csv(file_path)

    if target not in dataset.columns:
        raise KeyError(f"target column {target!r} not found in {file_path}")

    # Encode the string labels as integers. The encoded array is kept separate
    # so the loaded frame is not modified.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(dataset[target])

    # Every remaining column is treated as a feature.
    X = dataset.drop(columns=target).values

    # Split the data into training and testing sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Standardise the features. The scaler is fitted on the training part only
    # so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Named `model` rather than `knn` so the local does not shadow this function.
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(file, "Accuracy:", accuracy)

    # Pass the full list of encoded class ids together with their names.
    # Without `labels`, classification_report infers the classes from
    # y_test/y_pred and raises ValueError when a class is absent from both,
    # because the number of target_names then no longer matches.
    class_ids = np.arange(len(label_encoder.classes_))
    target_names = label_encoder.classes_.astype(str)

    print("Classification Report:")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=target_names,
        zero_division=0,
    ))

    return accuracy

In [4]:
# Evaluate every CSV listed in `files`; each knn() call prints that file's
# accuracy followed by its classification report.
for file in files:
    knn(file)

smote.csv Accuracy: 0.8245547355355202
Classification Report:
                                       precision    recall  f1-score   support

             account_discovery_domain       0.59      0.64      0.61     23003
                               benign       0.46      0.39      0.42     23309
               domain_trust_discovery       0.50      0.51      0.50     23141
               group_policy_discovery       0.97      0.97      0.97     23008
            network_service_discovery       0.97      0.98      0.97     23029
        reconnaissance_scan_ip_blocks       0.98      0.98      0.98     23036
reconnaissance_vulnerability_scanning       0.95      0.97      0.96     22971
     reconnaissance_wordlist_scanning       0.99      0.99      0.99     23201
              remote_system_discovery       0.99      0.99      0.99     23156

                             accuracy                           0.82    207854
                            macro avg       0.82      0.83      0.8