In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

In [1]:
import os


# Name of the label column to predict.
target = 'techniques_mitre'

# Dataset files to benchmark, one model run per entry.
files = [
    # NOTE(review): only entry without a '.csv' suffix — confirm the file name on disk.
    'D1_encoded_categorical',
    'smote.csv',
    'adasyn.csv',
    'borderline_smote.csv',
    'tomek_links.csv',
    'smoteenn.csv',
    'smotetomek.csv',
]

# Gradient Boosting Machines (GBM)

In [4]:
def gbm(file, folder=None, target_column='techniques_mitre'):
    """Train a gradient-boosting classifier on one CSV dataset and print test metrics.

    The CSV is loaded, split 80/20 into train and test sets with a fixed seed,
    a ``GradientBoostingClassifier`` is fitted on the training part, and the
    accuracy plus the per-class classification report for the held-out part
    are printed.

    Parameters
    ----------
    file : str
        File name of the dataset, resolved relative to ``folder``.
    folder : str, optional
        Directory that contains the dataset. Defaults to the
        ``Dataset_simulazione`` directory one level above the working directory.
    target_column : str, optional
        Name of the label column; every other column is used as a feature.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of the loaded file.
    """
    if folder is None:
        # Build the path from components instead of a literal with a backslash,
        # so it resolves on POSIX systems as well as on Windows.
        folder = os.path.join('..', 'Dataset_simulazione')
    file_path = os.path.join(folder, file)
    data = pd.read_csv(file_path)

    if target_column not in data.columns:
        raise KeyError(f"column {target_column!r} not found in {file_path!r}")

    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Split the data into training and testing sets.
    # NOTE(review): the split is not stratified, so rare classes may be unevenly
    # represented in the test set.
    # NOTE(review): if `file` was already resampled before being written to disk,
    # resampled rows also end up in the test set — confirm that is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Gradient Boosting Classifier (fixed seed for reproducible runs)
    gbm_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

    # Fit the model on the training data
    gbm_classifier.fit(X_train, y_train)

    # Predict on the test data
    y_pred = gbm_classifier.predict(X_test)

    # Evaluate the model; zero_division=0 reports 0 instead of warning for
    # classes that were never predicted.
    print(file, "Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

In [4]:
# Train and evaluate one model per dataset file listed in `files`;
# gbm() prints the accuracy and classification report for each.
for file in files:
    gbm(file)

smote.csv Accuracy: 0.8420237281938283
Classification Report:
                                       precision    recall  f1-score   support

             account_discovery_domain       0.68      0.71      0.69     23003
                               benign       0.55      0.47      0.51     23309
               domain_trust_discovery       0.54      0.54      0.54     23141
               group_policy_discovery       0.97      0.96      0.96     23008
            network_service_discovery       0.95      0.97      0.96     23029
        reconnaissance_scan_ip_blocks       0.98      0.98      0.98     23036
reconnaissance_vulnerability_scanning       0.90      0.98      0.94     22971
     reconnaissance_wordlist_scanning       0.99      0.99      0.99     23201
              remote_system_discovery       0.99      0.99      0.99     23156

                             accuracy                           0.84    207854
                            macro avg       0.84      0.84      0.8

In [7]:
# Same evaluation on 'ready.csv'.
# NOTE(review): presumably the original, non-resampled dataset — confirm.
gbm('ready.csv')

ready.csv Accuracy: 0.986729585359427
Classification Report:
                                       precision    recall  f1-score   support

             account_discovery_domain       0.80      0.20      0.32        20
                               benign       0.96      0.99      0.98     12248
               domain_trust_discovery       0.60      0.07      0.12        90
               group_policy_discovery       0.11      0.22      0.15         9
            network_service_discovery       1.00      1.00      1.00     28805
        reconnaissance_scan_ip_blocks       0.94      1.00      0.97        16
reconnaissance_vulnerability_scanning       0.60      0.39      0.47       308
     reconnaissance_wordlist_scanning       1.00      1.00      1.00       138
              remote_system_discovery       1.00      0.01      0.02       113

                             accuracy                           0.99     41747
                            macro avg       0.78      0.54      0.56