In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [10]:
import os


# Name of the label column the classifier is trained to predict.
target = 'techniques_mitre'

# CSV datasets to evaluate, one model run per file. These are bare file
# names; the containing folder is joined on when each file is loaded.
files = [
    'smote.csv', 'adasyn.csv', 'borderline_smote.csv',
    'tomek_links.csv', 'smoteenn.csv', 'smotetomek.csv',
]

# XGBoost classifier

In [11]:
def xgb(file, folder=None):
    """Train and evaluate an XGBoost classifier on one CSV dataset.

    The CSV is loaded, object-typed feature columns and the target column
    are label-encoded, the rows are split 80/20 into train and test sets,
    features are standardized, and an ``XGBClassifier`` is fitted on the
    training part. The file name, test accuracy and a per-class
    classification report are printed.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``folder``.
    folder : str, optional
        Directory containing ``file``. Defaults to ``../Dataset_simulazione``
        relative to the current working directory.

    Returns
    -------
    float
        Accuracy on the held-out test split.

    Raises
    ------
    ValueError
        If the module-level ``target`` column is not present in the CSV.

    Notes
    -----
    NOTE(review): the file names suggest the datasets were resampled before
    being saved. If so, the test split drawn here contains resampled rows
    and the reported metrics may be optimistic -- confirm against how the
    CSVs were produced.
    """
    if folder is None:
        # Build the path from components so it also resolves on POSIX
        # systems, where a backslash is not a path separator.
        folder = os.path.join('..', 'Dataset_simulazione')
    file_path = os.path.join(folder, file)
    new_df = pd.read_csv(file_path)

    # Fail fast, before doing any encoding work, if the label is missing.
    if target not in new_df.columns:
        raise ValueError("Column target not found in the DataFrame")

    # Label-encode every object-typed feature column (the target is handled
    # separately so its encoder can be reused for the report).
    categorical_columns = [
        column
        for column in new_df.select_dtypes(include=['object']).columns
        if column != target
    ]
    for column in categorical_columns:
        new_df[column] = LabelEncoder().fit_transform(new_df[column])

    # Encode the target as consecutive integers starting at 0.
    le_target = LabelEncoder()
    new_df[target] = le_target.fit_transform(new_df[target])

    # Prepare features and target
    X = new_df.drop(target, axis=1)
    y = new_df[target]

    # Split BEFORE scaling: the split depends only on the row count and
    # random_state, so the partition is the same as splitting scaled data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only so that no statistics of the
    # test rows leak into preprocessing; then apply it to both parts.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize and fit the XGBoost classifier
    xgb_classifier = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, eval_metric='mlogloss')
    xgb_classifier.fit(X_train, y_train)

    # Predict on the test data and evaluate
    y_pred = xgb_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(file)
    print(f"Accuracy: {accuracy}")

    # Classification report. Passing `labels` explicitly keeps the report
    # aligned with `target_names` even when some class is absent from both
    # y_test and y_pred (otherwise the name/label counts would mismatch).
    class_labels = list(range(len(le_target.classes_)))
    # classes_[i] is the original value encoded as i; cast to str so the
    # report can format names even if the raw labels are not strings.
    target_names = [str(name) for name in le_target.classes_]
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=class_labels, target_names=target_names, zero_division=0))

    return accuracy


In [12]:
# Run the XGBoost evaluation once per dataset listed in `files`; each call
# prints the file name, its test accuracy and a classification report.
for file in files:
    xgb(file)

smote.csv
Accuracy: 0.8387570121335168
Classification Report:
                                       precision    recall  f1-score   support

             account_discovery_domain       0.69      0.69      0.69     23003
                               benign       0.55      0.49      0.51     23309
               domain_trust_discovery       0.53      0.53      0.53     23141
               group_policy_discovery       0.96      0.95      0.95     23008
            network_service_discovery       0.93      0.97      0.95     23029
        reconnaissance_scan_ip_blocks       0.98      0.98      0.98     23036
reconnaissance_vulnerability_scanning       0.90      0.98      0.94     22971
     reconnaissance_wordlist_scanning       0.99      0.99      0.99     23201
              remote_system_discovery       0.99      0.98      0.98     23156

                             accuracy                           0.84    207854
                            macro avg       0.83      0.84      0.8