In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [2]:
import os


# Resampled datasets to evaluate: one CSV file per class-balancing technique.
files = [
    "smote.csv",
    "adasyn.csv",
    "borderline_smote.csv",
    "tomek_links.csv",
    "smoteenn.csv",
    "smotetomek.csv",
]

# Name of the label column that the classifier predicts.
target = "techniques_mitre"

In [3]:
def logistic(file, folder=None):
    """Train and evaluate a class-weighted logistic regression on one dataset.

    Reads ``file`` from the dataset folder, label-encodes every object-typed
    feature column and the target column (the module-level ``target``), holds
    out 20% of the rows as a test set, standardises the features, fits the
    model, and prints the test accuracy followed by a per-class
    classification report.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``folder``.
    folder : str, optional
        Directory that contains the CSV files. Defaults to
        ``../Dataset_simulazione`` relative to the working directory.

    Raises
    ------
    ValueError
        If the target column is not present in the loaded DataFrame.
    """
    if folder is None:
        # Built with os.path.join instead of a literal backslash so the path
        # resolves on every OS, not only on Windows.
        folder = os.path.join('..', 'Dataset_simulazione')
    file_path = os.path.join(folder, file)
    new_df = pd.read_csv(file_path)

    # Fail fast, before doing any encoding work on the features.
    if target not in new_df.columns:
        raise ValueError("Column target not found in the DataFrame")

    # Encode categorical (object-typed) features; the target is handled below
    # with its own encoder so the class names can be recovered for the report.
    categorical_columns = [
        column
        for column in new_df.select_dtypes(include=['object']).columns
        if column != target
    ]
    for column in categorical_columns:
        new_df[column] = LabelEncoder().fit_transform(new_df[column])

    le_target = LabelEncoder()
    new_df[target] = le_target.fit_transform(new_df[target])

    # Prepare features and target
    X = new_df.drop(target, axis=1)
    y = new_df[target]

    # Split BEFORE scaling: the scaler must learn its mean/std from the
    # training rows only, otherwise test-set statistics leak into training.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize and train logistic regression model with class weight
    model = LogisticRegression(max_iter=10000, class_weight='balanced')
    model.fit(X_train, y_train)

    # Predict and evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(file, "Accuracy:", accuracy)

    # Pass the full label set explicitly: if a class happens to be absent from
    # the test split, classification_report would otherwise reject
    # target_names for having a different length than the observed labels.
    class_labels = np.arange(len(le_target.classes_))
    target_names = le_target.classes_.astype(str)

    print("Classification Report:")
    print(
        classification_report(
            y_test,
            y_pred,
            labels=class_labels,
            target_names=target_names,
            zero_division=0,
        )
    )

In [4]:
# Evaluate every dataset listed in `files`; each call prints that file's
# accuracy followed by its classification report.
for file in files:
    logistic(file)

smote.csv Accuracy: 0.7380420872343088
Classification Report:
                                       precision    recall  f1-score   support

             account_discovery_domain       0.63      0.72      0.67     23003
                               benign       0.35      0.12      0.18     23309
               domain_trust_discovery       0.47      0.17      0.25     23141
               group_policy_discovery       0.92      0.90      0.91     23008
            network_service_discovery       0.62      0.89      0.73     23029
        reconnaissance_scan_ip_blocks       0.88      0.96      0.92     23036
reconnaissance_vulnerability_scanning       0.89      0.91      0.90     22971
     reconnaissance_wordlist_scanning       0.99      0.99      0.99     23201
              remote_system_discovery       0.61      0.97      0.75     23156

                             accuracy                           0.74    207854
                            macro avg       0.71      0.74      0.7