In [7]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
import os

# Attack classes to process; each one is paired with BENIGN traffic in a file
# named "<attack>_vs_BENIGN.csv" inside `output_dir`.
attack_types = [
    "Bot",
    "DDoS",
    "DoS GoldenEye",
    "DoS Hulk",
    "DoS Slowhttptest",
    "DoS slowloris",
    "FTP-Patator",
    "Heartbleed",
    "Infiltration",
    "PortScan",
    "SSH-Patator",
    "Web Attack – Brute Force",
    "Web Attack – Sql Injection",
    "Web Attack – XSS",
]
benign_type = "BENIGN"

# Directory holding the per-attack CSV files. It is expected to exist already;
# exist_ok=True makes the call a no-op in that case and avoids the race between
# a separate "exists?" check and the creation.
output_dir = "attacks"
os.makedirs(output_dir, exist_ok=True)

# Undersample every "<attack>_vs_BENIGN.csv" file found in `output_dir`.
# NOTE: each file is overwritten in place with its resampled version, so the
# original (imbalanced) rows are no longer available from this directory.
for attack_type in attack_types:
    input_filename = f"{attack_type}_vs_{benign_type}.csv"
    file_path = os.path.join(output_dir, input_filename)

    try:
        combined_data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}, skipping...")
        continue

    # The label column appears both with and without a leading space.
    # Remember which spelling this file uses so it is written back unchanged
    # instead of being silently renamed.
    label_col = next(
        (name for name in (" Label", "Label") if name in combined_data.columns),
        None,
    )
    if label_col is None:
        print(f"Label column not found in {file_path}, skipping...")
        continue

    # Separate features (X) and labels (y)
    X = combined_data.drop(columns=[label_col])
    y = combined_data[label_col]

    # Undersampling needs at least two classes to balance against each other.
    # nunique() is vectorised and counts missing labels as a single class.
    if y.nunique(dropna=False) <= 1:
        print(f"Skipping {attack_type}, not enough classes")
        continue

    # Per the imbalanced-learn docs, a float sampling_strategy is the desired
    # minority/majority ratio after resampling: 0.5 keeps roughly two majority
    # rows for every minority row. random_state makes the selection repeatable.
    sampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
    try:
        X_resampled, y_resampled = sampler.fit_resample(X, y)
    except ValueError as e:
        # Raised when the requested ratio cannot be applied to this file's
        # class counts; leave the file untouched and move on.
        print(f"Undersampling failed for {attack_type}: {e}")
        continue

    # Rebuild a single frame with the label as the last column, under the same
    # column name the input file used.
    resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
    resampled_data[label_col] = y_resampled

    # Save the resampled data back to the original file
    resampled_data.to_csv(file_path, index=False)
    print(f"Resampled and replaced {file_path}")


In [6]:
import matplotlib.pyplot as plt
import pandas as pd
import os

output_dir = "attacks"

# Show the class distribution of every "<attack>_vs_BENIGN.csv" file after
# undersampling: one bar chart plus a short text summary per file.
for attack_type in attack_types:
    input_filename = f"{attack_type}_vs_{benign_type}.csv"
    file_path = os.path.join(output_dir, input_filename)

    try:
        combined_data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}, skipping...")
        continue

    # The label column appears both with and without a leading space.
    label_col = ' Label' if ' Label' in combined_data.columns else 'Label'
    if label_col not in combined_data.columns:
        print(f"Label column not found in {file_path}, skipping...")
        continue

    # Count the number of instances for each class
    class_counts = combined_data[label_col].value_counts()
    if class_counts.empty:
        print(f"No rows in {file_path}, skipping...")
        continue

    # value_counts() orders classes by frequency, not by name, so pick each
    # bar's colour from its label rather than from its position.
    bar_colors = [
        "green" if label == benign_type else "red"
        for label in class_counts.index
    ]

    fig, ax = plt.subplots(figsize=(8, 6))
    class_counts.plot(kind="bar", color=bar_colors, ax=ax)
    ax.set_title(f"Class Distribution for {attack_type}")
    ax.set_xlabel("Class")
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", labelrotation=45)
    # One legend entry per bar, so every entry carries that bar's own colour
    # and the label actually present in the data.
    ax.legend(list(ax.patches), [str(label) for label in class_counts.index])
    fig.tight_layout()
    plt.show()
    # Release the figure; otherwise figures can accumulate across iterations.
    plt.close(fig)

    print(f"File: {input_filename}")
    print(class_counts)
    print("Shape:", combined_data.shape)
    print("-----------------------------")
