In [7]:
import os
import pandas as pd

# Root directory that is walked (recursively) for the input CSV files.
DATA_PATH = "../data"   # <-- change to your actual path
# Destination of the combined, label-filtered CSV written by this notebook.
# NOTE(review): this path is inside DATA_PATH and ends in ".csv", so a later
# run that walks DATA_PATH for CSV files may pick it up as an input — confirm
# the loader excludes it, or write the output outside DATA_PATH.
OUTPUT_PATH = '../data/combined_realistic.csv'

In [4]:
# Only keep these realistic/common classes.
# Each loaded file is filtered with `Label.isin(KEEP_LABELS)`, so the strings
# below are matched verbatim (case- and spelling-sensitive) against the data.
# NOTE: 'Infilteration' is intentionally spelled this way — it is the value
# that appears in the 'Label' column; "correcting" it would drop that class.
KEEP_LABELS = [
    'Benign',
    'SSH-Bruteforce',
    'FTP-BruteForce',
    'DoS attacks-Hulk',
    'DoS attacks-SlowHTTPTest',
    'DDOS attack-HOIC',
    'Bot',
    'Infilteration'
]

def get_file_paths(data_path, exclude=('02-20-2018.csv',)):
    """Return the sorted paths of all CSV files found under ``data_path``.

    The directory tree is walked recursively. Files whose base name appears in
    ``exclude`` are skipped; by default that is the corrupted
    ``02-20-2018.csv`` file.

    Args:
        data_path: Root directory to search.
        exclude: Base names (not full paths) of CSV files to leave out. A
            single string is treated as one file name.

    Returns:
        A sorted list of paths, each formed by joining the walked directory
        with the file name.
    """
    # A bare string would otherwise be split into characters by set().
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped_names = set(exclude)

    # The exclusion is decided per file name inside the tree rooted at the
    # ``data_path`` argument, so it does not depend on any module-level
    # constant and works for whatever directory the caller passes in.
    file_paths = [
        os.path.join(root, filename)
        for root, _, filenames in os.walk(data_path)
        for filename in filenames
        if filename.endswith('.csv') and filename not in skipped_names
    ]
    return sorted(file_paths)

# Get file paths.
# OUTPUT_PATH may live inside DATA_PATH (it does with the default config), in
# which case the directory walk would find the combined CSV written by an
# earlier run and feed it back in as an input, duplicating rows. Exclude it.
output_abs_path = os.path.abspath(OUTPUT_PATH)
file_paths = [
    path for path in get_file_paths(DATA_PATH)
    if os.path.abspath(path) != output_abs_path
]
print(f"Found {len(file_paths)} CSV files")

# Read + clean + combine
df_list = []
for file_path in file_paths:
    print(f"Loading: {os.path.basename(file_path)}")
    chunk = pd.read_csv(file_path, low_memory=False)

    # Drop timestamp if exists
    chunk = chunk.drop(columns=['Timestamp'], errors='ignore')

    # Remove header rows accidentally read as data
    chunk = chunk[chunk['Label'] != 'Label']

    # Keep only relevant labels
    chunk = chunk[chunk['Label'].isin(KEEP_LABELS)]

    df_list.append(chunk)

Found 9 CSV files
Loading: 02-14-2018.csv
Loading: 02-15-2018.csv
Loading: 02-16-2018.csv
Loading: 02-21-2018.csv
Loading: 02-22-2018.csv
Loading: 02-23-2018.csv
Loading: 02-28-2018.csv
Loading: 03-01-2018.csv
Loading: 03-02-2018.csv


In [5]:
# Combine all dataframes
combined_df = pd.concat(df_list, ignore_index=True)
print(f"✅ Combined dataset shape: {combined_df.shape}")

# Save to CSV
# os.path.dirname() returns '' when OUTPUT_PATH is a bare file name, and
# os.makedirs('') raises FileNotFoundError, so only create the directory when
# the path actually contains one.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
combined_df.to_csv(OUTPUT_PATH, index=False)
print(f"💾 Final realistic dataset saved to: {OUTPUT_PATH}")

# Quick distribution check
print("\n📊 Class distribution:")
print(combined_df['Label'].value_counts())

✅ Combined dataset shape: (8229039, 79)
💾 Final realistic dataset saved to: ../combined_realistic.csv

📊 Class distribution:
Label
Benign                      6112151
DDOS attack-HOIC             686012
DoS attacks-Hulk             461912
Bot                          286191
FTP-BruteForce               193360
SSH-Bruteforce               187589
Infilteration                161934
DoS attacks-SlowHTTPTest     139890
Name: count, dtype: int64


In [8]:
# Report how large the written CSV is on disk, in GiB (bytes / 2**30).
file_size_gb = os.stat(OUTPUT_PATH).st_size / (1 << 30)
print(f"💽 Saved file size: {file_size_gb:.2f} GB")

💽 Saved file size: 2.66 GB
