In [1]:
import pandas as pd
import glob
import os

# --- 1. Define the path to the data folder ---
# The '../' tells Python to go up one directory from the 'notebooks' folder
path = '../data/N-BaLot/'

# Substrings that identify CSVs in the folder which are not data files
# and therefore must not be loaded.
NON_DATA_MARKERS = ('data_summary', 'device_info', 'features')

# --- 2. Get a list of all CSV files in the folder ---
# glob returns matches in an arbitrary, filesystem-dependent order. Sorting
# pins the row order of df_combined, so anything seeded that runs on it later
# (e.g. a random_state-based sample) gives the same result on every machine.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# --- 3. Loop through the files, load them, and add a label ---
df_list = []
for filename in all_files:
    # Inspect only the file's own name: matching against the full path would
    # let a parent directory containing e.g. 'features' or 'benign' silently
    # skip every file or mislabel every file.
    base_name = os.path.basename(filename)

    # Skip non-data files
    if any(marker in base_name for marker in NON_DATA_MARKERS):
        continue

    # Read the current CSV file
    df_temp = pd.read_csv(filename)

    # Create the label: 0 for benign, 1 for attack
    df_temp['label'] = 0 if 'benign' in base_name else 1

    df_list.append(df_temp)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder is wrong or empty.
if not df_list:
    raise FileNotFoundError(
        f"No data CSV files found in {os.path.abspath(path)!r}; "
        "check that `path` points at the dataset folder."
    )

# --- 4. Combine all the individual dataframes into one ---
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df_combined = pd.concat(df_list, ignore_index=True)

# --- 5. Verify the result ---
print("All files loaded and combined successfully!")
print(f"Total shape of the new dataset: {df_combined.shape}")
print("\nLabel distribution:")
print(df_combined['label'].value_counts())
print("\nFirst 5 rows of the combined dataset:")
print(df_combined.head())

All files loaded and combined successfully!
Total shape of the new dataset: (7062606, 116)

Label distribution:
label
1    6506674
0     555932
Name: count, dtype: int64

First 5 rows of the combined dataset:
   MI_dir_L5_weight  MI_dir_L5_mean  MI_dir_L5_variance  MI_dir_L3_weight  \
0          1.000000       98.000000        0.000000e+00          1.000000   
1          1.029000       98.000000        1.818989e-12          1.119520   
2          1.504156       76.725612        2.281808e+02          1.729662   
3          2.460087       75.617679        1.372200e+02          2.699075   
4          3.460055       75.150149        9.809937e+01          3.699054   

   MI_dir_L3_mean  MI_dir_L3_variance  MI_dir_L1_weight  MI_dir_L1_mean  \
0       98.000000            0.000000          1.000000       98.000000   
1       98.000000            0.000000          1.492583       98.000000   
2       79.499272          249.746357          2.294102       84.051188   
3       77.461807          1

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Number of rows to keep in the working sample, and the seed that makes the
# draw repeatable. Kept as named constants so they are easy to find and tune.
SAMPLE_SIZE = 300000
RANDOM_STATE = 42

# --- 1. Separate features (X) and labels (y) ---
X = df_combined.drop('label', axis=1)
y = df_combined['label']

# --- 2. Create a smaller, representative sample (SAMPLE_SIZE records) ---
# train_test_split is used purely as a stratified sampler: the "test" part is
# the sample we keep and the "train" part is discarded. 'stratify=y' keeps the
# benign/attack ratio of the sample equal to that of the full dataset.
if len(X) > SAMPLE_SIZE:
    _, X_sample, _, y_sample = train_test_split(
        X, y, test_size=SAMPLE_SIZE, random_state=RANDOM_STATE, stratify=y
    )
else:
    # train_test_split raises ValueError when an integer test_size is not
    # smaller than the number of rows, so a dataset that is already small
    # enough is used in full (unshuffled) instead of crashing.
    X_sample, y_sample = X, y

print("Created a representative sample of the data.")
print(f"Shape of the sample features (X_sample): {X_sample.shape}")
print(f"Sample label distribution:\n{y_sample.value_counts()}")


# --- 3. Scale the numerical features of the sample ---
# MinMaxScaler rescales each column to the [0, 1] range seen during fit.
# NOTE(review): the scaler is fitted on the whole sample; if a later cell
# splits X_scaled into train/test sets, fit on the training part only to
# avoid leaking test-set statistics — confirm against downstream cells.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_sample)

print("\nData preprocessing complete.")
print(f"Shape of the final scaled features (X_scaled): {X_scaled.shape}")