In [1]:
import pandas as pd
import glob
import os
from sklearn.preprocessing import MinMaxScaler

# --- 1. Define the path and the sampling fraction ---
path = '../data/N-BaLot/' 
sample_fraction = 0.05 # We'll take a 5% random sample from each file

# --- 2. Loop through files, load a sample from each, and combine ---
all_files = glob.glob(os.path.join(path, "*.csv"))
print(f"Found {len(all_files)} files. Sampling {sample_fraction*100}% from each...")

df_list = []
for i, filename in enumerate(all_files):
    if 'data_summary' in filename or 'device_info' in filename or 'features' in filename:
        continue
    
    print(f"Processing file {i+1}/{len(all_files)}: {os.path.basename(filename)}")
    
    # Load the full file
    df_temp = pd.read_csv(filename)
    
    # Take a random sample from this file
    df_sample_temp = df_temp.sample(frac=sample_fraction, random_state=42)
    
    # Create the label
    if 'benign' in filename:
        df_sample_temp['label'] = 0
    else:
        df_sample_temp['label'] = 1
        
    df_list.append(df_sample_temp)

print("\nCombining all samples...")
df_sample = pd.concat(df_list, ignore_index=True)

print("\nCombined sample created successfully!")
print(f"Total shape of the sample dataset: {df_sample.shape}")
print("\nLabel distribution:")
print(df_sample['label'].value_counts())

# --- 3. Preprocess the final sample ---
X_sample = df_sample.drop('label', axis=1)
y_sample = df_sample['label']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_sample)

print("\nData preprocessing complete.")
print(f"Shape of the final scaled features (X_scaled): {X_scaled.shape}")

Found 92 files. Sampling 5.0% from each...
Processing file 1/92: 7.gafgyt.combo.csv
Processing file 2/92: 6.mirai.scan.csv
Processing file 3/92: 5.gafgyt.junk.csv
Processing file 4/92: 5.gafgyt.combo.csv
Processing file 5/92: 9.mirai.syn.csv
Processing file 7/92: 9.mirai.scan.csv
Processing file 8/92: 9.mirai.udp.csv
Processing file 9/92: 8.mirai.udp.csv
Processing file 10/92: 1.gafgyt.combo.csv
Processing file 11/92: 1.benign.csv
Processing file 12/92: 1.mirai.syn.csv
Processing file 13/92: 3.gafgyt.combo.csv
Processing file 14/92: 9.mirai.udpplain.csv
Processing file 15/92: 6.mirai.syn.csv
Processing file 16/92: 4.mirai.syn.csv
Processing file 17/92: 9.benign.csv
Processing file 18/92: 8.gafgyt.tcp.csv
Processing file 19/92: 9.mirai.ack.csv
Processing file 20/92: 5.gafgyt.scan.csv
Processing file 21/92: 2.mirai.syn.csv
Processing file 22/92: 6.gafgyt.tcp.csv
Processing file 23/92: 7.benign.csv
Processing file 24/92: 8.gafgyt.udp.csv
Processing file 25/92: 5.mirai.ack.csv
Processing f

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# --- 1. Separate features (X) and labels (y) ---
X = df_combined.drop('label', axis=1)
y = df_combined['label']

# --- 2. Create a smaller, representative sample (e.g., 300,000 records) ---
# We use train_test_split as a clever way to get a stratified sample.
# 'stratify=y' ensures the sample has the same percentage of anomalies as the original dataset.
_, X_sample, _, y_sample = train_test_split(X, y, test_size=300000, random_state=42, stratify=y)

print("Created a representative sample of the data.")
print(f"Shape of the sample features (X_sample): {X_sample.shape}")
print(f"Sample label distribution:\n{y_sample.value_counts()}")


# --- 3. Scale the numerical features of the sample ---
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_sample)

print("\nData preprocessing complete.")
print(f"Shape of the final scaled features (X_scaled): {X_scaled.shape}")

NameError: name 'df_combined' is not defined