In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os

# === File Paths ===
# Directory that receives the saved train/test .npy splits; created up front
# so the save step at the end of the script cannot fail on a missing folder.
output_dir = r"C:\Users\User\IIoT_IDS_Project\data\splits"
os.makedirs(output_dir, exist_ok=True)

# Dataset name -> raw CSV location. The name is also used as the suffix of
# the saved split files, and insertion order is the processing order.
datasets = {}
datasets["bot-iot"] = r"C:\Users\User\IIoT_IDS_Project\data\raw\bot-iot\UNSW_2018_IoT_Botnet_Final_10_best_Training.csv"
datasets["ton-iot-modbus"] = r"C:\Users\User\IIoT_IDS_Project\data\raw\ton-iot\Train_Test_IoT_Modbus.csv"

# === Preprocess BoT-IoT using pandas chunks ===
def preprocess_bot_iot(file_path, majority_sample_size=50000, chunksize=500000,
                       random_state=42):
    """Load, balance, scale and split the BoT-IoT CSV.

    Pipeline:
      1. Read the CSV in chunks and drop identifier columns.
      2. Label-encode ``proto`` and ``category`` once over the full data.
      3. Downsample the ``attack == 1`` rows to at most
         ``majority_sample_size`` and keep every ``attack == 0`` row.
      4. Stratified 80/20 train/test split.
      5. Fit ``StandardScaler`` on the training split only and apply it to
         both splits.
      6. Oversample the training split only with SMOTE.

    Steps 4-6 are ordered so that neither synthetic samples nor test-set
    statistics reach the held-out split.

    Args:
        file_path: Path to the BoT-IoT CSV file.
        majority_sample_size: Upper bound on the number of ``attack == 1``
            rows kept; capped at the number of such rows available.
        chunksize: Number of CSV rows parsed per chunk.
        random_state: Seed shared by the downsampling, the split and SMOTE
            so a run is reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` values are
        scaled arrays; the ``y`` values hold the ``attack`` labels.
    """
    print("\nProcessing BoT-IoT dataset (pandas chunks)...")

    drop_cols = ['pkSeqID', 'saddr', 'sport', 'daddr', 'dport', 'subcategory']

    # Drop unneeded columns chunk by chunk so the wide raw frame is never
    # held in memory in full.
    chunks = [
        chunk.drop(columns=drop_cols, errors='ignore')
        for chunk in pd.read_csv(file_path, chunksize=chunksize, low_memory=False)
    ]
    df = pd.concat(chunks, ignore_index=True)
    del chunks

    # Encode categoricals once over the whole column. Fitting a new encoder
    # per chunk would give the same string different codes in different
    # chunks whenever the chunks do not contain the same set of values.
    # NOTE(review): `category` looks like it is derived from the attack label
    # (its sibling `subcategory` is dropped above); keeping it as a feature
    # may leak the target - confirm before training on these splits.
    for col in ('proto', 'category'):
        df[col] = LabelEncoder().fit_transform(df[col])

    # Separate features and target
    X = df.drop(columns=['attack'])
    y = df['attack']

    # Downsample majority class (attack == 1) with a seeded generator, never
    # asking for more rows than exist.
    rng = np.random.default_rng(random_state)
    majority_idx = y.index[y == 1]
    minority_idx = y.index[y == 0]
    n_keep = min(majority_sample_size, len(majority_idx))
    majority_downsampled_idx = rng.choice(
        majority_idx.to_numpy(), size=n_keep, replace=False
    )

    X_small = pd.concat([X.loc[majority_downsampled_idx], X.loc[minority_idx]])
    y_small = pd.concat([y.loc[majority_downsampled_idx], y.loc[minority_idx]])

    print(f"Class distribution after downsampling:\n{y_small.value_counts()}")

    # Train/test split first, so the test split contains only real rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X_small, y_small, test_size=0.2, random_state=random_state,
        stratify=y_small
    )

    # Scale features: fit on the training split, reuse for the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Apply SMOTE to the training split only.
    sm = SMOTE(random_state=random_state)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    print(f"Class distribution after SMOTE:\n{pd.Series(y_train).value_counts()}")

    return X_train, X_test, y_train, y_test

# === Preprocess TON-IoT normally ===
def preprocess_ton_iot(file_path):
    """Load, split, scale and (if needed) rebalance the TON-IoT Modbus CSV.

    The ``date``, ``time`` and ``type`` columns are dropped when present and
    ``label`` is used as the target. The data is split 80/20 (stratified)
    before scaling, so the ``StandardScaler`` is fitted on the training
    split only and the test split does not influence the scaling
    statistics. SMOTE is applied to the training split when the smallest
    class has fewer than 40% of the rows of the largest class.

    Args:
        file_path: Path to the TON-IoT Modbus CSV file.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` values are
        scaled arrays; the ``y`` values hold the ``label`` column.
    """
    print("\nProcessing TON-IoT Modbus dataset...")

    df = pd.read_csv(file_path)
    df = df.drop(columns=['date', 'time', 'type'], errors='ignore')

    X = df.drop(columns=['label'])
    y = df['label']

    # Train/test split before any fitting, so test rows stay unseen.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features: fit on the training split, reuse for the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Apply SMOTE if highly imbalanced (minority/majority ratio below 0.4)
    class_counts = y_train.value_counts()
    if class_counts.min() / class_counts.max() < 0.4:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
        print(f"Class distribution after SMOTE:\n{y_train.value_counts()}")
    else:
        print(f"Class distribution (balanced):\n{class_counts}")

    return X_train, X_test, y_train, y_test

# === Run preprocessing for both datasets ===
# Dataset names with a dedicated preprocessor; every other name falls back
# to the TON-IoT routine.
preprocessors = {"bot-iot": preprocess_bot_iot}

for name, path in datasets.items():
    preprocess = preprocessors.get(name, preprocess_ton_iot)
    X_train, X_test, y_train, y_test = preprocess(path)

    # Save splits: one .npy file per array, suffixed with the dataset name.
    outputs = {
        f"X_train_{name}.npy": X_train,
        f"X_test_{name}.npy": X_test,
        f"y_train_{name}.npy": y_train,
        f"y_test_{name}.npy": y_test,
    }
    for filename, array in outputs.items():
        np.save(os.path.join(output_dir, filename), array)

    print(f"{name} preprocessing complete. Files saved in {output_dir}")


ModuleNotFoundError: No module named 'dask'