# Model Prototyping and Data Preprocessing

In [None]:
import os

import joblib
import numpy as np
import pandas as pd

from src.data_preprocessing import preprocess_data

# Read the raw CICDDoS2019 CSV into `df`. When the file is absent, `df` is set
# to None so the steps below can be skipped instead of raising.
raw_csv_path = '../data/raw/CICDDoS2019.csv'
try:
    df = pd.read_csv(raw_csv_path)
except FileNotFoundError:
    df = None
    print("Error: CICDDoS2019.csv not found. Please ensure the dataset is in the 'data/raw/' directory.")
else:
    # Only reached when read_csv returned without raising.
    print("Dataset loaded successfully.")

if df is not None:
    # Preprocess the data. preprocess_data is a project helper whose internals
    # are not visible here; it returns the seven values unpacked below.
    # NOTE(review): the names suggest a resampled train split plus a fitted
    # scaler and feature selector -- confirm against src.data_preprocessing.
    (
        X_train_resampled,
        X_test,
        y_train_resampled,
        y_test,
        scaler,
        selector,
        selected_features,
    ) = preprocess_data(df, target_column='Label')

    print("\n--- Preprocessing Complete ---")
    print(f"Shape of X_train_resampled: {X_train_resampled.shape}")
    print(f"Shape of X_test: {X_test.shape}")
    print(f"Selected features: {selected_features}")

    # Make sure the output directories exist before writing. Neither
    # DataFrame.to_csv nor joblib.dump creates missing parent directories, so
    # on a fresh checkout the saves below would fail after the (potentially
    # long) preprocessing has already run.
    os.makedirs('../data/processed', exist_ok=True)
    os.makedirs('../models', exist_ok=True)

    # Save processed data
    X_train_resampled.to_csv('../data/processed/train_data_features.csv', index=False)
    y_train_resampled.to_csv('../data/processed/train_data_labels.csv', index=False)
    X_test.to_csv('../data/processed/test_data_features.csv', index=False)
    y_test.to_csv('../data/processed/test_data_labels.csv', index=False)

    # Save scaler and feature selector for later use in prediction
    joblib.dump(scaler, '../models/minmax_scaler.pkl')
    joblib.dump(selector, '../models/feature_selector.pkl')
    joblib.dump(selected_features, '../models/selected_features.pkl')

    print("Processed data, scaler, and feature selector saved successfully to 'data/processed/' and 'models/' directories.")

    # --- XGBoost Model Training (Placeholder) ---
    print("\n--- XGBoost Model Training (Next Step) ---")
    # This section will be filled in model_training.py

    # --- LSTM Autoencoder Model Training (Placeholder) ---
    print("\n--- LSTM Autoencoder Model Training (Next Step) ---")
    # This section will be filled in model_training.py
