# Attribute selection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw train and test splits.
train_df = pd.read_csv('../data/raw/train_data.csv')
test_df = pd.read_csv('../data/raw/test_data.csv')

# Absolute pairwise correlation between the numeric columns of the training set.
corr_matrix = train_df.corr(numeric_only=True).abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that every
# feature pair is looked at once; all other cells become NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is flagged when its correlation with any column that precedes it in
# the matrix is greater than 0.95 (NaN cells compare as False).
is_redundant = (upper > 0.95).any(axis=0)
to_drop = upper.columns[is_redundant.to_numpy()].tolist()

print(f"Features to drop ({len(to_drop)}): {to_drop}")

# Remove the flagged columns from both splits.
train_df = train_df.drop(columns=to_drop)
test_df = test_df.drop(columns=to_drop)

Features to drop (24): ['bwd_data_pkts_tot', 'bwd_pkts_per_sec', 'flow_pkts_per_sec', 'bwd_header_size_tot', 'bwd_header_size_max', 'flow_ACK_flag_count', 'bwd_pkts_payload.tot', 'bwd_pkts_payload.std', 'flow_pkts_payload.max', 'flow_pkts_payload.tot', 'fwd_iat.tot', 'fwd_iat.std', 'flow_iat.max', 'flow_iat.tot', 'flow_iat.std', 'payload_bytes_per_second', 'bwd_subflow_bytes', 'fwd_bulk_packets', 'bwd_bulk_packets', 'active.std', 'idle.min', 'idle.max', 'idle.tot', 'idle.avg']


### Why remove features with correlation > 0.95?

We remove features whose **absolute** pairwise correlation with another feature is higher than **0.95** (strong negative correlations count too) to reduce **multicollinearity**.

When two variables are highly correlated (almost identical), they provide redundant information to the model. Keeping both doesn't improve performance but increases computational cost and can make some models unstable. By removing one of them (here, the column that appears later in the dataset), we simplify the dataset without losing significant information.

# Feature engineering 

In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import os

categorical_cols = ['proto', 'service']
target_col = 'Attack_type'

# One-hot encode the categorical features of each split.
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)

# Keep only the columns present in both splits so they share the same features
# (a dummy column that exists in just one split is discarded by the inner join).
train_df, test_df = train_df.align(test_df, join='inner', axis=1)

# Label-encode the target: classes are learned from train, then applied to test.
le = LabelEncoder()
train_df[target_col] = le.fit_transform(train_df[target_col])
test_df[target_col] = le.transform(test_df[target_col])

# Combine train and test for splitting (80/20 split).
# ignore_index=True gives every row a unique label; without it both frames
# contribute the same row labels and later label-based operations become fragile.
full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
X = full_df.drop(target_col, axis=1)
y = full_df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features AFTER the split: the scaler is fitted on the training
# rows only and then applied to the held-out rows, so no statistics from the
# test set leak into preprocessing. (Fitting before the re-split, as was done
# previously, let rows that end up in the test set influence the scaler.)
# NOTE: train_df / test_df are therefore left unscaled by this cell.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
# Work on explicit copies so the column assignment below never targets a view.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Reconstruct DataFrames for saving (features first, target as the last column).
train_processed = pd.concat([X_train, y_train], axis=1)
test_processed = pd.concat([X_test, y_test], axis=1)

# Save processed data
os.makedirs('../data/processed', exist_ok=True)
train_processed.to_csv('../data/processed/train_processed.csv', index=False)
test_processed.to_csv('../data/processed/test_processed.csv', index=False)

print(f"Train shape: {train_processed.shape}, Test shape: {test_processed.shape}")

Train shape: (93568, 71), Test shape: (23393, 71)
