# In [None]:
# Adversarial Machine Learning - EDA Notebook

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Use seaborn's "whitegrid" theme for the plots drawn below.
sns.set(style="whitegrid")

# Read the NSL-KDD training CSV into a DataFrame.
file_path = "../02_data/NSL_KDD_train.csv"
df = pd.read_csv(file_path)

# Quick look at the data: dimensions and a preview of the leading rows.
print("Dataset Shape:", df.shape)
print("\nFirst 5 Rows:")
display(df.head(5))

# Count null entries per column to spot incomplete features.
missing_counts = df.isna().sum()
print("\nMissing Values in Each Column:")
print(missing_counts)

# Columns stored with object dtype are the ones treated as categorical.
categorical_cols = df.select_dtypes(include="object").columns
print("\nCategorical Columns:", categorical_cols)

# Encoding categorical features
# Each categorical column gets its own LabelEncoder. The fitted encoders are
# kept in `label_encoders` (keyed by column name) so the integer codes can be
# mapped back to the original strings with `inverse_transform`.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Feature Scaling
# Scale only the feature columns to [0, 1]. The last column is assumed to be
# the target (the same assumption the class-distribution plot makes); running
# it through MinMaxScaler would turn the encoded class ids into arbitrary
# fractions, so it is deliberately left as integer codes.
# NOTE(review): `scaler` is now fitted on the feature columns only - pass the
# same columns when reusing it to transform other data.
feature_cols = df.columns[:-1]
scaler = MinMaxScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# Bar chart of how many rows carry each value of the label column.
# NOTE: the last column of `df` is assumed to be the target.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=df.iloc[:, -1], ax=ax)
ax.set_title("Class Distribution")
# Tilt the x tick labels so neighbouring class names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Write the processed frame to CSV, leaving out the row index.
processed_path = "../02_data/NSL_KDD_train_processed.csv"
df.to_csv(processed_path, index=False)
print("\nProcessed dataset saved successfully!")
