# Exploratory Data Analysis (EDA) for the CICDDoS2019 Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV into `df`. A missing file is reported on stdout and leaves
# `df` as None so the analysis below can be skipped instead of crashing.
try:
    df = pd.read_csv('../data/raw/CICDDoS2019.csv')
except FileNotFoundError:
    # Fall back to a sentinel rather than re-raising.
    df = None
    print("Error: CICDDoS2019.csv not found. Please ensure the dataset is in the 'data/raw/' directory.")
else:
    # Only reached when read_csv returned without raising.
    print("Dataset loaded successfully.")

# Exploratory report: printed summaries followed by a few plots.
# Runs only when the load step produced a DataFrame (`df` is not None).
if df is not None:
    # Display basic information
    print("\n--- Dataset Info ---")
    df.info()

    print("\n--- First 5 Rows ---")
    print(df.head())

    print("\n--- Dataset Description ---")
    print(df.describe())

    # Check for missing values (only columns that actually have some)
    print("\n--- Missing Values ---")
    missing_values = df.isnull().sum()
    print(missing_values[missing_values > 0])

    # Check for infinite values (only columns that actually have some)
    print("\n--- Infinite Values ---")
    infinite_values = df.isin([np.inf, -np.inf]).sum()
    print(infinite_values[infinite_values > 0])

    # Distribution of attack types
    print("\n--- Distribution of Labels ---")
    # Guarded like 'Flow Duration' below so a differently named label column
    # does not abort the rest of the report with a KeyError.
    # NOTE(review): some exports of this dataset may pad header names with
    # whitespace (e.g. ' Label') -- confirm against the actual CSV.
    if 'Label' in df.columns:
        print(df['Label'].value_counts())

        plt.figure(figsize=(12, 6))
        sns.countplot(data=df, x='Label')
        plt.title('Distribution of Attack Types')
        plt.xticks(rotation=45)
        plt.show()
    else:
        print("Column 'Label' not found; skipping label distribution.")

    # Example: Distribution of a numerical feature (e.g., 'Flow Duration')
    if 'Flow Duration' in df.columns:
        plt.figure(figsize=(10, 5))
        sns.histplot(df['Flow Duration'], bins=50, kde=True)
        plt.title('Distribution of Flow Duration')
        plt.xlabel('Flow Duration')
        plt.ylabel('Frequency')
        plt.show()

    # Correlation matrix (for numerical features)
    print("\n--- Correlation Matrix (Top 10 features) ---")
    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    if len(numerical_cols) > 1:
        # Infinite entries (reported above) would poison the correlations of
        # their columns, so treat them as missing before computing.
        numeric_data = df[numerical_cols].replace([np.inf, -np.inf], np.nan)
        corr_matrix = numeric_data.corr()

        # Rank features by absolute correlation with a reference column.
        # Prefer 'Flow Duration'; fall back to the first numeric column when
        # it is absent instead of raising KeyError.
        if 'Flow Duration' in corr_matrix.columns:
            anchor = 'Flow Duration'
        else:
            anchor = numerical_cols[0]
        top_features = (
            corr_matrix[anchor].abs().dropna().nlargest(10).index.tolist()
        )

        if len(top_features) > 1:
            # heatmap needs the numeric sub-matrix, not the list of names.
            top_corr = corr_matrix.loc[top_features, top_features]
            plt.figure(figsize=(14, 10))
            sns.heatmap(
                top_corr,
                annot=True,
                fmt='.2f',
                cmap='coolwarm',
                vmin=-1,
                vmax=1,
            )
            plt.title('Correlation Matrix of Numerical Features')
            plt.show()
        else:
            print(f"Not enough valid correlations with '{anchor}' to plot.")
    else:
        print("Not enough numerical columns to compute correlation matrix.")