# Exploratory Data Analysis

This notebook analyzes the Power Combined and HPC Kernel Events datasets.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import os
import sys
from pathlib import Path

# Configure matplotlib
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)

## Loading the Datasets

In [None]:
# Directories searched for the CSV files, in order of preference.
DATA_DIR_CANDIDATES = (Path('../data/raw'), Path('../datasets'))


def resolve_dataset_path(filename, search_dirs=DATA_DIR_CANDIDATES):
    """Locate ``filename`` in the first directory of ``search_dirs`` that has it.

    Parameters
    ----------
    filename : str
        File name to look for (e.g. ``'Power.csv'``).
    search_dirs : iterable of path-like
        Directories to probe, most preferred first.

    Returns
    -------
    pathlib.Path
        The first existing ``directory / filename``. When the file exists in
        none of the directories, the path inside the *last* directory is
        returned so that a later "not found" message reports the final
        location that was tried.

    Raises
    ------
    ValueError
        If ``search_dirs`` is empty.
    """
    candidates = [Path(directory) / filename for directory in search_dirs]
    if not candidates:
        raise ValueError("search_dirs must contain at least one directory")
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return candidates[-1]


def load_dataset(path, label):
    """Read the CSV at ``path`` and report the outcome.

    Parameters
    ----------
    path : path-like
        Location of the CSV file.
    label : str
        Human-readable dataset name used in the printed status line.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file does not exist. Other
        read errors (e.g. malformed CSV) are deliberately not swallowed.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        print(f"{label} dataset not found at {path}")
        return None
    print(f"{label} dataset loaded with shape: {frame.shape}")
    return frame


# Resolve each file on its own: a missing HPC.csv must not redirect the lookup
# of a Power.csv that is present in the preferred directory (and vice versa).
power_path = resolve_dataset_path('Power.csv')
hpc_path = resolve_dataset_path('HPC.csv')

# Kept for cells that refer to `data_dir`; it follows the Power file's location.
data_dir = power_path.parent

# Load datasets (each is None when its file is unavailable)
power_df = load_dataset(power_path, 'Power')
hpc_df = load_dataset(hpc_path, 'HPC')

## Power Combined Dataset Analysis

In [None]:
# Structural overview of the Power dataset; skipped when it failed to load.
if power_df is None:
    print("Power dataset not available for analysis.")
else:
    # Leading records
    print("First 5 rows:")
    display(power_df.head())

    # Output of DataFrame.info()
    print("\nBasic info:")
    power_df.info()

    # Number of null entries per column
    print("\nMissing values:")
    print(power_df.isnull().sum())

    # Output of DataFrame.describe() with default settings
    print("\nSummary statistics:")
    display(power_df.describe())

In [None]:
# Bar chart of how many rows fall into each 'Attack-Group' value.
# Runs only when the Power dataset is loaded and has that column.
if power_df is not None and 'Attack-Group' in power_df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=power_df, x='Attack-Group', ax=ax)
    ax.set_title('Attack Group Distribution')
    ax.set_xlabel('Attack Group')
    # Tilt the category labels so longer names do not overlap
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation heatmap of the Power dataset's numeric columns.
if power_df is not None:
    # 'number' selects every numeric dtype (int32, float32, unsigned, ...),
    # not only the 64-bit ones, so no numeric feature is silently left out.
    numeric_cols = power_df.select_dtypes(include='number').columns
    if numeric_cols.empty:
        # sns.heatmap cannot draw an empty matrix; report instead of raising.
        print("Power dataset has no numeric columns; skipping correlation heatmap.")
    else:
        correlation = power_df[numeric_cols].corr()
        # Create the figure only once there is something to draw, so a
        # failure above does not leave an empty figure behind.
        plt.figure(figsize=(12, 10))
        # Pin the colour scale to the full correlation range so the neutral
        # colour of the diverging 'coolwarm' map always means "no correlation".
        sns.heatmap(
            correlation,
            annot=True,
            cmap='coolwarm',
            vmin=-1,
            vmax=1,
            linewidths=0.5,
        )
        plt.title('Correlation Heatmap')
        plt.tight_layout()
        plt.show()

## HPC Kernel Events Dataset Analysis

In [None]:
# Structural overview of the HPC dataset; skipped when it failed to load.
if hpc_df is None:
    print("HPC dataset not available for analysis.")
else:
    # Leading records
    print("First 5 rows:")
    display(hpc_df.head())

    # Output of DataFrame.info()
    print("\nBasic info:")
    hpc_df.info()

    # Number of null entries per column
    print("\nMissing values:")
    print(hpc_df.isnull().sum())

    # Output of DataFrame.describe() with default settings
    print("\nSummary statistics:")
    display(hpc_df.describe())

In [None]:
# Bar chart of the ten most frequent 'Scenario' values in the HPC dataset.
# Runs only when the dataset is loaded and has that column.
if hpc_df is not None and 'Scenario' in hpc_df.columns:
    # value_counts() sorts by descending frequency, so head(10) is the top ten
    scenario_counts = hpc_df['Scenario'].value_counts().head(10)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=scenario_counts.index, y=scenario_counts.values, ax=ax)
    ax.set_title('Top 10 Scenarios')
    ax.set_xlabel('Scenario')
    ax.set_ylabel('Count')
    # Vertical labels for the category names
    plt.xticks(rotation=90)
    fig.tight_layout()
    plt.show()

In [None]:
# Project the HPC dataset's numeric features onto their first two principal
# components and plot the result.
if hpc_df is not None:
    # 'number' selects every numeric dtype, not only the 64-bit ones.
    numeric_cols = hpc_df.select_dtypes(include='number').columns
    if len(numeric_cols) > 2:
        # PCA raises on NaN/inf, so keep only rows that are complete and finite.
        # Rows of `pca_result` therefore align with `pca_input.index`, which
        # may be a subset of `hpc_df.index`.
        pca_input = (
            hpc_df[numeric_cols]
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
        )
        dropped_rows = len(hpc_df) - len(pca_input)
        if dropped_rows:
            print(f"Dropped {dropped_rows} rows with missing or infinite values before PCA.")

        if len(pca_input) < 2:
            # Two components need at least two samples.
            print("Not enough complete rows to run PCA.")
        else:
            # Standardize so that no feature dominates because of its scale
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(pca_input)

            # Fixed seed keeps the projection reproducible if a randomized
            # SVD solver is selected for large inputs.
            pca = PCA(n_components=2, random_state=42)
            pca_result = pca.fit_transform(scaled_data)

            # Scatter plot of the two components
            plt.figure(figsize=(10, 8))
            plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.5)
            plt.title('PCA: First Two Principal Components')
            plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
            plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
            plt.tight_layout()
            plt.show()

            print(f"Total variance explained by 2 components: {sum(pca.explained_variance_ratio_):.2%}")
    else:
        # Make the skip visible instead of producing no output at all.
        print("PCA skipped: more than two numeric columns are required.")

## Feature Fusion Considerations

In [None]:
# Print the column/row counts of both datasets and list candidate fusion
# strategies; requires both datasets to be loaded.
if power_df is None or hpc_df is None:
    print("Both datasets are needed for feature fusion analysis.")
else:
    # shape is (rows, columns): index 1 is reported as features, index 0 as samples
    print(f"Power dataset features: {power_df.shape[1]}, samples: {power_df.shape[0]}")
    print(f"HPC dataset features: {hpc_df.shape[1]}, samples: {hpc_df.shape[0]}")
    for line in (
        "\nPossible feature fusion approaches:",
        "1. Feature concatenation",
        "2. Weighted feature fusion",
        "3. PCA-based dimensionality reduction",
    ):
        print(line)

## Preprocessing Steps Summary

### Power Combined Dataset:
1. Handle missing values
2. Rename attack labels ('host-attack' → 'Other', 'none' → 'Begin', 'recon' → 'Recon')
3. Encode categorical features ('Attack-Group', 'State', 'interface', 'Label')
4. Apply SMOTE for class balancing
5. Apply StandardScaler to normalize feature ranges

### HPC Kernel Events Dataset:
1. Handle missing values
2. Filter out specific Scenario values ('writeback:writeback_write_inode_start', '0', 0.0)
3. Rename the 'Cryptojacking' label value to 'Other' (only if the column containing it exists)
4. Apply StandardScaler to normalize numeric features
5. Apply PCA for dimensionality reduction
6. Encode the 'Scenario' target label