# Data Preprocessing for PowerCombined and HPC-Kernel-Events Datasets

This notebook contains the preprocessing steps for two independent datasets:
1. PowerCombined dataset
2. HPC-Kernel-Events dataset

The preprocessing includes data cleaning, encoding categorical variables, standardization, dimensionality reduction, and handling class imbalance.

## Import Required Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# For utilities
import os
import warnings
# Suppress every warning category so library chatter stays out of cell outputs
warnings.filterwarnings(action='ignore')

# pandas display limits: show every column, cap printed rows at 100
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

## Preprocess PowerCombined Dataset

In this section, we will:
1. Load the PowerCombined dataset
2. Explore and clean the data
3. Encode categorical features
4. Handle class imbalance using SMOTE
5. Scale the features using StandardScaler

In [None]:
# Read the PowerCombined CSV into `power_df`
# Point the path below at the real location of the file before running
power_combined_path = "../data/PowerCombined.csv"  # Update this path

try:
    power_df = pd.read_csv(power_combined_path)
except FileNotFoundError:
    # `power_df` is not assigned on this path; only a hint is printed
    print(f"File not found at {power_combined_path}. Please update the path.")
else:
    # Report the size and preview the first rows of the loaded frame
    print(f"PowerCombined dataset loaded with shape: {power_df.shape}")
    print("\nFirst 5 rows:")
    print(power_df.head())

In [None]:
# Exploratory data analysis for PowerCombined
# Prints dtype/null information, summary statistics, the object-dtype columns,
# and the class balance of the first target-like column that is present.
if 'power_df' in locals():
    print("Dataset info:")
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    power_df.info()

    print("\nMissing values:")
    print(power_df.isnull().sum())

    print("\nSummary statistics:")
    print(power_df.describe())

    # Check for categorical columns
    categorical_cols = power_df.select_dtypes(include=['object']).columns
    print(f"\nCategorical columns: {list(categorical_cols)}")

    # Check class distribution; 'label' takes precedence over 'class'
    for candidate in ('label', 'class'):
        if candidate in power_df.columns:
            print("\nClass distribution:")
            print(power_df[candidate].value_counts())
            break

In [None]:
# Clean the PowerCombined dataset
# Produces `power_clean`: a de-duplicated copy of `power_df` with missing
# values imputed (median for numeric columns, most frequent value otherwise).
if 'power_df' in locals():
    # Work on a copy so the raw frame is left untouched
    power_clean = power_df.copy()

    # Drop duplicates if any
    original_shape = power_clean.shape
    power_clean = power_clean.drop_duplicates()
    print(f"Removed {original_shape[0] - power_clean.shape[0]} duplicate rows")

    # Handle missing values
    for col in power_clean.columns:
        column = power_clean[col]
        missing = column.isnull()
        if not missing.any():
            continue
        if missing.all():
            # Neither a median nor a mode exists for an entirely empty column;
            # mode() would return an empty Series and indexing it would raise.
            print(f"Column '{col}' has no observed values; left unfilled")
            continue

        # Treat every numeric dtype (int32, float32, nullable Int64, ...) as
        # numeric, not only int64/float64. Booleans are imputed by mode.
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))
        if is_numeric:
            # Fill numeric columns with median
            power_clean[col] = column.fillna(column.median())
        else:
            # Fill categorical columns with mode (first one when tied)
            power_clean[col] = column.fillna(column.mode().iloc[0])

    print(f"After cleaning, dataset shape: {power_clean.shape}")

In [None]:
# Encode categorical features in PowerCombined dataset
# Object-dtype columns of `power_clean` are replaced in place by integer codes.
# Each column gets its own fitted LabelEncoder, stored in `power_encoders`
# (column name -> encoder) so the codes can be mapped back to the original
# values later; a single shared encoder would only remember the last column.
if 'power_clean' in locals():
    # `le` always refers to the most recently created encoder
    le = LabelEncoder()
    power_encoders = {}

    # Identify categorical columns
    categorical_cols = power_clean.select_dtypes(include=['object']).columns

    # Encode each categorical feature column
    for col in categorical_cols:
        # The target columns are handled separately below
        if col not in ['label', 'class']:
            le = LabelEncoder()
            power_clean[col] = le.fit_transform(power_clean[col])
            power_encoders[col] = le
            print(f"Encoded column: {col}")

    # Encode target variable if it exists and is categorical
    if 'label' in power_clean.columns and power_clean['label'].dtype == 'object':
        le = LabelEncoder()
        power_clean['label'] = le.fit_transform(power_clean['label'])
        power_encoders['label'] = le
        print("Encoded target column: label")
    elif 'class' in power_clean.columns and power_clean['class'].dtype == 'object':
        le = LabelEncoder()
        power_clean['class'] = le.fit_transform(power_clean['class'])
        power_encoders['class'] = le
        print("Encoded target column: class")

    print("\nAfter encoding, first 5 rows:")
    print(power_clean.head())

In [None]:
# Split the cleaned PowerCombined frame into features (X_power) and target (y_power)
if 'power_clean' in locals():
    # The target is 'label' when present, otherwise 'class'; None when neither exists
    target_col = next(
        (name for name in ('label', 'class') if name in power_clean.columns),
        None,
    )

    if target_col is None:
        print("No target column ('label' or 'class') found in the dataset.")
    else:
        y_power = power_clean[target_col]
        X_power = power_clean.drop(columns=[target_col])
        print(f"Features shape: {X_power.shape}")
        print(f"Target shape: {y_power.shape}")
        print(f"Target distribution:\n{y_power.value_counts()}")

In [None]:
# Apply SMOTE to handle class imbalance in PowerCombined dataset
# Produces `X_power_resampled` / `y_power_resampled`. When SMOTE cannot run
# (fewer than two classes, or a class with a single sample) the data is passed
# through unchanged so that later cells still find the resampled names.
# NOTE(review): SMOTE picks neighbours by distance and X_power is passed here
# unscaled, with categorical columns as integer codes - confirm this is intended.
if 'X_power' in locals() and 'y_power' in locals():
    from collections import Counter

    class_counts = Counter(y_power)
    print("Original class distribution:", class_counts)

    smallest_class = min(class_counts.values()) if class_counts else 0
    if len(class_counts) < 2 or smallest_class < 2:
        print("SMOTE skipped: it needs at least two classes with two or more samples each.")
        # Reset the index so the pass-through data is positionally indexed
        X_power_resampled = X_power.reset_index(drop=True)
        y_power_resampled = y_power.reset_index(drop=True)
    else:
        # SMOTE needs k_neighbors + 1 samples in every class it oversamples;
        # keep the default of 5 whenever the smallest class allows it.
        k_neighbors = min(5, smallest_class - 1)
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_power_resampled, y_power_resampled = smote.fit_resample(X_power, y_power)

    print("Resampled class distribution:", Counter(y_power_resampled))
    print(f"After SMOTE, features shape: {X_power_resampled.shape}")
    print(f"After SMOTE, target shape: {y_power_resampled.shape}")

In [None]:
# Scale features for PowerCombined dataset
# Standardizes the resampled features and assembles `power_processed`
# (scaled feature columns plus the target column).
if 'X_power_resampled' in locals():
    # Initialize StandardScaler
    scaler = StandardScaler()

    # Scale the features
    X_power_scaled = scaler.fit_transform(X_power_resampled)

    # Build the labelled frame once and reuse it for the preview and the result
    scaled_features = pd.DataFrame(X_power_scaled, columns=X_power.columns)

    print(f"Scaled features shape: {X_power_scaled.shape}")
    print("First 5 rows after scaling:")
    print(scaled_features.head())

    # Take the target name from the resampled series itself. The notebook-wide
    # `target_col` is reassigned by the HPC section, so relying on it would
    # mislabel the column if this cell is re-run afterwards. It remains the
    # fallback when the resampled target carries no name.
    power_target_name = getattr(y_power_resampled, 'name', None) or target_col

    # Attach the target by position (plain array) rather than by index alignment
    power_processed = scaled_features.copy()
    power_processed[power_target_name] = np.asarray(y_power_resampled)

    print("\nProcessed PowerCombined dataset shape:", power_processed.shape)
    print("First 5 rows of processed dataset:")
    print(power_processed.head())

In [None]:
# Write the processed PowerCombined frame to CSV, omitting the row index
if 'power_processed' in locals():
    output_path = "../data/PowerCombined_processed.csv"
    power_processed.to_csv(path_or_buf=output_path, index=False)
    print(f"Processed PowerCombined dataset saved to {output_path}")

## Preprocess HPC-Kernel-Events Dataset

In this section, we will:
1. Load the HPC-Kernel-Events dataset
2. Clean the data by removing duplicate rows and imputing missing values
3. Standardize numeric features using StandardScaler
4. Apply PCA for dimensionality reduction
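5. Encode the target labels and any other categorical features using LabelEncoder (in the code this runs before standardization, because the scaler needs numeric input)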

In [None]:
# Read the HPC-Kernel-Events CSV into `hpc_df`
# Point the path below at the real location of the file before running
hpc_path = "../data/HPC-Kernel-Events.csv"  # Update this path

try:
    hpc_df = pd.read_csv(hpc_path)
except FileNotFoundError:
    # `hpc_df` is not assigned on this path; only a hint is printed
    print(f"File not found at {hpc_path}. Please update the path.")
else:
    # Report the size and preview the first rows of the loaded frame
    print(f"HPC-Kernel-Events dataset loaded with shape: {hpc_df.shape}")
    print("\nFirst 5 rows:")
    print(hpc_df.head())

In [None]:
# Exploratory data analysis for HPC-Kernel-Events
# Prints dtype/null information, summary statistics, the object-dtype columns,
# and the value counts of the first target-like column that is present.
if 'hpc_df' in locals():
    print("Dataset info:")
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    hpc_df.info()

    print("\nMissing values:")
    print(hpc_df.isnull().sum())

    print("\nSummary statistics:")
    print(hpc_df.describe())

    # Check for categorical columns
    categorical_cols = hpc_df.select_dtypes(include=['object']).columns
    print(f"\nCategorical columns: {list(categorical_cols)}")

    # Check class distribution; precedence is 'label', then 'class', then 'kernel_name'
    distribution_headings = (
        ('label', "\nClass distribution:"),
        ('class', "\nClass distribution:"),
        ('kernel_name', "\nKernel distribution:"),
    )
    for candidate, heading in distribution_headings:
        if candidate in hpc_df.columns:
            print(heading)
            print(hpc_df[candidate].value_counts())
            break

In [None]:
# Clean the HPC-Kernel-Events dataset
# Produces `hpc_clean`: a de-duplicated copy of `hpc_df` with missing values
# imputed (median for numeric columns, most frequent value otherwise).
if 'hpc_df' in locals():
    # Work on a copy so the raw frame is left untouched
    hpc_clean = hpc_df.copy()

    # Drop duplicates if any
    original_shape = hpc_clean.shape
    hpc_clean = hpc_clean.drop_duplicates()
    print(f"Removed {original_shape[0] - hpc_clean.shape[0]} duplicate rows")

    # Handle missing values
    for col in hpc_clean.columns:
        column = hpc_clean[col]
        missing = column.isnull()
        if not missing.any():
            continue
        if missing.all():
            # Neither a median nor a mode exists for an entirely empty column;
            # mode() would return an empty Series and indexing it would raise.
            print(f"Column '{col}' has no observed values; left unfilled")
            continue

        # Treat every numeric dtype (int32, float32, nullable Int64, ...) as
        # numeric, not only int64/float64. Booleans are imputed by mode.
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))
        if is_numeric:
            # Fill numeric columns with median
            hpc_clean[col] = column.fillna(column.median())
        else:
            # Fill categorical columns with mode (first one when tied)
            hpc_clean[col] = column.fillna(column.mode().iloc[0])

    # Filter columns if needed
    # hpc_clean = hpc_clean.drop(columns=['unwanted_column1', 'unwanted_column2'])

    print(f"After cleaning, dataset shape: {hpc_clean.shape}")

In [None]:
# Pick the HPC-Kernel-Events target column and replace it with integer codes
if 'hpc_clean' in locals():
    # Preference order: 'kernel_name', then 'label', then 'class'; None when none exist
    target_col = next(
        (name for name in ('kernel_name', 'label', 'class') if name in hpc_clean.columns),
        None,
    )

    if target_col is None:
        print("No suitable target column found in the dataset.")
    else:
        # Fit a dedicated encoder on the target and overwrite the column in place
        le = LabelEncoder()
        hpc_clean[target_col] = le.fit_transform(hpc_clean[target_col])
        print(f"Encoded {target_col} with {len(le.classes_)} unique values")
        print(f"Class mapping: {dict(zip(le.classes_, range(len(le.classes_))))}")

        # Keep the original class values; position i holds the value encoded as i
        target_classes = le.classes_

In [None]:
# Integer-encode the remaining object-dtype columns of the HPC-Kernel-Events frame
if 'hpc_clean' in locals() and target_col is not None:
    # Every object-dtype column except the target is treated as categorical
    object_columns = hpc_clean.select_dtypes(include=['object']).columns
    categorical_cols = [col for col in object_columns if col != target_col]

    # A fresh encoder per column, fitted and applied in one step
    for col in categorical_cols:
        le = LabelEncoder()
        hpc_clean[col] = le.fit_transform(hpc_clean[col])
        print(f"Encoded column: {col}")

    print("\nAfter encoding, first 5 rows:")
    print(hpc_clean.head())

In [None]:
# Split the cleaned HPC-Kernel-Events frame into target (y_hpc) and features (X_hpc)
if 'hpc_clean' in locals() and target_col is not None:
    y_hpc = hpc_clean[target_col]
    X_hpc = hpc_clean.drop(columns=[target_col])

    # Report the resulting sizes and the class balance
    print(f"Features shape: {X_hpc.shape}")
    print(f"Target shape: {y_hpc.shape}")
    print(f"Target distribution:\n{y_hpc.value_counts()}")

In [None]:
# Standardize the HPC-Kernel-Events features
if 'X_hpc' in locals():
    # Fit the scaler on the features, then transform those same features
    scaler = StandardScaler()
    X_hpc_scaled = scaler.fit(X_hpc).transform(X_hpc)

    print(f"Scaled features shape: {X_hpc_scaled.shape}")

In [None]:
# Apply PCA for dimensionality reduction
# Produces `X_hpc_pca` from the scaled features and plots the cumulative
# explained variance of the retained components.
if 'X_hpc_scaled' in locals():
    # A float n_components asks PCA to keep enough components for 95% of variance
    pca = PCA(n_components=0.95)
    X_hpc_pca = pca.fit_transform(X_hpc_scaled)

    print(f"PCA components: {pca.n_components_}")
    print(f"Explained variance ratio: {np.sum(pca.explained_variance_ratio_):.4f}")
    print(f"After PCA, features shape: {X_hpc_pca.shape}")

    # Plot explained variance ratio
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    # Explicit 1-based x values: without them the curve is drawn against
    # 0-based positions and reads one component too low on the x axis.
    component_counts = np.arange(1, len(cumulative_variance) + 1)

    plt.figure(figsize=(10, 6))
    # Markers keep the plot visible when only one component is retained
    plt.plot(component_counts, cumulative_variance, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Explained Variance vs. Number of Components')
    plt.grid(True)
    plt.show()

In [None]:
# Create final processed dataframe for HPC-Kernel-Events
# Produces `hpc_processed`: one column per PCA component plus the target column.
if 'X_hpc_pca' in locals() and 'y_hpc' in locals():
    # Create column names for PCA components
    pca_columns = [f'PC{i+1}' for i in range(X_hpc_pca.shape[1])]

    # Create a DataFrame with PCA features (it receives a fresh 0..n-1 index)
    hpc_processed = pd.DataFrame(X_hpc_pca, columns=pca_columns)

    # Add the target column by position. Assigning the Series directly aligns
    # on index labels, and y_hpc keeps the labels of the cleaned frame (which
    # has gaps once duplicate rows were dropped), so labels would land on the
    # wrong rows or become NaN. The PCA rows are in the same order as y_hpc.
    hpc_processed[target_col] = np.asarray(y_hpc)

    print("\nProcessed HPC-Kernel-Events dataset shape:", hpc_processed.shape)
    print("First 5 rows of processed dataset:")
    print(hpc_processed.head())

In [None]:
# Write the processed HPC-Kernel-Events frame to CSV, omitting the row index
if 'hpc_processed' in locals():
    output_path = "../data/HPC-Kernel-Events_processed.csv"
    hpc_processed.to_csv(path_or_buf=output_path, index=False)
    print(f"Processed HPC-Kernel-Events dataset saved to {output_path}")

## Summary

In this notebook, we have performed the following preprocessing steps:

### PowerCombined Dataset:
1. Loaded and explored the dataset
2. Cleaned the data by removing duplicates and handling missing values
3. Encoded categorical features using LabelEncoder
4. Balanced the dataset using SMOTE
5. Scaled features using StandardScaler
6. Saved the processed dataset

### HPC-Kernel-Events Dataset:
1. Loaded and explored the dataset
2. Cleaned the data by removing duplicates and handling missing values
3. Encoded categorical features and the target variable using LabelEncoder
4. Standardized numeric features using StandardScaler
5. Applied PCA for dimensionality reduction while preserving 95% of variance
6. Saved the processed dataset

The processed datasets are ready for modeling and analysis.