In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
IoT Cyber Attack Analysis for Google Colab
-----------------------------------------
This script analyzes datasets of cyber attacks on IoT systems, performs
feature correlation analysis, and generates visualizations to help identify
patterns in different attack types.
"""

import os
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
from collections import Counter
import matplotlib.gridspec as gridspec
from google.colab import drive

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Base plot styling: the ggplot style sheet first, then seaborn's whitegrid theme
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Default figure size and font size; 'font.size' is overwritten further down
# by SMALL_SIZE
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Three font-size tiers shared by the text elements configured below
SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 14

# Per-element font sizes (same settings plt.rc(group, key=value) would apply)
plt.rcParams.update({
    'font.size': SMALL_SIZE,
    'axes.titlesize': BIGGER_SIZE,
    'axes.labelsize': MEDIUM_SIZE,
    'xtick.labelsize': SMALL_SIZE,
    'ytick.labelsize': SMALL_SIZE,
    'legend.fontsize': SMALL_SIZE,
    'figure.titlesize': BIGGER_SIZE,
})

def mount_drive():
    """Mount Google Drive in the Colab runtime.

    Returns:
        The mount point ('/content/drive') that the Drive was mounted at.
    """
    mount_point = '/content/drive'
    drive.mount(mount_point)
    print("Google Drive mounted successfully.")
    return mount_point

def create_output_dir(base_path):
    """Create the figures output directory if needed and return its path.

    Args:
        base_path: Root directory under which 'My Drive/Data_2023/figures'
            is created (including any missing intermediate directories).

    Returns:
        The full path of the figures directory.
    """
    figures_dir = os.path.join(base_path, 'My Drive/Data_2023/figures')
    # exist_ok=True replaces the exists()-then-makedirs() sequence, which can
    # fail if the directory appears between the check and the creation.
    os.makedirs(figures_dir, exist_ok=True)
    print(f"Output directory '{figures_dir}' is ready.")
    return figures_dir

def load_data(base_path, pattern='*1.csv'):
    """Load the matching CSV files from the data folder and concatenate them.

    Args:
        base_path: Root directory that contains 'My Drive/Data_2023'.
        pattern: Glob pattern selecting which files in the data folder to
            load. The default ('*1.csv') only matches files whose name ends
            in '1.csv'; pass '*.csv' to load every CSV file.

    Returns:
        A single DataFrame with the rows of every successfully loaded file
        plus a 'source_file' column holding the originating file name.

    Raises:
        FileNotFoundError: If no file in the data folder matches `pattern`.
        ValueError: If files matched but none of them could be loaded.
    """
    # Path to data folder
    data_folder = os.path.join(base_path, 'My Drive/Data_2023')

    # glob returns matches in arbitrary order; sorting keeps the row order of
    # the combined frame (and any seeded sampling done on it) reproducible.
    all_files = sorted(glob.glob(os.path.join(data_folder, pattern)))

    if not all_files:
        raise FileNotFoundError(
            f"No CSV files matching '{pattern}' found in {data_folder}")

    print(f"Found {len(all_files)} CSV files in the directory.")

    dfs = []
    for file in all_files:
        file_name = os.path.basename(file)
        try:
            print(f"Loading {file_name}...")
            df = pd.read_csv(file)
        except Exception as e:
            # Best effort: report the unreadable file and keep going
            print(f"Error loading {file}: {e}")
            continue
        # Add file source as a column for reference
        df['source_file'] = file_name
        dfs.append(df)

    if not dfs:
        raise ValueError("No valid CSV files could be loaded.")

    data = pd.concat(dfs, ignore_index=True)
    print(f"Combined dataset shape: {data.shape}")

    return data

def preprocess_data(df):
    """Clean the raw data and encode categorical features and the label.

    Steps, in order: detect the label column, fill missing values (median for
    numeric columns, mode for the others), replace +/-inf in numeric data by
    the column median, encode the remaining object columns (one-hot below 10
    distinct values, label encoding otherwise) and encode the label.

    Args:
        df: Raw DataFrame; it is copied, not modified.

    Returns:
        A tuple (data, label_mapping) where `data` is the processed frame with
        the added columns 'attack_encoded' (integer class id) and
        'attack_label' (copy of the label column), and `label_mapping` maps
        each integer class id to its original label.
    """
    print("\n--- Preprocessing Data ---")

    # Make a copy to avoid modifying the original
    data = df.copy()

    # Display initial info
    print(f"Initial data shape: {data.shape}")
    print(f"Columns: {data.columns.tolist()}")

    # Label column: 'Label' if present, otherwise the last column that is not
    # the 'source_file' bookkeeping column.
    if 'Label' in data.columns:
        label_column = 'Label'
    elif 'source_file' in data.columns:
        label_column = data.columns[-2]
    else:
        label_column = data.columns[-1]

    print(f"Detected label column: {label_column}")

    # Check for missing values
    missing_values = data.isnull().sum().sum()
    print(f"Total missing values: {missing_values}")

    if missing_values > 0:
        # Fill numeric columns with their median
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

        # Fill categorical columns with mode
        cat_cols = data.select_dtypes(exclude=[np.number]).columns
        for col in cat_cols:
            modes = data[col].mode()
            # mode() is empty for an all-NaN column; there is nothing to fill
            # with in that case, so leave the column as it is.
            if not modes.empty:
                data[col] = data[col].fillna(modes.iloc[0])

        print(f"Missing values handled. Remaining missing: {data.isnull().sum().sum()}")

    # Handle potential infinite values
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

    # Identify categorical (object) feature columns, leaving out the
    # bookkeeping column and the label itself
    cat_cols = [
        col for col in data.select_dtypes(include=['object']).columns
        if col not in ('source_file', label_column)
    ]

    for col in cat_cols:
        if data[col].nunique() < 10:  # Only one-hot encode if fewer than 10 unique values
            # Explicit integer dtype: pandas >= 2.0 defaults to bool dummies,
            # which select_dtypes(include=[np.number]) does not pick up, so
            # the one-hot features would vanish from the numeric analyses.
            one_hot = pd.get_dummies(data[col], prefix=col, drop_first=True,
                                     dtype=np.uint8)
            data = pd.concat([data, one_hot], axis=1)
            data.drop(col, axis=1, inplace=True)
        else:
            # For high cardinality, use label encoding
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].astype(str))

    # Encode the label column
    le = LabelEncoder()
    data['attack_encoded'] = le.fit_transform(data[label_column])
    # Create a mapping of encoded values to original labels
    label_mapping = dict(zip(le.transform(le.classes_), le.classes_))

    # Keep a copy of the label under a fixed name for the plotting functions
    data['attack_label'] = data[label_column]

    # Remove any unnecessary columns
    if 'source_file' in data.columns:
        data.drop('source_file', axis=1, inplace=True)

    print(f"Processed data shape: {data.shape}")
    print(f"Attack types: {data['attack_label'].unique().tolist()}")

    return data, label_mapping

def exploratory_data_analysis(df, label_mapping, figures_dir):
    """Generate exploratory plots and save them as PNG files.

    Produces, when the required columns exist: the attack-type distribution,
    protocol share per attack ('Protocol'), flow-duration box and violin plots
    ('Flow Duration'), box plots for every column whose name contains
    'Packet', and a 2-component PCA scatter plot of the numeric features.

    Args:
        df: Processed DataFrame containing an 'attack_label' column.
        label_mapping: Encoded-id -> label mapping; accepted for interface
            compatibility, not used by this function.
        figures_dir: Directory the PNG files are written to.
    """
    print("\n--- Performing Exploratory Data Analysis ---")

    # Count of each attack type
    plt.figure(figsize=(14, 8))
    attack_counts = df['attack_label'].value_counts()

    ax = sns.barplot(x=attack_counts.index, y=attack_counts.values)
    plt.title('Distribution of Attack Types')
    plt.xlabel('Attack Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')

    # Add percentage labels on top of each bar
    total = len(df)
    for p in ax.patches:
        percentage = f'{100 * p.get_height() / total:.1f}%'
        ax.annotate(percentage, (p.get_x() + p.get_width() / 2., p.get_height()),
                     ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig(os.path.join(figures_dir, 'attack_type_distribution.png'))
    plt.close()

    # Distribution of protocol types by attack
    if 'Protocol' in df.columns:
        # Plot onto an explicit axes: without ax=, DataFrame.plot opens a new
        # figure, leaving the sized one empty and never closed.
        _, protocol_ax = plt.subplots(figsize=(14, 8))
        protocol_by_attack = pd.crosstab(df['attack_label'], df['Protocol'])
        protocol_by_attack_pct = protocol_by_attack.div(protocol_by_attack.sum(axis=1), axis=0) * 100

        protocol_by_attack_pct.plot(kind='bar', stacked=True, colormap='viridis',
                                    ax=protocol_ax)
        plt.title('Protocol Distribution by Attack Type')
        plt.xlabel('Attack Type')
        plt.ylabel('Percentage (%)')
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Protocol', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(figures_dir, 'protocol_by_attack.png'))
        plt.close()

    # Flow Duration by attack type (boxplot)
    if 'Flow Duration' in df.columns:
        plt.figure(figsize=(14, 8))

        # Use log scale for better visualization
        sns.boxplot(x='attack_label', y='Flow Duration', data=df)
        plt.yscale('log')
        plt.title('Flow Duration by Attack Type')
        plt.xlabel('Attack Type')
        plt.ylabel('Flow Duration (log scale)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(figures_dir, 'flow_duration_by_attack.png'))
        plt.close()

        # Flow Duration distribution by attack type (violin plot)
        plt.figure(figsize=(16, 10))
        sns.violinplot(x='attack_label', y='Flow Duration', data=df, inner='quart', cut=0)
        plt.yscale('log')
        plt.title('Flow Duration Distribution by Attack Type')
        plt.xlabel('Attack Type')
        plt.ylabel('Flow Duration (log scale)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(figures_dir, 'flow_duration_violin.png'))
        plt.close()

    # Analyze packet counts if those columns exist
    packet_cols = [col for col in df.columns if 'Packet' in col]
    if packet_cols:
        # One stacked subplot per packet column
        fig, axes = plt.subplots(nrows=len(packet_cols), figsize=(14, 5*len(packet_cols)))

        if len(packet_cols) == 1:
            axes = [axes]  # Make it iterable if only one subplot

        for packet_ax, col in zip(axes, packet_cols):
            sns.boxplot(x='attack_label', y=col, data=df, ax=packet_ax)
            packet_ax.set_title(f'{col} by Attack Type')
            packet_ax.set_xlabel('Attack Type')
            packet_ax.set_ylabel(col)
            packet_ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig(os.path.join(figures_dir, 'packet_counts_by_attack.png'))
        plt.close()

    # PCA visualization for attack types, on the numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Exclude the encoded labels
    if 'attack_encoded' in numeric_cols:
        numeric_cols.remove('attack_encoded')

    if len(numeric_cols) >= 2:  # Need at least 2 features for a 2-component PCA
        try:
            # Standardize the features
            X = df[numeric_cols]
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)

            # Apply PCA
            pca = PCA(n_components=2)
            X_pca = pca.fit_transform(X_scaled)

            # Create a dataframe for plotting
            pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
            pca_df['attack_label'] = df['attack_label'].values

            # Plot PCA results
            plt.figure(figsize=(14, 10))

            attack_types = pca_df['attack_label'].unique()

            # One colour per attack type. pyplot.get_cmap is used because
            # matplotlib.cm.get_cmap no longer exists in Matplotlib >= 3.9.
            cmap = plt.get_cmap('tab20', len(attack_types))

            for i, attack in enumerate(attack_types):
                idx = pca_df['attack_label'] == attack
                plt.scatter(pca_df.loc[idx, 'PC1'], pca_df.loc[idx, 'PC2'],
                          label=attack, alpha=0.7, s=50, color=cmap(i))

            plt.title('PCA of Attack Types')
            plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
            plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig(os.path.join(figures_dir, 'pca_attack_types.png'))
            plt.close()

            print(f"PCA completed. Total explained variance: {sum(pca.explained_variance_ratio_):.2%}")
        except Exception as e:
            print(f"Error in PCA visualization: {e}")

def correlation_analysis(df, figures_dir):
    """Plot feature-feature and feature-attack correlations.

    Saves three figures to `figures_dir`: the full feature correlation
    heatmap, a bar chart of feature pairs with |r| > 0.5, and a heatmap of
    the 15 features most correlated with any single attack type.

    Args:
        df: Processed DataFrame with 'attack_label' (and optionally
            'attack_encoded'); it is not modified.
        figures_dir: Directory the PNG files are written to.

    Returns:
        The list of numeric column names used as features (every numeric
        column except 'attack_encoded').
    """
    print("\n--- Performing Correlation Analysis ---")

    # Get numeric columns (excluding encoded labels)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'attack_encoded' in numeric_cols:
        numeric_cols.remove('attack_encoded')

    # Correlation between features
    if len(numeric_cols) > 1:
        try:
            plt.figure(figsize=(16, 14))
            corr_matrix = df[numeric_cols].corr()

            # Hide the redundant upper triangle (and diagonal) of the heatmap
            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
            sns.heatmap(corr_matrix, mask=mask, cmap="coolwarm", center=0,
                        square=True, linewidths=.5, annot=False, fmt='.2f',
                        vmin=-1, vmax=1)
            plt.title('Feature Correlation Matrix')
            plt.tight_layout()
            plt.savefig(os.path.join(figures_dir, 'feature_correlation_matrix.png'))
            plt.close()

            # Keep only the strictly-lower triangle so each unordered pair
            # appears once and the diagonal is excluded by position rather
            # than by value (a value test would also discard distinct
            # features that are perfectly correlated).
            pair_mask = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
            high_corr = corr_matrix.abs().where(pair_mask).unstack()
            # The comparison is False for the masked (NaN) entries as well
            high_corr = high_corr[high_corr > 0.5].sort_values(ascending=False)

            if not high_corr.empty:
                high_corr_df = pd.DataFrame(high_corr.reset_index())
                high_corr_df.columns = ['Feature 1', 'Feature 2', 'Correlation']

                # Height grows with the number of pairs but never collapses
                # to a sliver when only a few pairs qualify
                plt.figure(figsize=(12, max(4, len(high_corr_df) * 0.4)))
                sns.barplot(x='Correlation', y='Feature 1', hue='Feature 2', data=high_corr_df)
                plt.title('High Feature Correlations (|r| > 0.5)')
                plt.xlabel('Correlation Coefficient')
                plt.tight_layout()
                plt.savefig(os.path.join(figures_dir, 'high_feature_correlations.png'))
                plt.close()

                print(f"Found {len(high_corr_df)} feature pairs with high correlation (|r| > 0.5)")
            else:
                print("No high correlations (|r| > 0.5) found between features")
        except Exception as e:
            print(f"Error in correlation analysis: {e}")

    # Correlation of features with attack types: each attack is turned into a
    # 0/1 indicator and correlated with every numeric feature
    attack_types = df['attack_label'].unique()

    try:
        features = df[numeric_cols]
        correlations = {}
        for attack in attack_types:
            # Stand-alone indicator Series; no temporary column is added to
            # the caller's DataFrame
            indicator = (df['attack_label'] == attack).astype(int)
            correlations[attack] = features.corrwith(indicator)

        # Rows are features, columns are attack types
        corr_df = pd.DataFrame(correlations)

        # Rank features by their maximum absolute correlation with any attack
        max_abs_corr = corr_df.abs().max(axis=1).sort_values(ascending=False)

        # Take top 15 features for visualization
        top_features = max_abs_corr.head(15).index.tolist()

        # Create heatmap of feature-attack correlations
        plt.figure(figsize=(14, 10))
        sns.heatmap(corr_df.loc[top_features, attack_types], cmap="coolwarm", center=0,
                   annot=True, fmt='.2f', linewidths=.5, vmin=-1, vmax=1)
        plt.title('Correlation between Top Features and Attack Types')
        plt.tight_layout()
        plt.savefig(os.path.join(figures_dir, 'feature_attack_correlation.png'))
        plt.close()

        print(f"Feature-attack correlation analysis completed for {len(attack_types)} attack types")
    except Exception as e:
        print(f"Error in feature-attack correlation analysis: {e}")

    # Return for use in the feature importance analysis
    return numeric_cols

def feature_importance_analysis_optimized(df, numeric_cols, figures_dir):
    """Rank features with a small Random Forest and plot the results.

    Works on at most 100,000 sampled rows, trains on a 70/30 split, saves a
    feature-importance bar chart and a confusion matrix restricted to the 10
    most frequent attack types, and prints the accuracy measured on at most
    10,000 test rows.

    Args:
        df: Processed DataFrame with 'attack_encoded' and 'attack_label'.
        numeric_cols: Names of the feature columns to train on.
        figures_dir: Directory the PNG files are written to.

    Returns:
        The names of the 10 most important features, or an empty list if any
        step raised an exception.
    """
    print("\n--- Analyzing Feature Importance (Optimized) ---")

    # Take a sample if the dataset is very large
    sample_size = 100000  # Adjust this based on your available memory
    if len(df) > sample_size:
        print(f"Dataset is large ({len(df)} rows). Using a sample of {sample_size} rows for feature importance analysis.")
        df_sample = df.sample(sample_size, random_state=42)
    else:
        df_sample = df

    try:
        X = df_sample[numeric_cols]
        y = df_sample['attack_encoded']

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Train a lighter Random Forest Classifier
        rf = RandomForestClassifier(
            n_estimators=50,       # Reduced from 100
            max_depth=10,          # Limit tree depth
            min_samples_split=5,   # Require more samples to split
            n_jobs=4,              # Limit parallelism
            random_state=42
        )
        rf.fit(X_train, y_train)

        # Get feature importances, most important first
        feature_importances = pd.DataFrame({
            'feature': numeric_cols,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)

        # Visualize top 20 features (head() returns fewer if fewer exist)
        top_n = 20
        top_features = feature_importances.head(top_n)

        plt.figure(figsize=(12, 10))
        sns.barplot(x='importance', y='feature', data=top_features)
        plt.title(f'Top {len(top_features)} Features by Importance')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig(os.path.join(figures_dir, 'feature_importance.png'))
        plt.close()

        # Use a smaller subset for evaluation
        if len(X_test) > 10000:
            print(f"Using a subset of 10,000 samples for evaluation")
            X_test_small = X_test.iloc[:10000]
            y_test_small = y_test.iloc[:10000]
        else:
            X_test_small = X_test
            y_test_small = y_test

        # Evaluate on a smaller test set
        y_pred = rf.predict(X_test_small)
        accuracy = accuracy_score(y_test_small, y_pred)

        print(f"Random Forest model accuracy on sample: {accuracy:.4f}")

        # Create simplified confusion matrix with top attack types
        attack_counts = df_sample['attack_encoded'].value_counts()
        top_attacks = attack_counts.nlargest(10).index.tolist()

        # Filter test data to only include rows whose true class is a top attack
        top_attacks_mask = np.isin(y_test_small, top_attacks)
        y_test_top = y_test_small[top_attacks_mask]
        y_pred_top = y_pred[top_attacks_mask]

        if len(y_test_top) > 0:
            # Fix the class order explicitly. Without labels=, the matrix is
            # indexed by the sorted union of true AND predicted classes, which
            # need not match (in order or in size) a tick-label list built
            # from the frequency-ordered top attacks.
            present_codes = set(y_test_top.unique())
            matrix_codes = sorted(code for code in top_attacks if code in present_codes)
            conf_matrix = confusion_matrix(y_test_top, y_pred_top, labels=matrix_codes)
            plt.figure(figsize=(12, 10))

            # Class names in exactly the order of the matrix rows/columns
            code_to_name = (df_sample.drop_duplicates('attack_encoded')
                            .set_index('attack_encoded')['attack_label'])
            attack_labels = [code_to_name[code] for code in matrix_codes]

            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                       xticklabels=attack_labels,
                       yticklabels=attack_labels)
            plt.title('Confusion Matrix (Top Attack Types)')
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
            plt.xticks(rotation=45, ha='right')
            plt.yticks(rotation=45)
            plt.tight_layout()
            plt.savefig(os.path.join(figures_dir, 'confusion_matrix_top_attacks.png'))
            plt.close()

        return feature_importances.head(10)['feature'].tolist()
    except Exception as e:
        print(f"Error in feature importance analysis: {e}")
        return []

def create_summary_dashboard(df, top_features, figures_dir):
    """Create a one-page summary dashboard and save it as a PNG.

    The 4x2 grid holds: attack distribution pie, mean of the top 5 features
    per attack, protocol counts, flow-duration histogram, a box plot of the
    first column whose name contains 'bytes', and a key-statistics table.
    Panels whose data is missing are blanked with an explanatory title.

    Args:
        df: Processed DataFrame containing an 'attack_label' column.
        top_features: Feature names ordered by importance (may be empty).
        figures_dir: Directory 'summary_dashboard.png' is written to.
    """
    print("\n--- Creating Summary Dashboard ---")

    fig = None
    try:
        # Create a dashboard with multiple plots
        fig = plt.figure(figsize=(20, 24))
        gs = gridspec.GridSpec(4, 2, figure=fig)

        # Computed once; used by both the pie chart and the statistics table
        attack_counts = df['attack_label'].value_counts()
        # Columns whose name mentions 'bytes' (case-insensitive); used by the
        # box plot panel and the statistics table
        flowbytes_cols = [col for col in df.columns if 'bytes' in col.lower()]

        # 1. Attack Distribution Pie Chart
        ax1 = fig.add_subplot(gs[0, 0])
        attack_counts.plot.pie(autopct='%1.1f%%', textprops={'fontsize': 9},
                              colors=plt.cm.tab20.colors, ax=ax1)
        ax1.set_title('Attack Type Distribution')
        ax1.set_ylabel('')

        # 2. Top Features Bar Chart
        ax2 = fig.add_subplot(gs[0, 1])
        if top_features and all(feat in df.columns for feat in top_features):
            # Get mean values of top features by attack type
            feature_means = df.groupby('attack_label')[top_features[:5]].mean()
            feature_means.plot(kind='bar', ax=ax2)
            ax2.set_title('Top 5 Features by Attack Type')
            ax2.tick_params(axis='x', rotation=45)
            ax2.set_xlabel('')
            ax2.legend(loc='upper right')
        else:
            # Same fallback as the other optional panels instead of leaving
            # an empty, untitled axes
            ax2.set_title('Top Features by Attack Type (Data Not Available)')
            ax2.axis('off')

        # 3. Protocol Distribution if available
        ax3 = fig.add_subplot(gs[1, 0])
        if 'Protocol' in df.columns:
            protocol_counts = df['Protocol'].value_counts()
            protocol_counts.plot(kind='bar', ax=ax3, color='skyblue')
            ax3.set_title('Protocol Distribution')
            ax3.set_xlabel('Protocol')
            ax3.set_ylabel('Count')
        else:
            ax3.set_title('Protocol Distribution (Data Not Available)')
            ax3.axis('off')

        # 4. Flow Duration Histogram if available
        ax4 = fig.add_subplot(gs[1, 1])
        if 'Flow Duration' in df.columns:
            df['Flow Duration'].hist(bins=50, ax=ax4, color='lightgreen')
            ax4.set_title('Flow Duration Distribution')
            ax4.set_xlabel('Flow Duration')
            ax4.set_ylabel('Frequency')
            # Use log scale if values span multiple orders of magnitude
            if df['Flow Duration'].max() / (df['Flow Duration'].min() + 1) > 1000:
                ax4.set_xscale('log')
        else:
            ax4.set_title('Flow Duration Distribution (Data Not Available)')
            ax4.axis('off')

        # 5. Flow Bytes/s by Attack Type if available
        ax5 = fig.add_subplot(gs[2, :])
        if flowbytes_cols:
            col = flowbytes_cols[0]
            sns.boxplot(x='attack_label', y=col, data=df, ax=ax5)
            ax5.set_title(f'{col} by Attack Type')
            ax5.set_xlabel('Attack Type')
            ax5.set_ylabel(col)
            ax5.tick_params(axis='x', rotation=45)

            # If values span multiple orders, use log scale
            if df[col].max() / (df[col].min() + 1) > 1000:
                ax5.set_yscale('log')
        else:
            ax5.set_title('Flow Bytes Analysis (No appropriate columns available)')
            ax5.axis('off')

        # 6. Key Statistics Table
        ax6 = fig.add_subplot(gs[3, :])
        ax6.axis('tight')
        ax6.axis('off')

        # Gather key statistics
        total_samples = len(df)
        # Share of rows not labelled 'BENIGN'; reported as 100 when that
        # exact label does not occur at all
        attack_pct = (df['attack_label'] != 'BENIGN').mean() * 100 if 'BENIGN' in df['attack_label'].unique() else 100
        # value_counts() sorts by frequency, so position 0 is the most common label
        most_common_attack = attack_counts.index[0]
        attack_count = attack_counts.iloc[0]
        attack_percent = attack_count / total_samples * 100

        # Create data for the table
        table_data = [
            ['Total Samples', f"{total_samples:,}"],
            ['Attack Percentage', f"{attack_pct:.2f}%"],
            ['Most Common Attack', str(most_common_attack)],
            ['Count of Most Common Attack', f"{attack_count:,} ({attack_percent:.2f}%)"],
            ['Number of Attack Types', f"{df['attack_label'].nunique()}"],
        ]

        # Add more statistics if columns are available
        if 'Protocol' in df.columns:
            table_data.append(['Most Common Protocol', str(df['Protocol'].value_counts().index[0])])

        if flowbytes_cols:
            max_flow_attack = df.groupby('attack_label')[flowbytes_cols[0]].mean().idxmax()
            table_data.append(['Attack with Max Flow Bytes (avg)', str(max_flow_attack)])

        # Create the table
        table = ax6.table(cellText=table_data, colLabels=['Statistic', 'Value'],
                          loc='center', cellLoc='left')
        table.auto_set_font_size(False)
        table.set_fontsize(12)
        table.scale(1, 2)

        fig.suptitle('IoT Cyber Attack Analysis Dashboard', fontsize=20)
        fig.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for the suptitle
        fig.savefig(os.path.join(figures_dir, 'summary_dashboard.png'), dpi=200, bbox_inches='tight')

        print("Summary dashboard created")
    except Exception as e:
        print(f"Error creating summary dashboard: {e}")
    finally:
        # Close the figure on failure too, so an error does not leave a
        # half-built figure open
        if fig is not None:
            plt.close(fig)

def main():
    """Run the whole pipeline: mount, load, preprocess, analyze, summarize."""
    print("Starting IoT Cyber Attack Analysis")

    # Mounting the Drive and preparing the output folder happen outside the
    # guarded section below, so a failure here propagates to the caller.
    drive_root = mount_drive()
    output_dir = create_output_dir(drive_root)

    try:
        # Load and clean the raw CSV data
        raw_frame = load_data(drive_root)
        clean_frame, mapping = preprocess_data(raw_frame)

        # Exploratory plots
        exploratory_data_analysis(clean_frame, mapping, output_dir)

        # Correlation plots; also yields the numeric feature names
        feature_names = correlation_analysis(clean_frame, output_dir)

        # Random Forest ranking (memory-friendly variant) of those features
        ranked_features = feature_importance_analysis_optimized(
            clean_frame, feature_names, output_dir)

        # One-page overview of the results
        create_summary_dashboard(clean_frame, ranked_features, output_dir)

        print("\n--- Analysis Complete ---")
        print(f"All visualizations have been saved to the '{output_dir}' directory.")

    except Exception as err:
        # Report the failure together with the full stack trace
        print(f"Error during analysis: {err}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

Starting IoT Cyber Attack Analysis
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.
Output directory '/content/drive/My Drive/Data_2023/figures' is ready.
Found 1 CSV files in the directory.
Loading Merged01.csv...
Combined dataset shape: (712311, 41)

--- Preprocessing Data ---
Initial data shape: (712311, 41)
Columns: ['Header_Length', 'Protocol Type', 'Time_To_Live', 'Rate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IGMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Variance', 'Label', 'source_file']
Detected label column: Label
Total missing values: 22
Missing values handled. Remaining missing: 0
P

<Figure size 1600x1000 with 0 Axes>