In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
IoT Cyber Attack Classification Model for Google Colab (RF, XGBoost, NN only)
------------------------------------------------------
This script trains and evaluates selected machine learning models (Random Forest,
XGBoost, and Neural Network) for classifying IoT cyber attacks, and provides
visualizations of model performance.
"""

import os
import glob
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import time
import warnings
from google.colab import drive

# Silence every warning category for the rest of the session so the
# notebook output only shows the pipeline's own progress messages.
warnings.filterwarnings('ignore')

# Global plot appearance. The calls are order-dependent: each later call
# overrides any overlapping setting made by an earlier one.
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

def mount_drive():
    """Mount Google Drive in the Colab runtime.

    Returns:
        The mount point, ``'/content/drive'``, used as the base path by the
        rest of the pipeline.
    """
    mount_point = '/content/drive'
    drive.mount(mount_point)
    print("Google Drive mounted successfully.")
    return mount_point

def create_output_dirs(base_path):
    """Create the output directories for figures and models.

    Args:
        base_path: Root directory under which the
            ``My Drive/Data_2023/Model_Figures_1`` and
            ``My Drive/Data_2023/Models_1`` folders are created.

    Returns:
        Tuple ``(figures_dir, models_dir)`` with the two directory paths.
    """
    figures_dir = os.path.join(base_path, 'My Drive/Data_2023/Model_Figures_1')
    models_dir = os.path.join(base_path, 'My Drive/Data_2023/Models_1')

    for dir_path in (figures_dir, models_dir):
        # exist_ok=True makes the call idempotent and removes the
        # check-then-create window of a separate os.path.exists() test.
        os.makedirs(dir_path, exist_ok=True)

    print(f"Output directories '{figures_dir}' and '{models_dir}' are ready.")
    return figures_dir, models_dir

def load_data(base_path, pattern='*1.csv'):
    """Load the CSV files matching *pattern* from the data folder and concatenate them.

    Args:
        base_path: Root directory that contains ``My Drive/Data_2023``.
        pattern: Glob pattern, relative to the data folder, selecting the
            files to load. The default ``'*1.csv'`` keeps the original
            selection (files whose name ends in ``1.csv``).

    Returns:
        A single DataFrame holding the rows of every file that could be
        read, with a fresh 0-based index.

    Raises:
        FileNotFoundError: If no file in the data folder matches *pattern*.
        ValueError: If files matched but none of them could be read.
    """
    data_folder = os.path.join(base_path, 'My Drive/Data_2023')

    # glob returns matches in arbitrary order; sorting makes the row order of
    # the combined frame (and anything seeded on top of it) reproducible.
    all_files = sorted(glob.glob(os.path.join(data_folder, pattern)))

    if not all_files:
        raise FileNotFoundError(
            f"No CSV files matching '{pattern}' found in {data_folder}")

    print(f"Found {len(all_files)} CSV files in the directory.")

    dfs = []
    for file in all_files:
        # Best effort: one unreadable file is reported and skipped so the
        # remaining files can still be used.
        try:
            print(f"Loading {os.path.basename(file)}...")
            dfs.append(pd.read_csv(file))
        except Exception as e:
            print(f"Error loading {file}: {e}")

    if not dfs:
        raise ValueError("No valid CSV files could be loaded.")

    data = pd.concat(dfs, ignore_index=True)
    print(f"Combined dataset shape: {data.shape}")

    return data

def preprocess_data(df, models_dir):
    """Clean and encode the raw dataset for modeling.

    Steps, in order: pick the label column, impute missing values, replace
    infinities, encode non-label object columns, and encode the label.

    Args:
        df: Raw DataFrame. It is copied; the caller's frame is not modified.
        models_dir: Directory where the fitted label encoder is saved as
            ``label_encoder.joblib``.

    Returns:
        Tuple ``(data, label_mapping)``. ``data`` is the processed frame with
        two added columns: ``attack_encoded`` (integer class id) and
        ``attack_label`` (copy of the original label). ``label_mapping`` maps
        each encoded id to its original label value.
    """
    print("\n--- Preprocessing Data ---")

    # Work on a copy so the caller's frame stays untouched.
    data = df.copy()

    print(f"Initial data shape: {data.shape}")

    # Use 'Label' when present, otherwise fall back to the last column.
    if 'Label' in data.columns:
        label_column = 'Label'
    else:
        label_column = data.columns[-1]

    print(f"Detected label column: {label_column}")

    missing_values = data.isnull().sum().sum()
    print(f"Total missing values: {missing_values}")

    if missing_values > 0:
        # Numeric columns: impute with the column median.
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

        # Non-numeric columns: impute with the most frequent value.
        cat_cols = data.select_dtypes(exclude=[np.number]).columns
        for col in cat_cols:
            modes = data[col].mode()
            # mode() is empty for an all-NaN column; indexing it would raise
            # IndexError, so such a column is left as-is.
            if not modes.empty:
                data[col] = data[col].fillna(modes.iloc[0])

        print(f"Missing values handled. Remaining missing: {data.isnull().sum().sum()}")

    # Turn +/-inf into NaN, then impute numeric columns with the median again.
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    data = data.replace([np.inf, -np.inf], np.nan)
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

    # Object-typed feature columns need a numeric encoding; the label column
    # is handled separately below.
    cat_cols = data.select_dtypes(include=['object']).columns.tolist()
    if label_column in cat_cols:
        cat_cols.remove(label_column)

    for col in cat_cols:
        if data[col].nunique() < 10:
            # Low cardinality: one-hot encode. The explicit integer dtype
            # matters: newer pandas returns bool dummies by default, and bool
            # columns are not matched by select_dtypes(include=[np.number]),
            # so they would be silently lost as model features.
            one_hot = pd.get_dummies(
                data[col], prefix=col, drop_first=True, dtype=np.uint8)
            data = pd.concat([data, one_hot], axis=1)
            data.drop(col, axis=1, inplace=True)
        else:
            # High cardinality: integer-encode the string form of each value.
            feature_encoder = LabelEncoder()
            data[col] = feature_encoder.fit_transform(data[col].astype(str))

    # Encode the target into integer class ids.
    label_encoder = LabelEncoder()
    data['attack_encoded'] = label_encoder.fit_transform(data[label_column])

    # Encoded id -> original label, for readable plots and reports.
    label_mapping = dict(zip(
        label_encoder.transform(label_encoder.classes_),
        label_encoder.classes_))

    # Keep the original label alongside the encoded one for reference.
    data['attack_label'] = data[label_column]

    # Persist the encoder so predictions can be mapped back to label names.
    joblib.dump(label_encoder, os.path.join(models_dir, 'label_encoder.joblib'))

    print(f"Processed data shape: {data.shape}")
    print(f"Attack types: {data['attack_label'].unique().tolist()}")
    print(f"Label mapping: {label_mapping}")

    return data, label_mapping

def _model_slug(name):
    """Return the lower-case, underscore-separated form of a model name used in file names."""
    return name.replace(' ', '_').lower()


def _save_top_confusion_matrix(y_test, y_pred, top_attacks, label_mapping, name, figures_dir):
    """Plot and save the confusion matrix restricted to the most frequent attack classes."""
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_pred)

    # Keep only test rows whose true class is one of the top attacks.
    top_attacks_mask = np.isin(y_true, top_attacks)
    y_test_top = y_true[top_attacks_mask]
    y_pred_top = y_hat[top_attacks_mask]

    if len(y_test_top) == 0:
        return

    # Fix the class order explicitly and hand the same order to both the
    # matrix and the tick labels. Without `labels=`, confusion_matrix orders
    # rows by sorted class id (and adds any extra predicted class), which
    # does not line up with names listed in frequency order.
    present = set(np.unique(y_test_top).tolist())
    labels = sorted(c for c in top_attacks if c in present)
    class_names = [label_mapping[i] for i in labels]

    # Predictions outside `labels` are not shown in this restricted view.
    cm = confusion_matrix(y_test_top, y_pred_top, labels=labels)

    plt.figure(figsize=(14, 12))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'Confusion Matrix - {name} (Top Attack Types)')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(figures_dir, f'confusion_matrix_{_model_slug(name)}.png'))
    plt.close()


def train_and_evaluate_models(df, label_mapping, figures_dir, models_dir):
    """Train Random Forest, XGBoost and a neural network, and evaluate each on a held-out split.

    Args:
        df: Processed DataFrame containing the integer target column
            ``attack_encoded``; all other numeric columns are used as features.
        label_mapping: Mapping from encoded class id to original label name.
        figures_dir: Directory where confusion-matrix figures are saved.
        models_dir: Directory where the feature-name list, the scaler, the
            fitted models and the per-model classification reports are saved.

    Returns:
        The name of the model with the highest weighted F1 score, or ``None``
        when no model could be trained.
    """
    print("\n--- Training and Evaluating Models ---")

    # Features are all numeric columns except the target. 'attack_label' and
    # 'Label' are copies of the target; when they are numeric they would
    # otherwise slip in as features and leak the answer.
    X = df.select_dtypes(include=[np.number]).drop(
        columns=['attack_encoded', 'attack_label', 'Label'], errors='ignore')
    y = df['attack_encoded']

    feature_names = X.columns.tolist()

    # Persist the feature order so new data can be arranged the same way.
    with open(os.path.join(models_dir, 'feature_names.txt'), 'w') as f:
        f.writelines(f"{feature}\n" for feature in feature_names)

    # Split first, then fit the scaler on the training part only, so that no
    # statistics of the test rows influence the preprocessing.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    joblib.dump(scaler, os.path.join(models_dir, 'scaler.joblib'))

    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")

    # Only the selected models are evaluated (RF, XGBoost, NN).
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1),
        'XGBoost': XGBClassifier(n_estimators=50, max_depth=5, random_state=42, n_jobs=-1),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
    }

    results = {}
    training_times = {}

    # Loop-invariant pieces of the evaluation, computed once.
    top_attacks = df['attack_encoded'].value_counts().nlargest(10).index.tolist()
    report_labels = sorted(label_mapping.keys())
    report_names = [label_mapping[i] for i in report_labels]

    for name, model in models.items():
        print(f"\nTraining {name}...")
        start_time = time.time()

        # Best effort per model: a failure is reported and the remaining
        # models are still trained.
        try:
            model.fit(X_train, y_train)

            training_time = time.time() - start_time
            training_times[name] = training_time

            y_pred = model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')
            f1 = f1_score(y_test, y_pred, average='weighted')

            results[name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'training_time': training_time
            }

            print(f"{name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Training Time: {training_time:.2f}s")

            joblib.dump(model, os.path.join(models_dir, f"{_model_slug(name)}.joblib"))

            # Confusion matrix limited to the 10 most frequent classes for
            # readability.
            _save_top_confusion_matrix(
                y_test, y_pred, top_attacks, label_mapping, name, figures_dir)

            # Passing `labels` keeps target_names aligned with the class ids
            # even when some class from label_mapping does not occur in this
            # split (which would otherwise raise a length-mismatch error).
            report = classification_report(
                y_test, y_pred,
                labels=report_labels,
                target_names=report_names,
                output_dict=True,
                zero_division=0)

            report_df = pd.DataFrame(report).transpose()
            report_df.to_csv(os.path.join(models_dir, f'classification_report_{_model_slug(name)}.csv'))

        except Exception as e:
            print(f"Error training {name}: {e}")

    if not results:
        print("No models were successfully trained.")
        return None

    compare_models(results, training_times, figures_dir, models_dir)

    # The best model is the one with the highest weighted F1 score.
    best_model_name = max(results, key=lambda x: results[x]['f1_score'])
    print(f"\nBest model based on F1 score: {best_model_name}")

    # Feature importances are only analyzed for the tree-based models.
    if best_model_name in ['Random Forest', 'XGBoost']:
        try:
            # The estimator in `models` was fitted in place above, so it is
            # used directly instead of being re-loaded from disk.
            analyze_feature_importance(
                models[best_model_name], feature_names, best_model_name, figures_dir)
        except Exception as e:
            print(f"Error analyzing feature importance: {e}")

    return best_model_name

def compare_models(results, training_times, figures_dir, models_dir):
    """Plot and save a comparison of the trained models.

    Args:
        results: Mapping of model name to a dict with the keys ``accuracy``,
            ``precision``, ``recall``, ``f1_score`` and ``training_time``.
        training_times: Mapping of model name to training time in seconds.
        figures_dir: Directory where ``model_performance_comparison.png`` and
            ``model_training_time.png`` are saved.
        models_dir: Directory where ``model_performance_comparison.csv`` is
            saved.
    """
    print("\n--- Comparing Model Performance ---")

    # One row per model, one column per metric.
    results_df = pd.DataFrame(results).transpose()

    metrics = ['accuracy', 'precision', 'recall', 'f1_score']

    # Zoom into [0.7, 1.0] when every score fits there; otherwise widen the
    # axis so that bars of weaker models are not cut off entirely.
    lowest_score = float(results_df[metrics].min().min())
    y_lower = 0.7 if lowest_score >= 0.7 else max(0.0, lowest_score - 0.05)

    # Performance metrics plot
    fig, ax = plt.subplots(figsize=(12, 8))
    results_df[metrics].plot(kind='bar', ax=ax)
    ax.set_title('Model Performance Comparison')
    ax.set_xlabel('Model')
    ax.set_ylabel('Score')
    ax.set_ylim(y_lower, 1.0)
    ax.grid(axis='y', alpha=0.3)
    ax.legend(loc='lower right')
    fig.tight_layout()
    fig.savefig(os.path.join(figures_dir, 'model_performance_comparison.png'))
    plt.close(fig)

    # Training time plot (dict views are materialized as lists for plotting).
    plt.figure(figsize=(12, 6))
    plt.bar(list(training_times.keys()), list(training_times.values()), color='skyblue')
    plt.title('Model Training Time Comparison')
    plt.xlabel('Model')
    plt.ylabel('Training Time (seconds)')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(figures_dir, 'model_training_time.png'))
    plt.close()

    # Save the metric table; it is read back when the summary is written.
    results_df.to_csv(os.path.join(models_dir, 'model_performance_comparison.csv'))

    print("Model comparison visualization saved.")

def analyze_feature_importance(model, feature_names, model_name, figures_dir):
    """Plot the most important features of a fitted model.

    Args:
        model: Fitted estimator. Only estimators exposing a
            ``feature_importances_`` attribute are analyzed.
        feature_names: Feature names, in the order the model was trained on.
        model_name: Display name used in the plot title and log messages.
        figures_dir: Directory where ``top_features_importance.png`` is saved.
    """
    print("\n--- Analyzing Feature Importance ---")

    # Nothing to do for estimators without importances.
    if not hasattr(model, 'feature_importances_'):
        print(f"Feature importance analysis not applicable for {model_name}")
        return

    try:
        # Rank features from most to least important.
        ranked = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        })
        ranked = ranked.sort_values('importance', ascending=False)

        # Show at most the 20 strongest features.
        top_n = min(20, len(ranked))
        top_features = ranked.head(top_n)

        plt.figure(figsize=(12, 10))
        sns.barplot(x='importance', y='feature', data=top_features)
        plt.title(f'Top {top_n} Features by Importance ({model_name})')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig(os.path.join(figures_dir, 'top_features_importance.png'))
        plt.close()

        print(f"Feature importance analysis completed for {model_name}")
    except Exception as e:
        print(f"Error in feature importance visualization: {e}")

def create_model_summary(models_dir):
    """Write a plain-text summary of model performance and a recommendation.

    Reads ``model_performance_comparison.csv`` from *models_dir*, picks the
    model with the highest ``f1_score`` and writes ``model_summary.txt`` to
    the same directory. Problems are reported with a message instead of being
    raised, so a failed summary does not abort the caller.

    Args:
        models_dir: Directory holding the comparison CSV and the saved
            model, scaler and label-encoder files.
    """
    print("\n--- Creating Model Summary ---")

    try:
        model_performance_path = os.path.join(models_dir, 'model_performance_comparison.csv')
        if not os.path.exists(model_performance_path):
            print("Model performance comparison file not found.")
            return

        # The first CSV column holds the model names (the saved index).
        model_performance = pd.read_csv(model_performance_path, index_col=0)
        model_performance.index.name = 'Model'

        # Best model = highest F1 score.
        best_model = model_performance['f1_score'].idxmax()
        best_f1 = model_performance.loc[best_model, 'f1_score']
        best_accuracy = model_performance.loc[best_model, 'accuracy']

        # Point the usage instructions at the files that were actually
        # saved; model files are named after the lower-cased model name with
        # spaces replaced by underscores.
        model_path = os.path.join(
            models_dir, f"{str(best_model).replace(' ', '_').lower()}.joblib")
        scaler_path = os.path.join(models_dir, 'scaler.joblib')
        encoder_path = os.path.join(models_dir, 'label_encoder.joblib')

        lines = [
            "IoT Cyber Attack Classification Model Summary\n",
            "=============================================\n\n",
            "Model Performance Comparison:\n",
            "--------------------------\n",
            model_performance.to_string(),
            "\n\n",
            "Best Performing Model:\n",
            "-------------------\n",
            f"Model: {best_model}\n",
            f"F1 Score: {best_f1:.4f}\n",
            f"Accuracy: {best_accuracy:.4f}\n\n",
            "Recommendation:\n",
            "--------------\n",
            f"Based on the evaluation metrics, the {best_model} model is recommended for IoT cyber attack classification.\n",
            "This model provides the best balance between precision and recall (F1 score).\n\n",
            "Model Usage Instructions:\n",
            "-----------------------\n",
            "To use this model for prediction:\n",
            f"1. Load the model: model = joblib.load('{model_path}')\n",
            f"2. Load the scaler: scaler = joblib.load('{scaler_path}')\n",
            "3. Preprocess new data (same features as training data)\n",
            "4. Scale the features: X_scaled = scaler.transform(X_new)\n",
            "5. Make predictions: predictions = model.predict(X_scaled)\n",
            f"6. Convert numeric predictions to labels using the label encoder: joblib.load('{encoder_path}')\n",
        ]

        with open(os.path.join(models_dir, 'model_summary.txt'), 'w', encoding='utf-8') as f:
            f.write("".join(lines))

        print("Model summary created.")
    except Exception as e:
        print(f"Error creating model summary: {e}")

def main():
    """Run the whole pipeline: mount Drive, load, preprocess, train, summarize.

    Errors raised after the output directories exist are caught, reported
    with a traceback, and do not propagate.
    """
    print("Starting IoT Cyber Attack Classification Model Training")

    # Drive must be mounted before any path below it can be used.
    drive_root = mount_drive()
    figures_dir, models_dir = create_output_dirs(drive_root)

    try:
        raw_frame = load_data(drive_root)
        frame, label_mapping = preprocess_data(raw_frame, models_dir)

        # Cap the number of rows so training fits into Colab memory; adjust
        # the threshold to the RAM available.
        if len(frame) > 300000:
            print(f"Dataset is very large ({len(frame)} rows). Sampling 300,000 rows for model training.")
            frame = frame.sample(300000, random_state=42)

        winner = train_and_evaluate_models(frame, label_mapping, figures_dir, models_dir)

        create_model_summary(models_dir)

        print("\n--- Model Training and Evaluation Complete ---")
        if winner:
            print(f"Best model: {winner}")
        print(f"All models and visualizations have been saved to the '{models_dir}' and '{figures_dir}' directories.")

    except Exception as e:
        print(f"Error during model training and evaluation: {e}")
        import traceback
        traceback.print_exc()

# Run the pipeline only when this file is executed as a script (or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting IoT Cyber Attack Classification Model Training
Mounted at /content/drive
Google Drive mounted successfully.
Output directories '/content/drive/My Drive/Data_2023/Model_Figures_1' and '/content/drive/My Drive/Data_2023/Models_1' are ready.
Found 1 CSV files in the directory.
Loading Merged01.csv...
Combined dataset shape: (712311, 40)

--- Preprocessing Data ---
Initial data shape: (712311, 40)
Detected label column: Label
Total missing values: 22
Missing values handled. Remaining missing: 0
Processed data shape: (712311, 42)
Attack types: ['DDOS-PSHACK_FLOOD', 'MIRAI-GREIP_FLOOD', 'DOS-UDP_FLOOD', 'DNS_SPOOFING', 'DDOS-ICMP_FLOOD', 'DDOS-TCP_FLOOD', 'DDOS-SYN_FLOOD', 'DDOS-UDP_FLOOD', 'MITM-ARPSPOOFING', 'DDOS-SYNONYMOUSIP_FLOOD', 'DOS-TCP_FLOOD', 'VULNERABILITYSCAN', 'DOS-SYN_FLOOD', 'DDOS-RSTFINFLOOD', 'BENIGN', 'DDOS-SLOWLORIS', 'DDOS-ICMP_FRAGMENTATION', 'MIRAI-GREETH_FLOOD', 'RECON-HOSTDISCOVERY', 'MIRAI-UDPPLAIN', 'RECON-PORTSCAN', 'DDOS-ACK_FRAGMENTATION', 'DDOS-UDP_FRA