In [None]:
# prompt: Unzip this /content/preprocessed_data.zip

!unzip /content/preprocessed_data.zip -d /content/


Archive:  /content/preprocessed_data.zip
replace /content/content/preprocessed_data/y_train.npy? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: /content/content/preprocessed_data/y_train.npy  
  inflating: /content/content/preprocessed_data/X_test.npy  
  inflating: /content/content/preprocessed_data/selected_features.csv  
  inflating: /content/content/preprocessed_data/y_test.npy  
  inflating: /content/content/preprocessed_data/X_train.npy  
  inflating: /content/content/preprocessed_data/scaler.pkl  


# Phase 2: Model Development and Training for Intrusion Detection System
This notebook focuses on developing and training machine learning models using the **preprocessed CICIDS2017 dataset** to create an effective intrusion detection system with reduced false positives.

### 1. Import Required Libraries

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import time
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised
# by the libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Seeds NumPy's global random state.
np.random.seed(42)
import tensorflow as tf
# Seeds TensorFlow's global random state with the same value.
tf.random.set_seed(42)

### 2. Load Preprocessed Data

In [None]:
def load_preprocessed_data(data_dir='/content/content/preprocessed_data'):
    """
    Read the Phase 1 artifacts back from disk.

    Parameters:
    data_dir (str): Folder holding X_train.npy, X_test.npy, y_train.npy,
        y_test.npy, selected_features.csv and scaler.pkl

    Returns:
    tuple: (X_train, X_test, y_train, y_test, selected_features, scaler)
    """
    print("Loading preprocessed data...")

    # The four arrays share one naming scheme, so read them in a single pass
    array_names = ("X_train", "X_test", "y_train", "y_test")
    X_train, X_test, y_train, y_test = (
        np.load(f"{data_dir}/{array_name}.npy") for array_name in array_names
    )

    # Feature names are taken from the first column of the CSV
    feature_table = pd.read_csv(f"{data_dir}/selected_features.csv")
    selected_features = feature_table.iloc[:, 0].tolist()

    # The fitted scaler is restored with joblib
    scaler = joblib.load(f"{data_dir}/scaler.pkl")

    print(f"Loaded training data: {X_train.shape}")
    print(f"Loaded testing data: {X_test.shape}")
    print(f"Number of selected features: {len(selected_features)}")

    return X_train, X_test, y_train, y_test, selected_features, scaler

# Bring the Phase 1 artifacts into the notebook namespace
(X_train, X_test, y_train, y_test,
 selected_features, scaler) = load_preprocessed_data()

# Exactly two distinct training labels means binary; anything else is
# treated as multiclass by the rest of the notebook
n_classes = np.unique(y_train).size
if n_classes == 2:
    problem_type = "Binary Classification"
else:
    problem_type = "Multiclass Classification"
print(f"\nProblem type: {problem_type} with {n_classes} classes")

Loading preprocessed data...
Loaded training data: (1818082, 30)
Loaded testing data: (779179, 30)
Number of selected features: 30

Problem type: Binary Classification with 2 classes


### 3. Define Model Evaluation Functions

In [None]:
def _safe_rate(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


def evaluate_model(model, X_test, y_test, model_name, is_neural_network=False):
    """
    Evaluate a trained model on the test set with a focus on false positive reduction.

    Reads the notebook-level ``n_classes`` to choose between the binary and the
    multiclass metric path. Saves a confusion-matrix PNG (and, for binary
    problems with scores available, a ROC-curve PNG) in the working directory
    and prints a metric summary.

    Parameters:
    model: Trained model exposing ``predict`` (and optionally ``predict_proba``)
    X_test: Test features
    y_test: Test labels, either a vector of class ids or a one-hot matrix
        (one-hot input is collapsed to class ids before scoring)
    model_name: Name of the model for display purposes and output file names
    is_neural_network: Flag to indicate that ``model.predict`` returns
        probabilities rather than class ids

    Returns:
    dict: model_name, accuracy, precision, recall, f1_score,
        false_positive_rate, false_negative_rate, specificity, execution_time,
        plus 'auc' when a ROC curve could be computed. For multiclass problems
        precision/recall/F1 are weighted averages and the three rates are
        unweighted means of the per-class (one-vs-rest) values.
    """
    start_time = time.time()

    # The sklearn metrics below need a 1-D vector of class ids; a one-hot
    # matrix compared against argmax predictions raises a ValueError.
    y_test = np.asarray(y_test)
    if y_test.ndim == 2:
        y_test = np.argmax(y_test, axis=1) if y_test.shape[1] > 1 else y_test.ravel()

    # Make predictions
    if is_neural_network:
        y_prob = np.asarray(model.predict(X_test))
        if y_prob.ndim > 1 and y_prob.shape[1] > 1:  # One column per class
            y_pred = np.argmax(y_prob, axis=1)
        else:  # Single sigmoid output
            y_pred = (y_prob > 0.5).astype('int').flatten()
    else:
        y_pred = model.predict(X_test)
        # Models without predict_proba (e.g. SVM without probability=True)
        # simply get no ROC/AUC.
        y_prob = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    fpr, tpr, auc = None, None, None

    if n_classes == 2:  # Binary classification
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Rates are guarded the same way as in the multiclass branch so an
        # empty class yields 0 instead of NaN.
        tn, fp, fn, tp = cm.ravel()
        specificity = _safe_rate(tn, tn + fp)
        false_positive_rate = _safe_rate(fp, fp + tn)
        false_negative_rate = _safe_rate(fn, fn + tp)

        # ROC curve only for binary classification
        if y_prob is not None:
            y_prob = np.asarray(y_prob)
            if y_prob.ndim > 1 and y_prob.shape[1] > 1:
                positive_score = y_prob[:, 1]  # Column of the positive class
            else:
                positive_score = y_prob.ravel()  # (n, 1) sigmoid output -> (n,)
            fpr, tpr, _ = roc_curve(y_test, positive_score)
            auc = roc_auc_score(y_test, positive_score)
    else:  # Multiclass
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        # One-vs-rest counts for every class present in the confusion matrix
        tp_per_class = np.diag(cm)
        fp_per_class = cm.sum(axis=0) - tp_per_class
        fn_per_class = cm.sum(axis=1) - tp_per_class
        tn_per_class = cm.sum() - tp_per_class - fp_per_class - fn_per_class

        # Average the per-class rates
        false_positive_rate = np.mean(
            [_safe_rate(fp, fp + tn) for fp, tn in zip(fp_per_class, tn_per_class)]
        )
        false_negative_rate = np.mean(
            [_safe_rate(fn, fn + tp) for fn, tp in zip(fn_per_class, tp_per_class)]
        )
        specificity = np.mean(
            [_safe_rate(tn, tn + fp) for tn, fp in zip(tn_per_class, fp_per_class)]
        )
        # No simple ROC curve for multiclass: fpr/tpr/auc stay None

    file_stem = model_name.replace(" ", "_").lower()

    # Create confusion matrix plot
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(f'confusion_matrix_{file_stem}.png')
    plt.close(fig)

    # Create ROC curve for binary classification
    if fpr is not None and tpr is not None:
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curve - {model_name}')
        ax.legend(loc='lower right')
        fig.savefig(f'roc_curve_{file_stem}.png')
        plt.close(fig)

    # Calculate execution time (covers prediction, scoring and plotting)
    execution_time = time.time() - start_time

    # Compile and return results
    results = {
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'false_positive_rate': false_positive_rate,
        'false_negative_rate': false_negative_rate,
        'specificity': specificity,
        'execution_time': execution_time
    }

    if auc is not None:
        results['auc'] = auc

    # Print results
    print(f"\nEvaluation Results for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"False Negative Rate: {false_negative_rate:.4f}")
    print(f"Specificity: {specificity:.4f}")
    if auc is not None:
        print(f"AUC: {auc:.4f}")
    print(f"Execution Time: {execution_time:.2f} seconds")

    return results

def compare_models(results):
    """
    Compare multiple models based on their evaluation metrics.

    Builds a table sorted by false positive rate (ascending, ties broken by
    F1 score descending), saves two bar charts
    ('model_performance_comparison.png' and
    'false_positive_rate_comparison.png') and prints the ranked table.

    Parameters:
    results (list): List of dictionaries containing model evaluation results;
        each must provide 'model_name', 'accuracy', 'precision', 'recall',
        'f1_score' and 'false_positive_rate'

    Returns:
    pandas.DataFrame: The sorted comparison table with an added 'Rank' column.
        The original positional index is kept, so it is not 0..n-1 in order.

    Raises:
    ValueError: If ``results`` is empty.
    """
    if not results:
        raise ValueError("compare_models() needs at least one evaluation result")

    # Create DataFrame for easy manipulation
    comparison_df = pd.DataFrame({
        'Model': [r['model_name'] for r in results],
        'Accuracy': [r['accuracy'] for r in results],
        'Precision': [r['precision'] for r in results],
        'Recall': [r['recall'] for r in results],
        'F1 Score': [r['f1_score'] for r in results],
        'False Positive Rate': [r['false_positive_rate'] for r in results]
    })

    # Sort by false positive rate (ascending) and then by f1 score (descending)
    comparison_df = comparison_df.sort_values(by=['False Positive Rate', 'F1 Score'], ascending=[True, False])

    # Plot metrics comparison. The axes are created explicitly and handed to
    # pandas so that exactly one figure is opened and closed.
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    fig, ax = plt.subplots(figsize=(15, 6))
    comparison_df.set_index('Model')[metrics].plot(kind='bar', ax=ax)
    ax.set_title('Model Performance Comparison')
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)
    ax.legend(loc='lower right')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig('model_performance_comparison.png')
    plt.close(fig)

    # Plot false positive rate specifically
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.barplot(x='Model', y='False Positive Rate', data=comparison_df, ax=ax)
    ax.set_title('False Positive Rate Comparison')
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylabel('False Positive Rate')
    # Zoom the y-axis to 1.5x the worst rate for readability; fall back to the
    # full [0, 1] range when every rate is 0 (a zero-height axis is invalid).
    y_upper = min(1.0, comparison_df['False Positive Rate'].max() * 1.5)
    ax.set_ylim(0, y_upper if y_upper > 0 else 1.0)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig('false_positive_rate_comparison.png')
    plt.close(fig)

    # Print comparison table
    print("\nModel Comparison Table (sorted by False Positive Rate):")
    comparison_df['Rank'] = range(1, len(comparison_df) + 1)
    print(comparison_df.to_string(index=False, float_format='{:.4f}'.format))

    return comparison_df

### 4. Define and Train Traditional Machine Learning Models

In [None]:
def train_traditional_ml_models(X_train, y_train, X_test, y_test, svm_max_samples=50_000):
    """
    Train and evaluate multiple traditional machine learning models.

    Fits Random Forest, Gradient Boosting, SVM, KNN and Decision Tree in that
    order and scores each one with ``evaluate_model``.

    Parameters:
    X_train, y_train: Training data
    X_test, y_test: Testing data
    svm_max_samples (int or None): Upper bound on the number of training rows
        used to fit the SVM. Kernel SVM fit time grows at least quadratically
        with the sample count (and probability=True adds internal
        cross-validation), so fitting on the full training set is impractical
        for large data. A random subset drawn with a fixed seed is used when
        the training set is larger than this; pass None to use every row.

    Returns:
    tuple: (results, rf_model, gb_model, svm_model) where ``results`` is the
        list of evaluation dicts for all five models and the three fitted
        estimators are returned for later optimization
    """
    print("\nTraining traditional machine learning models...")

    # Insertion order defines the training order
    models = {
        # Good baseline for classification tasks
        "Random Forest": RandomForestClassifier(
            n_estimators=100,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        ),
        # Often performs well with imbalanced data
        "Gradient Boosting": GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        ),
        # Good for high-dimensional spaces
        "SVM": SVC(
            kernel='rbf',
            C=1.0,
            gamma='scale',
            probability=True,
            random_state=42
        ),
        # Simple but effective for some classification tasks
        "KNN": KNeighborsClassifier(
            n_neighbors=5,
            weights='distance',
            n_jobs=-1
        ),
        # Provides a baseline and feature importance
        "Decision Tree": DecisionTreeClassifier(
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42
        ),
    }

    results = []
    for name, model in models.items():
        print(f"\nTraining {name}...")

        X_fit, y_fit = X_train, y_train
        n_rows = len(X_train)
        if name == "SVM" and svm_max_samples is not None and n_rows > svm_max_samples:
            # A private RandomState keeps the draw reproducible without
            # disturbing the global NumPy seed set at the top of the notebook.
            rng = np.random.RandomState(42)
            subset = rng.choice(n_rows, size=svm_max_samples, replace=False)
            X_fit = np.asarray(X_train)[subset]
            y_fit = np.asarray(y_train)[subset]
            print(f"SVM: fitting on a random subset of {svm_max_samples} of {n_rows} training samples")

        model.fit(X_fit, y_fit)
        results.append(evaluate_model(model, X_test, y_test, name))

    return results, models["Random Forest"], models["Gradient Boosting"], models["SVM"]

### 5. Define and Train Neural Network

In [None]:
def create_neural_network(input_dim, output_dim):
    """
    Create a neural network architecture for intrusion detection.

    The network is a stack of three ReLU layers (128, 64, 32 units), each
    followed by dropout, and an output head that depends on ``output_dim``.

    Parameters:
    input_dim (int): Input dimension (number of features)
    output_dim (int): Output dimension (number of classes). The value 2
        selects a single sigmoid unit trained with binary cross-entropy; any
        other value selects ``output_dim`` softmax units trained with
        categorical cross-entropy (which expects one-hot targets).

    Returns:
    model: Compiled neural network model
    """
    # The input shape is declared with an explicit Input object instead of the
    # deprecated `input_dim` argument on the first Dense layer.
    model = Sequential([
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
    ])

    # Output layer
    if output_dim == 2:  # Binary classification
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'
    else:  # Multiclass classification
        model.add(Dense(output_dim, activation='softmax'))
        loss = 'categorical_crossentropy'

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=loss,
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )

    return model

def train_neural_network(X_train, y_train, X_test, y_test):
    """
    Train a neural network for intrusion detection.

    Reads the notebook-level ``n_classes`` to decide between the binary and
    the multiclass setup. Saves the best checkpoint to
    'ids_neural_network.keras' and the learning curves to
    'neural_network_training_history.png'.

    Parameters:
    X_train, y_train: Training data (``y_train`` holds class ids)
    X_test, y_test: Testing data (``y_test`` holds class ids)

    Returns:
    tuple: (results, model)
    """
    print("\nTraining Neural Network...")

    # Prepare data for neural network
    input_dim = X_train.shape[1]

    if n_classes == 2:  # Binary classification
        output_dim = 2  # For architecture purposes
        y_train_nn = y_train  # Sigmoid head trains directly on the class ids
    else:  # Multiclass classification
        output_dim = n_classes
        y_train_nn = to_categorical(y_train)  # Softmax head needs one-hot targets

    # Create and compile the model
    nn_model = create_neural_network(input_dim, output_dim)

    # Define callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # Set up model checkpoint to save the best model
    checkpoint_path = "ids_neural_network.keras"
    model_checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )

    # Train the model
    # NOTE(review): validation_split holds out a fixed slice of the arrays as
    # passed in - presumably Phase 1 already shuffled them; confirm.
    history = nn_model.fit(
        X_train, y_train_nn,
        epochs=30,
        batch_size=128,
        validation_split=0.2,
        callbacks=[early_stopping, model_checkpoint],
        verbose=1
    )

    # Plot training history
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig('neural_network_training_history.png')
    plt.close()

    # Evaluate the model. Always pass the integer class ids: evaluate_model
    # turns the network output into class ids before scoring, and comparing
    # those against one-hot labels makes the sklearn metrics raise.
    nn_results = evaluate_model(nn_model, X_test, y_test, "Neural Network", is_neural_network=True)

    return nn_results, nn_model


### 6. Optimize Best Performing Model

In [None]:
def optimize_best_model(best_model_type, X_train, y_train, X_test, y_test):
    """
    Optimize hyperparameters for the best performing model to further reduce false positives.

    Runs a 10-candidate, 3-fold randomized search that maximises precision
    (weighted precision when ``y_train`` has more than two classes) and
    evaluates the refitted best estimator on the test set.

    Parameters:
    best_model_type (str): Type of the best performing model; one of
        "Random Forest", "Gradient Boosting" or "SVM". Any other value falls
        back to Random Forest, and the result is labelled accordingly.
    X_train, y_train: Training data
    X_test, y_test: Testing data

    Returns:
    tuple: (optimized_model, results)
    """
    print(f"\nOptimizing {best_model_type} model...")

    # One (estimator, search space) pair per supported model type
    search_spaces = {
        "Random Forest": (
            RandomForestClassifier(random_state=42, n_jobs=-1),
            {
                'n_estimators': [100, 200, 300],
                'max_depth': [20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'class_weight': [None, 'balanced']
            },
        ),
        "Gradient Boosting": (
            GradientBoostingClassifier(random_state=42),
            {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.05, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_samples_split': [2, 5, 10],
                'subsample': [0.8, 0.9, 1.0]
            },
        ),
        "SVM": (
            SVC(probability=True, random_state=42),
            {
                'C': [0.1, 1, 10],
                'gamma': ['scale', 'auto', 0.1, 1],
                'kernel': ['rbf', 'poly'],
                'class_weight': [None, 'balanced']
            },
        ),
    }

    if best_model_type not in search_spaces:
        # If not one of the expected models, default to Random Forest
        print(f"Unexpected model type: {best_model_type}. Defaulting to Random Forest optimization.")
        # Rebind so the evaluation below is labelled with the model that was
        # actually tuned rather than with the unknown name.
        best_model_type = "Random Forest"

    base_model, param_grid = search_spaces[best_model_type]

    # Plain 'precision' is defined for binary targets only; multiclass targets
    # need an averaged variant. Weighted averaging matches evaluate_model.
    scoring = 'precision_weighted' if len(np.unique(y_train)) > 2 else 'precision'

    # Use RandomizedSearchCV to optimize hyperparameters
    # This is faster than GridSearchCV with similar results when the parameter space is large
    random_search = RandomizedSearchCV(
        base_model,
        param_distributions=param_grid,
        n_iter=10,  # Number of parameter settings sampled
        scoring=scoring,  # Focus on precision to reduce false positives
        cv=3,  # 3-fold cross-validation
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    # Run the search
    random_search.fit(X_train, y_train)

    # Get best parameters and model
    best_params = random_search.best_params_
    print(f"Best parameters: {best_params}")

    # best_estimator_ is the search's winning estimator; no separate retraining
    # is done here
    optimized_model = random_search.best_estimator_

    # Evaluate the optimized model
    optimized_results = evaluate_model(optimized_model, X_test, y_test, f"Optimized {best_model_type}")

    return optimized_model, optimized_results


### 7. Execute the Training and Evaluation Pipeline

In [None]:
# Fit and score the classical models
traditional_results, rf_model, gb_model, svm_model = train_traditional_ml_models(
    X_train, y_train, X_test, y_test
)

# Fit and score the neural network
nn_results, nn_model = train_neural_network(X_train, y_train, X_test, y_test)

# One list holding every evaluation so far
all_results = [*traditional_results, nn_results]

# Rank all models (lowest false positive rate first)
comparison_df = compare_models(all_results)

# Overall winner by false positive rate
best_model_idx = comparison_df['False Positive Rate'].idxmin()
best_model_name = comparison_df.loc[best_model_idx, 'Model']

print(f"\nBest model based on false positive rate: {best_model_name}")

# Lookup from display name to fitted estimator
model_mapping = dict([
    ("Random Forest", rf_model),
    ("Gradient Boosting", gb_model),
    ("SVM", svm_model),
    ("Neural Network", nn_model),
])

# Only the non-neural models in the mapping are candidates for tuning
tunable_names = [
    name for name in model_mapping
    if name != "Neural Network" and name in comparison_df['Model'].values
]
tunable_rows = comparison_df[comparison_df['Model'].isin(tunable_names)]
best_traditional_model_name = tunable_rows.sort_values('False Positive Rate').iloc[0]['Model']

# Tune the best traditional model and re-rank with its result included
if best_traditional_model_name not in model_mapping:
    print(f"Could not optimize model: {best_traditional_model_name} not found in model mapping")
    final_best_model_name = best_model_name
else:
    optimized_model, optimized_results = optimize_best_model(
        best_traditional_model_name,
        X_train, y_train,
        X_test, y_test
    )

    # Add the tuned model to the results and rebuild the comparison
    all_results.append(optimized_results)
    final_comparison = compare_models(all_results)

    # Final winner by false positive rate
    final_best_model_idx = final_comparison['False Positive Rate'].idxmin()
    final_best_model_name = final_comparison.loc[final_best_model_idx, 'Model']

    print(f"\nFinal best model: {final_best_model_name}")



Training traditional machine learning models...

Training Random Forest...

Evaluation Results for Random Forest:
Accuracy: 0.9977
Precision: 0.9968
Recall: 0.9978
F1 Score: 0.9973
False Positive Rate: 0.0024
False Negative Rate: 0.0022
Specificity: 0.9976
AUC: 0.9999
Execution Time: 11.06 seconds

Training Gradient Boosting...

Evaluation Results for Gradient Boosting:
Accuracy: 0.9965
Precision: 0.9948
Recall: 0.9972
F1 Score: 0.9960
False Positive Rate: 0.0039
False Negative Rate: 0.0028
Specificity: 0.9961
AUC: 0.9998
Execution Time: 5.71 seconds

Training SVM...
