CS446 FINAL PROJECT - Machine Learning in Intrusion Detection Systems (IDS)

This project focuses on reproducing key findings from the study “Intrusion Detection Using Machine Learning: A Comparison Study” (Biswas, 2018), which analyzes how different feature selection methods and classifiers impact detection performance. By re-implementing and evaluating selected techniques, this project explores how machine learning can enhance intrusion detection.

Author(s): Jophene Campbell and Sophia Sasko

In [None]:

#Author(s): Jophene Campbell and Sophia Sasko

# Importing all the required libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing + feature selection
from sklearn.preprocessing import LabelEncoder, StandardScaler, KBinsDiscretizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import mutual_info_classif

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.base import clone

# Apply seaborn's default plot theme to every figure drawn in this notebook
sns.set()
# Shared experiment settings read by the cells below
RANDOM_STATE = 42   # Seed handed to the randomised estimators so runs are reproducible
TOP_K = 30          # Number of top-ranked features kept from each feature ranking
CV_FOLDS = 5        # Number of stratified cross-validation folds


In [None]:
#Author(s): Sophia Sasko
def load_nsl_kdd(path="./Dataset", train="KDDTrain+.txt", test="KDDTest+.txt",
                 nrows=5000):
    """Load the leading rows of the NSL-KDD train and test files as one DataFrame.

    Both files are parsed with pandas' default comma delimiter and without a
    header row. The two parts are stacked (train rows first), the last column
    (the difficulty score) is dropped, and the remaining columns are renamed
    ``f0 .. f{n-1}`` followed by ``label`` (41 features plus the label for the
    standard NSL-KDD layout).

    Args:
        path: Directory that contains both files.
        train: File name of the training split.
        test: File name of the test split.
        nrows: Number of rows read from the top of each file. The default of
            5000 gives the 10000-row sample used by the reference paper;
            ``None`` reads every row.

    Returns:
        A DataFrame with the feature columns and a final ``label`` column.
    """
    # Same parsing options for both splits; only the file name differs.
    frames = [
        pd.read_csv(os.path.join(path, name), header=None, nrows=nrows)
        for name in (train, test)
    ]

    # Stack train on top of test with a fresh 0..n-1 index.
    df = pd.concat(frames, ignore_index=True)

    # Remove the trailing difficulty column.
    df = df.iloc[:, :-1]

    # Every remaining column is a feature except the last one, which is the label.
    df.columns = [*(f"f{i}" for i in range(df.shape[1] - 1)), "label"]

    print("Loaded data:", df.shape)
    return df

# Load the combined dataset, then print a preview capped at 20 rows / 10 columns.
df = load_nsl_kdd()
preview_options = ("display.max_rows", 20, "display.max_columns", 10)
with pd.option_context(*preview_options):
    print(df)


In [None]:
#Author(s): Jophene Campbell and Sophia Sasko
def preprocess(df):
    """Turn the raw NSL-KDD frame into a scaled feature matrix and binary labels.

    Steps:
    - collapse the ``label`` column to "normal" vs "attack"
    - integer-encode every non-numeric feature column
      (protocol_type, service and flag in NSL-KDD)
    - standardise all features to zero mean and unit variance

    Args:
        df: DataFrame with feature columns and a ``label`` column. It is not
            modified; the function works on a copy.

    Returns:
        A tuple ``(X, y)``. ``X`` is a DataFrame of standardised features with
        the original column names and a fresh 0..n-1 index. ``y`` is a NumPy
        array of integer class codes; ``LabelEncoder`` sorts the class names,
        so "attack" is 0 and "normal" is 1 when both are present.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # A row is "normal" only when its label equals "normal" after trimming
    # surrounding whitespace and ignoring case; every other value is an attack.
    df["label"] = df["label"].apply(
        lambda x: "normal" if str(x).strip().lower() == "normal" else "attack"
    )

    X = df.drop(columns=["label"])   # all input features
    y = df["label"]                  # target label

    # Encode every column that is not numeric as integer codes. Selecting by
    # exclusion also catches text columns that pandas stores with a dedicated
    # string dtype instead of ``object``, which would otherwise reach the
    # scaler as raw text.
    for col in X.select_dtypes(exclude=["number"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    y = LabelEncoder().fit_transform(y)  # encode labels

    # Standardisation matters for distance-based (kNN) and kernel-based (SVM)
    # models: each feature becomes zero-mean and unit-variance.
    # NOTE: the scaler is fitted on every row passed in, so any train/test
    # split made afterwards sees features scaled with full-dataset statistics.
    X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

    print("Preprocessing complete.")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    return X, y  # features and labels


# Run the preprocessing on the loaded dataset; X and y feed the ranking and evaluation cells
X, y = preprocess(df)


In [None]:
#Author(s): Jophene Campbell
def igr_ranking(X, y):
    """Rank features by the score this notebook calls Information Gain Ratio (IGR).

    The mutual information between each feature and ``y`` is estimated with
    ``mutual_info_classif`` and divided by the sum of those estimates over all
    features, so every score is that feature's share of the total.
    A higher score means a more informative feature.

    NOTE(review): the normaliser is the total mutual information, not the
    per-feature split information, so the ordering equals the raw
    mutual-information ordering.

    Args:
        X: DataFrame of features.
        y: Class labels aligned with the rows of ``X``.

    Returns:
        A Series of scores indexed by feature name, highest first.
    """
    # Raw information-gain estimate per feature.
    relevance = mutual_info_classif(X.values, y, random_state=RANDOM_STATE)

    # The tiny constant keeps the division defined when every estimate is zero.
    total = relevance.sum() + 1e-9
    shares = relevance / total

    ranking = pd.Series(shares, index=X.columns)
    return ranking.sort_values(ascending=False)


In [None]:
#Author(s): Jophene Campbell
def cfs_ranking(X, y):
    """Rank features with an approximation of Correlation-Based Feature Selection.

    A feature scores highly when it has
    - high mutual information with the class label (relevance), and
    - low mean absolute correlation with the other features (redundancy).

    The score is ``relevance / (redundancy + 1e-9)``. The mean is taken over a
    full correlation row with the self-correlation entry set to 0.

    Features whose correlation with every feature is undefined (for example a
    constant column) get a score of 0; otherwise their near-zero redundancy
    would turn any non-zero relevance estimate into an enormous score.

    Args:
        X: DataFrame of numeric features.
        y: Class labels aligned with the rows of ``X``.

    Returns:
        A Series of scores indexed by feature name, highest first.
    """
    # Relevance: seeded so the ranking is reproducible, like igr_ranking.
    relevance = mutual_info_classif(X.values, y, random_state=RANDOM_STATE)

    # Redundancy: absolute pairwise correlation between features.
    corr = X.corr().abs()

    # A row that is entirely NaN belongs to a feature with no defined correlation.
    undefined = corr.isna().all(axis=1)

    # Ignore self-correlation. mask() builds a new frame instead of writing
    # into corr.values, which may be a read-only array.
    diagonal = np.eye(len(corr), dtype=bool)
    redundancy = corr.mask(diagonal, 0.0).mean(axis=1) + 1e-9

    # CFS score = relevance / redundancy
    scores = pd.Series(relevance, index=X.columns) / redundancy
    scores = scores.mask(undefined, 0.0)
    return scores.sort_values(ascending=False)


In [None]:
#Author(s): Jophene Campbell
# Score every feature with both ranking methods
igr_scores = igr_ranking(X, y)
cfs_scores = cfs_ranking(X, y)

# Both rankings are sorted best-first, so the first TOP_K index entries
# are the selected feature names
top_igr = igr_scores.index[:TOP_K].tolist()
top_cfs = cfs_scores.index[:TOP_K].tolist()

print("Top IGR features:", top_igr)
print("Top CFS features:", top_cfs)

In [None]:
#Author(s): Jophene Campbell and Sophia Sasko
def _nan_safe_mean(values):
    """Return the mean of the non-NaN entries, or NaN when there are none."""
    arr = np.asarray(values, dtype=float)
    arr = arr[~np.isnan(arr)]
    return arr.mean() if arr.size else np.nan


def _fold_auc(model, X_test, y_test):
    """Return the ROC AUC of a fitted binary model on one fold, or NaN if unavailable."""
    try:
        probs = model.predict_proba(X_test)
    except AttributeError:
        # The estimator does not expose probability estimates.
        return np.nan

    # AUC is only computed for the two-class case.
    if probs.shape[1] != 2:
        return np.nan

    try:
        return roc_auc_score(y_test, probs[:, 1])
    except ValueError:
        # roc_auc_score rejects inputs it cannot score; record the fold as missing.
        return np.nan


def evaluate(X, y, feature_sets, folds=CV_FOLDS, classifiers=None):
    """Cross-validate several classifiers on each selected feature subset.

    Every (feature set, classifier) pair is scored with shuffled Stratified
    K-Fold cross validation, and the fold scores are averaged.

    Default models:
      - kNN
      - Decision Tree
      - SVM

    Metrics:
      - Accuracy
      - Precision
      - Recall
      - F1
      - AUC

    Precision, recall and F1 use scikit-learn's default positive label, 1.
    NOTE(review): preprocess() label-encodes "attack"/"normal", which makes
    label 1 the "normal" class — confirm this is the intended positive class.

    Args:
        X: DataFrame of features; columns are selected by name.
        y: Binary class labels aligned with the rows of ``X``.
        feature_sets: Mapping of feature-set name to a list of column names.
        folds: Number of cross-validation folds.
        classifiers: Optional mapping of name to an unfitted scikit-learn
            estimator. When omitted, kNN, Decision Tree and RBF SVM are used.

    Returns:
        A DataFrame with one row per (feature_set, classifier) pair and the
        columns feature_set, classifier, accuracy, precision, recall, f1, auc.
        A metric is NaN when no fold produced a value for it.
    """
    if classifiers is None:
        classifiers = {
            "kNN": KNeighborsClassifier(n_neighbors=5),
            "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
            "SVM": SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE),
        }

    # Fold indices are positional, so index a plain array rather than a
    # pandas object whose own index could be interpreted as labels.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=RANDOM_STATE)
    metric_names = ("accuracy", "precision", "recall", "f1", "auc")
    rows = []

    for fs_name, features in feature_sets.items():
        print(f"\nEvaluating feature set: {fs_name}")
        X_sub = X[features].values

        for clf_name, clf in classifiers.items():
            print(f"  Running classifier: {clf_name}")

            per_fold = {name: [] for name in metric_names}

            # Cross-validation loop
            for tr, te in skf.split(X_sub, y):
                y_tr, y_te = y[tr], y[te]

                # Skip folds that collapse into one class
                if len(np.unique(y_tr)) < 2 or len(np.unique(y_te)) < 2:
                    continue

                # Fit a fresh copy so no state leaks between folds.
                model = clone(clf)
                model.fit(X_sub[tr], y_tr)
                pred = model.predict(X_sub[te])

                # Standard metrics
                per_fold["accuracy"].append(accuracy_score(y_te, pred))
                per_fold["precision"].append(precision_score(y_te, pred, zero_division=0))
                per_fold["recall"].append(recall_score(y_te, pred, zero_division=0))
                per_fold["f1"].append(f1_score(y_te, pred, zero_division=0))

                # Probability-based metric (AUC)
                per_fold["auc"].append(_fold_auc(model, X_sub[te], y_te))

            # Save averages across folds
            row = {"feature_set": fs_name, "classifier": clf_name}
            row.update((name, _nan_safe_mean(per_fold[name])) for name in metric_names)
            rows.append(row)

    # Convert to DataFrame
    results = pd.DataFrame(rows)
    print("\n=== FINAL RESULTS ===")
    try:
        # `display` is provided by IPython/Jupyter sessions only.
        display(results)
    except NameError:
        print(results)
    return results

# Cross-validate every classifier on the top-ranked IGR and CFS feature subsets
results = evaluate(X, y, {"IGR": top_igr, "CFS": top_cfs})


In [None]:
#Author(s): Jophene Campbell and Sophia Sasko

# Accuracy heatmap: one row per classifier, one column per feature selection method
pivot = results.pivot(index="classifier", columns="feature_set", values="accuracy")

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".3f", ax=ax)
ax.set_title("Accuracy by Classifier and Feature Selection")
plt.show()


In [None]:
#Author(s): Sophia Sasko
def plot_top(series, title, x_label, y_label, top_n=20):
    """Draw a horizontal bar chart of the first ``top_n`` scores in ``series``.

    Args:
        series: Scores indexed by feature name. The first ``top_n`` entries
            are plotted in their existing order, so pass a Series that is
            already sorted best-first.
        title: Figure title.
        x_label: Label for the score axis.
        y_label: Label for the feature-name axis.
        top_n: Number of leading entries to plot (default 20).
    """
    top = series.head(top_n)

    plt.figure(figsize=(8, 6))
    sns.barplot(x=top.values, y=top.index)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

# Draw one bar chart per ranking method from the scores computed earlier
plot_top(igr_scores, "Top IGR Features (Scores)", "Score", "IGR Features")
plot_top(cfs_scores, "Top CFS Features (Scores)", "Score", "CFS Features")