In [1]:
pip install docx

Collecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx
  Building wheel for docx (setup.py) ... [?25l[?25hdone
  Created wheel for docx: filename=docx-0.2.4-py3-none-any.whl size=53893 sha256=e136059b30effe152a39097d679f28578c92991a275830b58497f2c9065d83dc
  Stored in directory: /root/.cache/pip/wheels/c1/3e/c3/e81c11effd0be5658a035947c66792dd993bcff317eae0e1ed
Successfully built docx
Installing collected packages: docx
Successfully installed docx-0.2.4


In [2]:
!pip install python-docx


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [5]:
import pandas as pd
import numpy as np
import requests
from io import BytesIO
from docx import Document
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, GlobalAveragePooling1D, Layer
from tensorflow.keras.models import Model
#Get Feature Names from .docx

In [6]:
#  Load Dataset
def load_dataset(name):
    if name == "NSL-KDD":
        url = "/content/KDDTrain+.txt"
        df = pd.read_csv(url, header=None)

        #col_names = get_feature_names_from_docx()
        col_names = [
    'duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent',
    'hot','num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root',
    'num_file_creations','num_shells','num_access_files','num_outbound_cmds','is_host_login',
    'is_guest_login','count','srv_count','serror_rate','srv_serror_rate','rerror_rate',
    'srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count',
    'dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate',
    'dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate',
    'class', 'difficulty'
]
        df.columns = col_names

        df.drop_duplicates(inplace=True)
        df['class'] = df['class'].apply(lambda x: 'normal' if x == 'normal' else 'attack')

    elif name == "CICIDS2017":
        df = pd.read_csv("/content/combine.csv")  # Provide cleaned CSV locally
        columns_to_fix = [' Destination Port']  # Add others if needed
        for col in columns_to_fix:
          df[col] = pd.to_numeric(df[col], errors='coerce')

        df.dropna(subset=columns_to_fix, inplace=True)
        df['class'] = df[' Label'].apply(lambda x: 'normal' if x == 'BENIGN' else 'attack')
        df.drop(columns=[' Label'], inplace=True)

    elif name == "UNSW-NB15":
        df = pd.read_csv("/content/UNSW-NB15_1.csv")  # Provide cleaned CSV locally
        df = df.dropna().drop_duplicates()
        df['class'] = df['label'].apply(lambda x: 'normal' if x == 0 else 'attack')
        df.drop(columns=['label'], inplace=True)



    else:
        raise ValueError("Unknown dataset name")

    # Label encode categorical columns
    for col in df.select_dtypes(include=['object']).columns:
        if col != 'class':
            df[col] = LabelEncoder().fit_transform(df[col])

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Then fill or drop NaNs
    df.fillna(df.median(numeric_only=True), inplace=True)
    X = df.drop('class', axis=1)
    y = LabelEncoder().fit_transform(df['class'])
    # Replace infinite values with NaN


    # Normalize features
    X_scaled = StandardScaler().fit_transform(X)
    X_scaled = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)

    return train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [7]:
#  Custom Attention Layer

class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_model):
        super(SelfAttention, self).__init__()
        self.query_dense = tf.keras.layers.Dense(d_model)
        self.key_dense = tf.keras.layers.Dense(d_model)
        self.value_dense = tf.keras.layers.Dense(d_model)
        self.softmax = tf.keras.layers.Softmax(axis=-1)

    def call(self, inputs):
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        score = tf.matmul(query, key, transpose_b=True)
        weights = self.softmax(score)
        output = tf.matmul(weights, value)
        return output

In [8]:
#  Build Model

def build_model(input_shape, num_classes):
    input_layer = Input(shape=input_shape)
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same')(input_layer)
    x = MaxPooling1D(pool_size=2)(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = SelfAttention(64)(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(64, activation='relu')(x)

    x = Dropout(0.3)(x)


    output_layer = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [9]:
# Train and Evaluate

def train_and_evaluate(dataset_name):
    print(f"\nTraining on {dataset_name}...\n")

    # Load dataset
    X_train, X_test, y_train, y_test = load_dataset(dataset_name)

    # Number of classes in the dataset
    num_classes = len(np.unique(y_train))

    # Convert labels to categorical (one-hot encoding)
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    y_test_cat = to_categorical(y_test, num_classes=num_classes)

    # Build model
    model = build_model(input_shape=X_train.shape[1:], num_classes=num_classes)

    # Model training
    model.fit(X_train, y_train_cat, epochs=15, batch_size=128, validation_split=0.1, verbose=1)

    # Model prediction
    y_pred_probs = model.predict(X_test)  # Probabilities for AUC, etc.
    y_pred_classes = np.argmax(y_pred_probs, axis=1)  # Predicted class labels

    # Classification Report and Confusion Matrix
    print(f"Classification Report for {dataset_name}:\n", classification_report(y_test, y_pred_classes))
    print(f"Confusion Matrix:\n", confusion_matrix(y_test, y_pred_classes))

    #  Metrics
    acc = accuracy_score(y_test, y_pred_classes)
    prec = precision_score(y_test, y_pred_classes, average='weighted')
    rec = recall_score(y_test, y_pred_classes, average='weighted')
    f1 = f1_score(y_test, y_pred_classes, average='weighted')
    auc = roc_auc_score(y_test_cat, y_pred_probs, multi_class='ovr', average='weighted')  # AUC for multi-class

    cm = confusion_matrix(y_test, y_pred_classes)
    FP = cm.sum(axis=0) - np.diag(cm)  # False Positives
    FN = cm.sum(axis=1) - np.diag(cm)  # False Negatives
    TP = np.diag(cm)  # True Positives
    TN = cm.sum() - (FP + FN + TP)  # True Negatives

    FPR = FP / (FP + TN + 1e-10)
    TNR = TN / (TN + FP + 1e-10)

    print("\n Evaluation Metrics:")
    print(f" Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"AUC:       {auc:.4f}")
    print(f"Avg FPR:   {np.mean(FPR):.4f}")
    print(f"Avg TNR:   {np.mean(TNR):.4f}")

#  Run on NSL-KDD
train_and_evaluate("CICIDS2017")



Training on CICIDS2017...



  df = pd.read_csv("/content/combine.csv")  # Provide cleaned CSV locally


Epoch 1/15
[1m12457/12457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 12ms/step - accuracy: 0.9401 - loss: 0.1352 - val_accuracy: 0.9723 - val_loss: 0.0609
Epoch 2/15
[1m12457/12457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 12ms/step - accuracy: 0.9793 - loss: 0.0497 - val_accuracy: 0.9832 - val_loss: 0.0401
Epoch 3/15
[1m12457/12457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 11ms/step - accuracy: 0.9826 - loss: 0.0419 - val_accuracy: 0.9830 - val_loss: 0.0393
Epoch 4/15
[1m12457/12457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 12ms/step - accuracy: 0.9834 - loss: 0.0391 - val_accuracy: 0.9836 - val_loss: 0.0387
Epoch 5/15
[1m12457/12457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 11ms/step - accuracy: 0.9840 - loss: 0.0374 - val_accuracy: 0.9847 - val_loss: 0.0352
Epoch 6/15
[1m12457/12457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 12ms/step - accuracy: 0.9844 - loss: 0.0361 - val_accuracy: 0.9850 - val