<a href="https://colab.research.google.com/github/ASR16arif/Machine-Learning/blob/main/CS_NIDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.utils import to_categorical

In [2]:
# Install kaggle and required dependencies
!pip install kaggle



In [3]:
from google.colab import files

In [4]:
# Upload the Kaggle credentials
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"asrarif","key":"be07789079edad86d416ac2615c24996"}'}

In [5]:
# Create a directory and copy credentials
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Download the dataset from Kaggle
! kaggle datasets download -d galaxyh/kdd-cup-1999-data

Dataset URL: https://www.kaggle.com/datasets/galaxyh/kdd-cup-1999-data
License(s): unknown
Downloading kdd-cup-1999-data.zip to /content
 77% 68.0M/87.8M [00:00<00:00, 100MB/s]
100% 87.8M/87.8M [00:00<00:00, 116MB/s]


In [7]:
# Download the dataset from Kaggle
! kaggle datasets download -d solarmainframe/ids-intrusion-csv

Dataset URL: https://www.kaggle.com/datasets/solarmainframe/ids-intrusion-csv
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading ids-intrusion-csv.zip to /content
100% 1.60G/1.60G [00:22<00:00, 115MB/s] 
100% 1.60G/1.60G [00:22<00:00, 77.5MB/s]


In [8]:
# Unzip the dataset
!unzip kdd-cup-1999-data.zip

Archive:  kdd-cup-1999-data.zip
  inflating: corrected.gz            
  inflating: corrected/corrected     
  inflating: kddcup.data.corrected   
  inflating: kddcup.data.gz          
  inflating: kddcup.data/kddcup.data  
  inflating: kddcup.data_10_percent.gz  
  inflating: kddcup.data_10_percent/kddcup.data_10_percent  
  inflating: kddcup.data_10_percent_corrected  
  inflating: kddcup.names            
  inflating: kddcup.newtestdata_10_percent_unlabeled.gz  
  inflating: kddcup.newtestdata_10_percent_unlabeled/kddcup.newtestdata_10_percent_unlabeled  
  inflating: kddcup.testdata.unlabeled.gz  
  inflating: kddcup.testdata.unlabeled/kddcup.testdata.unlabeled  
  inflating: kddcup.testdata.unlabeled_10_percent.gz  
  inflating: kddcup.testdata.unlabeled_10_percent/kddcup.testdata.unlabeled_10_percent  
  inflating: training_attack_types   
  inflating: typo-correction.txt     


In [9]:
# Unzip the dataset
!unzip ids-intrusion-csv.zip

Archive:  ids-intrusion-csv.zip
  inflating: 02-14-2018.csv          
  inflating: 02-15-2018.csv          
  inflating: 02-16-2018.csv          
  inflating: 02-20-2018.csv          
  inflating: 02-21-2018.csv          
  inflating: 02-22-2018.csv          
  inflating: 02-23-2018.csv          
  inflating: 02-28-2018.csv          
  inflating: 03-01-2018.csv          
  inflating: 03-02-2018.csv          


In [10]:
# Load the KDD CUP 1999 dataset
kdd_df = pd.read_csv('kddcup.data.corrected', header=None)

In [11]:
kdd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [12]:
#Reset column names for KDD CUP 1999 dataset
kdd_df.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
'num_access_files', 'num_outbound_cmds', 'is_host_login',
'is_guest_login', 'count', 'srv_count', 'serror_rate',
'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
'dst_host_srv_count', 'dst_host_same_srv_rate','dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
'dst_host_srv_rerror_rate', 'label']

In [13]:
kdd_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [14]:
# Load the CSE-CIC-IDS2018 dataset
cic_df = pd.read_csv('03-02-2018.csv')

In [15]:
cic_df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773.0,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0.0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527.0,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,02/03/2018 08:47:40,132,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141.0,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [16]:
# Select and rename columns for consistency with KDD CUP 1999 dataset
cic_df = cic_df[['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
                 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean',
                 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Label']]


In [17]:
cic_df.columns = ['duration', 'src_bytes', 'dst_bytes', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts',
                  'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean',
                  'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'label']

In [18]:
cic_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,label
0,141385,9,7,553,3773.0,202,0,61.444444,1460,0,539.0,Benign
1,281,2,1,38,0.0,38,0,19.0,0,0,0.0,Benign
2,279824,11,15,1086,10527.0,385,0,98.727273,1460,0,701.8,Benign
3,132,2,0,0,0.0,0,0,0.0,0,0,0.0,Benign
4,274016,9,13,1285,6141.0,517,0,142.777778,1460,0,472.384615,Benign


In [19]:
# Encode categorical features in the KDD CUP 1999 dataset
le = LabelEncoder()
kdd_df['protocol_type'] = le.fit_transform(kdd_df['protocol_type'])
kdd_df['service'] = le.fit_transform(kdd_df['service'])
kdd_df['flag'] = le.fit_transform(kdd_df['flag'])
kdd_df['label'] = le.fit_transform(kdd_df['label'])

In [20]:
# Encode labels for consistency
cic_df['label'] = le.fit_transform(cic_df['label'])

In [21]:
# Fill missing values in CSE-CIC-IDS2018 dataset with 0
cic_df.fillna(0, inplace=True)

In [22]:
# Select common columns between both datasets for merging
common_columns = ['duration', 'src_bytes', 'dst_bytes', 'label']

In [23]:
# Filter both datasets by common columns
kdd_df_common = kdd_df[common_columns]
cic_df_common = cic_df[common_columns]

In [24]:
# Combine the datasets
combined_df = pd.concat([kdd_df_common, cic_df_common], ignore_index=True)

In [41]:
combined_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,label
0,0,215,45076,11
1,0,162,4528,11
2,0,236,1228,11
3,0,233,2032,11
4,0,239,486,11


In [25]:
# Separate features and target variable
X = combined_df.drop('label', axis=1)
y = combined_df['label']

In [26]:
# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [27]:
# Convert the target to categorical for multiclass classification
y = to_categorical(y)

In [28]:
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Reshape data to fit Conv1D input requirements (samples, timesteps, features)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [30]:
# Build the CNN model
model = Sequential()


In [31]:
# Input Layer with Conv1D layer using kernel size of 1 to avoid shape mismatch issues
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [32]:
# Add a second convolutional layer with a kernel size of 1
model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))


In [33]:
# Dropout Layer
model.add(Dropout(0.25))

In [34]:
# Flatten layer to convert data for Dense layers
model.add(Flatten())

In [35]:
# Dense Layer
model.add(Dense(128, activation='relu'))

In [36]:
# Dropout Layer to avoid overfitting
model.add(Dropout(0.5))

In [37]:
# Output Layer for classification
model.add(Dense(y_train.shape[1], activation='softmax'))

In [38]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [39]:
# Model summary
model.summary()

In [40]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
[1m 69870/148676[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m5:16[0m 4ms/step - accuracy: 0.7904 - loss: 1.0623

KeyboardInterrupt: 

In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping
from PIL import Image
import io
from IPython.display import display

In [None]:
# Evaluate model performance
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

In [None]:
# Predict the classes for the test data
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=-1)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Confusion matrix
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))

In [None]:
# Normalize the confusion matrix
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

In [None]:
cm_normalized = np.random.rand(9, 9)  # Example: Random data for 9x9 matrix
cm_normalized = cm_normalized / cm_normalized.sum(axis=1)[:, np.newaxis]  # Normalize

In [None]:
# Define the labels for the 9 classes
labels = [f'Class_{i}' for i in range(1, 10)]

In [None]:
plt.figure(figsize=(8, 6))  # Adjust figure size for 9x9 matrix
sns.heatmap(cm_normalized, annot=True, fmt='.3f', cmap='Blues',
            xticklabels=labels,
            yticklabels=labels,
            annot_kws={"size": 8},  # Decrease annotation font size
            cbar_kws={"shrink": .75})  # Shrink color bar

plt.title('Normalized Confusion Matrix', fontsize=10)
plt.xlabel('Predicted Label', fontsize=8)
plt.ylabel('True Label', fontsize=8)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.tight_layout()  # Adjust layout
plt.show()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Assuming y_pred contains probabilities for 23 labels
threshold = 0.5  # Set a threshold for classification
y_pred_binary = (y_pred > threshold).astype(int) # Convert probabilities to binary values


In [None]:
# Precision score
precision = precision_score(y_true, y_pred_classes, average='macro')
print(f"Precision Score: {precision}")

In [None]:
# Correlation matrix of features
df_features = pd.DataFrame(X_train.reshape(X_train.shape[0], -1))
corr_matrix = df_features.corr()

In [None]:
# Plot the correlation matrix
plt.figure(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Matrix of Features')
plt.show()

In [None]:
import matplotlib.image as mpimg
from matplotlib.patches import Rectangle
from matplotlib.patches import FancyBboxPatch, Arrow

import keras

In [None]:
def draw_cnn_layer(ax, center, width, height, n_dots, label, color, alpha=0.1):
    # Draw transparent rectangle representing the layer
    rect = FancyBboxPatch((center[0] - width/2, center[1] - height/2), width, height,
                          boxstyle="round,pad=0.1", edgecolor=color, facecolor=color, lw=2, alpha=alpha)
    ax.add_patch(rect)

    # Scatter dots to represent the features
    x_dots = np.random.uniform(center[0] - width/2 + 0.1, center[0] + width/2 - 0.1, n_dots)
    y_dots = np.random.uniform(center[1] - height/2 + 0.1, center[1] + height/2 - 0.1, n_dots)
    ax.scatter(x_dots, y_dots, s=10, color=color, alpha=0.7)

    # Add the label for the layer
    ax.text(center[0], center[1] + height/2 + 0.3, label, ha='center', va='center', fontsize=10, color=color)

def draw_cnn_architecture():
    fig, ax = plt.subplots(figsize=(10, 6))

    # Layer properties
    layer_width = 2
    layer_height = 0.8
    n_dots = 30

    # Draw the input layer on the left
    input_center = (-4, 0)
    draw_cnn_layer(ax, input_center, layer_width, layer_height, n_dots, "Input Layer", 'blue', alpha=0.3)

    # Draw the CNN layers
    layers_centers = [(x, 0) for x in np.arange(-2, 4, 2)]  # Position layers horizontally
    labels = ["Conv Layer 1", "Pooling Layer 1", "Conv Layer 2", "Pooling Layer 2"]
    colors = ['orange', 'green', 'orange', 'green']

    for center, label, color in zip(layers_centers, labels, colors):
        draw_cnn_layer(ax, center, layer_width, layer_height, n_dots, label, color, alpha=0.3)

    # Draw the fully connected and output layers on the right
    fully_connected_center = (6, 0)
    output_center = (8, 0)

    draw_cnn_layer(ax, fully_connected_center, layer_width, layer_height, n_dots, "Fully Connected", 'purple', alpha=0.3)
    draw_cnn_layer(ax, output_center, layer_width, layer_height, n_dots, "Output Layer", 'red', alpha=0.3)

    # Draw arrows between the layers to show data flow
    for i in range(len(layers_centers) + 1):
        ax.arrow(input_center[0] + i*2 + 0.8, 0, 0.4, 0, head_width=0.1, head_length=0.2, fc='black', ec='black')

    # Arrow between the fully connected and output layer
    ax.arrow(fully_connected_center[0] + 0.8, 0, 0.4, 0, head_width=0.1, head_length=0.2, fc='black', ec='black')

    # Add labels for the input and output comparison
    plt.text(input_center[0], -0.8, "Input Features", fontsize=12, ha='center', color='blue')
    plt.text(output_center[0], -0.8, "Predicted Classes", fontsize=12, ha='center', color='red')

    # Remove axis
    ax.axis('off')

    # Show the plot
    plt.title("Simplified CNN Architecture", fontsize=16)
    plt.show()

# Call the function to draw the CNN architecture
draw_cnn_architecture()

In [None]:
# Frequency Distributions of Scaled Features of Testing Set
plt.figure(figsize=(12, 10))
for i, feature in enumerate(combined_df.columns[:-1]):
    plt.subplot(3, 3, i+1)
    plt.hist(X_test[:, i], bins=20, color='blue', alpha=0.7)
    plt.title(f'{feature}')
    plt.tight_layout()

plt.show()

In [None]:
# Correlation Matrix
corr_matrix = pd.DataFrame(X_test.reshape(X_test.shape[0], X_test.shape[1]), columns=combined_df.columns[:-1]).corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='Blues')
plt.title('Correlation Matrix of Scaled Features')
plt.show()

In [None]:
# Plotting the training/validation accuracy and loss

# Accuracy
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

In [None]:
# Display the plots
plt.tight_layout()
plt.show()