# CIC-IDS


All code in this notebook is my own unless otherwise stated.

In [None]:
import os
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, BatchNormalization, Input, Add, GlobalAveragePooling1D, ReLU
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [34]:
def lr_scheduler(epoch, lr):
    """Return the learning rate for the next epoch: a 5% decay with a floor.

    Args:
        epoch: Index of the epoch about to start. It is accepted but not used
            by the computation.
        lr: Learning rate currently in effect.

    Returns:
        ``lr * 0.95``, or ``1e-6`` when the decayed value would be smaller
        than that floor.
    """
    floor = 1e-6  # Prevents learning rate from going too low
    decayed = lr * 0.95
    return floor if floor > decayed else decayed

In [35]:
# Load dataset
# Flow CSVs that make up the working set. All of them live in one directory,
# given relative to the notebook's working directory.
# NOTE(review): six capture files are listed -- confirm that no other capture
# days are supposed to be part of the corpus.
DATA_DIR = "data/CIC-2017"
CAPTURE_FILES = (
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
)
dataset_files = [f"{DATA_DIR}/{capture}" for capture in CAPTURE_FILES]

# Holds one DataFrame per CSV once the files have been read.
df_list = []

In [36]:
# Concatenate all datasets
# Rebuild the per-file list from scratch instead of appending to whatever an
# earlier run left in `df_list`; otherwise re-running this cell would load
# every CSV a second time and silently duplicate all rows.
df_list = [pd.read_csv(file) for file in dataset_files]

# ignore_index gives the combined frame a unique 0..n-1 index rather than
# repeating each file's own row numbers.
df = pd.concat(df_list, ignore_index=True)

# The raw CSV header spells the target column with a leading space.
df = df.rename(columns={" Label": "Label"})

In [37]:
# Collapse the textual traffic label into a binary target:
# 0 for "BENIGN" (ignoring surrounding whitespace), 1 for everything else.
df['Label'] = df['Label'].map(lambda raw: 0 if raw.strip() == "BENIGN" else 1)

In [38]:
# Handle missing and infinite values
df.replace('Infinity', -1, inplace=True)
df.replace([np.inf, -np.inf], -1, inplace=True)
df.fillna(df.max().max(), inplace=True)

In [39]:
# Encoding
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
joblib.dump(label_encoders, 'label_encoders.pkl')

['label_encoders.pkl']

In [40]:
# Separate the binary target from the feature matrix.
y = df['Label']
X = df.drop(columns=['Label'])

# 70/30 split, stratified on the target so both parts keep the same class
# mix; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Persist the feature column names in their training-time order.
feature_names = list(X.columns)
joblib.dump(feature_names, 'features.pkl')

['features.pkl']

In [41]:
# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [42]:
# Apply SMOTE for class balancing
smote = SMOTE(sampling_strategy=0.3, random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

In [43]:
# Convert labels to categorical
y_train_bal = to_categorical(y_train_bal)
y_test = to_categorical(y_test)

In [44]:
# Reshape for CNN
X_train_reshaped = X_train_bal.reshape((X_train_bal.shape[0], X_train_bal.shape[1], 1))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

In [45]:
def residual_block(x, filters):
    """Apply a two-convolution residual unit to a sequence tensor.

    The main path is Conv1D(k=3, 'same') -> BatchNormalization -> ReLU ->
    Conv1D(k=3, 'same') -> BatchNormalization. The block input is then added
    back and the sum is passed through a final ReLU.

    Args:
        x: Input tensor; its last axis is treated as the channel axis.
        filters: Number of output channels for both convolutions.

    Returns:
        The output tensor of the block, with ``filters`` channels.
    """
    main = Conv1D(filters, kernel_size=3, padding='same')(x)
    main = BatchNormalization()(main)
    main = ReLU()(main)

    main = Conv1D(filters, kernel_size=3, padding='same')(main)
    main = BatchNormalization()(main)

    # The element-wise add needs matching channel counts, so project the skip
    # path with a 1x1 convolution whenever the input width differs.
    skip = x
    if skip.shape[-1] != filters:
        skip = Conv1D(filters, kernel_size=1, padding='same')(skip)

    merged = Add()([main, skip])
    return ReLU()(merged)

In [46]:
# Build CNN Model
def build_cnn_model(hp):
    """Build and compile the residual 1D-CNN for one tuner trial.

    The input length is read from the module-level ``X_train_reshaped``.
    Tuned hyperparameters: ``filters_1``, ``num_res_blocks``,
    ``filters_res_{i}`` (one per residual block), ``dense_units`` and
    ``learning_rate``.

    Args:
        hp: Hyperparameter container supplied by the tuner; queried through
            ``hp.Int`` and ``hp.Choice``.

    Returns:
        A compiled model with a two-unit softmax output, trained with
        categorical cross-entropy and reporting accuracy.
    """
    n_features = X_train_reshaped.shape[1]
    inputs = Input(shape=(n_features, 1))

    # Stem: one plain convolution followed by normalisation and downsampling.
    stem_filters = hp.Int('filters_1', 64, 256, step=64)
    x = Conv1D(stem_filters, kernel_size=3, activation='relu')(inputs)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)

    # Between one and three residual blocks, each with its own tuned width.
    n_blocks = hp.Int('num_res_blocks', 1, 3)
    for i in range(n_blocks):
        block_filters = hp.Int(f'filters_res_{i}', 64, 256, step=64)
        x = residual_block(x, filters=block_filters)

    # Collapse the sequence axis before the classifier head.
    x = GlobalAveragePooling1D()(x)

    dense_units = hp.Int('dense_units', 64, 256, step=64)
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(0.5)(x)

    outputs = Dense(2, activation='softmax')(x)
    model = Model(inputs, outputs)

    learning_rate = hp.Choice('learning_rate', values=[0.001, 0.0001])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [47]:
# Tune model
print("Starting Hyperparameter Tuning with Pruning...")
tuner = kt.Hyperband(
    build_cnn_model,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    directory='cnn_tuner',
    project_name='CIC_IDS_Tuning',
    executions_per_trial=1
)

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
tuner.search(X_train_reshaped, y_train_bal, epochs=20, validation_split=0.2, callbacks=[early_stop])

Starting Hyperparameter Tuning with Pruning...
Reloading Tuner from cnn_tuner\CIC_IDS_Tuning\tuner0.json


In [48]:
# Get Best Model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_cnn_model = tuner.hypermodel.build(best_hps)

In [49]:
# Train the best model
history = best_cnn_model.fit(X_train_reshaped, y_train_bal, epochs=20, batch_size=64, validation_split=0.3, callbacks=[early_stop], verbose=1)

Epoch 1/20
[1m13945/13945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 10ms/step - accuracy: 0.9651 - loss: 0.0927 - val_accuracy: 0.9754 - val_loss: 0.0706
Epoch 2/20
[1m13945/13945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 10ms/step - accuracy: 0.9787 - loss: 0.0529 - val_accuracy: 0.9676 - val_loss: 0.1069
Epoch 3/20
[1m13945/13945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 10ms/step - accuracy: 0.9801 - loss: 0.0490 - val_accuracy: 0.9182 - val_loss: 0.1506
Epoch 4/20
[1m13945/13945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 11ms/step - accuracy: 0.9810 - loss: 0.0466 - val_accuracy: 0.9741 - val_loss: 0.0650
Epoch 5/20
[1m13945/13945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 11ms/step - accuracy: 0.9814 - loss: 0.0444 - val_accuracy: 0.7607 - val_loss: 0.4598
Epoch 6/20
[1m13945/13945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 10ms/step - accuracy: 0.9814 - loss: 0.0440 - val_accuracy: 0.9224 - val

In [50]:
# Evaluate
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the single compiled metric (accuracy).
loss, accuracy = best_cnn_model.evaluate(X_test_reshaped, y_test)
print('Test Accuracy: {:.4f}'.format(accuracy))

[1m15864/15864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2ms/step - accuracy: 0.9871 - loss: 0.0367
Test Accuracy: 0.9871


In [51]:
# Predict on the held-out test split and break the errors down by type
y_pred = best_cnn_model.predict(X_test_reshaped)

# Convert model probabilities / one-hot targets to class labels
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# False Negative: Attack (1) misclassified as Benign (0)
false_negatives = (y_pred_labels == 0) & (y_test_labels == 1)

# False Positive: Benign (0) misclassified as Attack (1)
false_positives = (y_pred_labels == 1) & (y_test_labels == 0)

# Print summary. The masks are counted with the vectorised ndarray.sum()
# rather than the builtin sum(), which would walk the array element by
# element in Python.
print(f"False Negatives (Attacks misclassified as Benign): {int(false_negatives.sum())}")
print(f"False Positives (Benign misclassified as Attacks): {int(false_positives.sum())}")

# Show a few misclassified samples (positions within the test split)
misclassified_fn = np.where(false_negatives)[0][:5]  # First 5 false negatives
misclassified_fp = np.where(false_positives)[0][:5]  # First 5 false positives

print("\nFalse Negatives (Missed Attacks) - Example Indices:", misclassified_fn)
print("False Positives (Incorrectly Flagged Benign) - Example Indices:", misclassified_fp)

# Rows of the confusion matrix are true classes, columns are predictions.
print(confusion_matrix(y_test_labels, y_pred_labels))
print(classification_report(y_test_labels, y_pred_labels))

[1m15864/15864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step
False Negatives (Attacks misclassified as Benign): 1235
False Positives (Benign misclassified as Attacks): 5297

False Negatives (Missed Attacks) - Example Indices: [ 164  418 1180 1309 1456]
False Positives (Incorrectly Flagged Benign) - Example Indices: [ 27 133 152 295 374]
[[415001   5297]
 [  1235  86107]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    420298
           1       0.94      0.99      0.96     87342

    accuracy                           0.99    507640
   macro avg       0.97      0.99      0.98    507640
weighted avg       0.99      0.99      0.99    507640



In [52]:
# Save Model
# Write the trained network twice, once under each file extension, so either
# file can be loaded later.
for model_path in ('model.h5', 'model.keras'):
    best_cnn_model.save(model_path)

