# Importing Required Libraries & DataSource


In [None]:
pip install tensorflow==2.15.0  # for SHAP. Restart the kernel after running this cell

In [None]:
# Libraries for Data Manipulation
import pandas as pd
import numpy as np

# import gc

# Libraries for Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# import altair as alt
# from scipy.stats import skew
# Plot theme for every figure in the notebook.
# NOTE(review): seaborn's set() presumably re-applies its defaults for any theme
# argument that is not passed, so the style/font_scale of the first call may be
# overridden by the second call and by set_context below — confirm which
# settings are actually intended.
sns.set(style="white", font_scale=1.5)
sns.set(rc={"axes.facecolor":"#FFFAF0", "figure.facecolor":"#FFFAF0"})
sns.set_context("poster", font_scale=.7)
# import matplotlib.ticker as ticker

# Libraries to Handle Warnings
import warnings
warnings.filterwarnings('ignore')

# Libraries for Statistical Analysis
# from scipy import stats
# from scipy.stats import chi2, chi2_contingency

# Setting Display Options
pd.set_option("display.max.columns", None)

# Machine Learning Algorithms
# from sklearn.utils.class_weight import compute_class_weight
# from sklearn.feature_selection import mutual_info_regression
# from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# import joblib
import os
import random
import shap

import tensorflow as tf
# from tensorflow.keras.initializers import HeNormal
# from tensorflow.keras.regularizers import l1,l2
# import keras_tuner
import keras

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
# from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Input, ReLU, LeakyReLU, Concatenate, Dense, Dropout, Layer
from tensorflow.keras.layers import BatchNormalization, Activation, Add, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import F1Score
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.initializers import GlorotUniform


In [None]:
# Report which TensorFlow version the kernel actually loaded
tf_version = tf.__version__
print(tf_version)

In [None]:
# for reproducibility
def reset_random_seeds(seed=42):
    """Seed every random number generator used in this notebook.

    ``tf.keras.utils.set_random_seed`` sets the Python seed, the NumPy seed and
    the TensorFlow seed in a single call (see the TF docs), so they are not
    seeded separately here.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    # Ask TensorFlow / cuDNN for deterministic kernels before any seeding happens.
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only affects subprocesses spawned from this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

    tf.keras.utils.set_random_seed(seed)


reset_random_seeds()

## 1. Load and Inspect Dataset


In [None]:
# Edge-IIoTset CSV as mounted under the Kaggle input directory
DATASET_CSV = (
    "/kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot/"
    "Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv"
)

data_set = pd.read_csv(DATASET_CSV)
data_set.head()

## 2. Remove specific columns for generalization 


In [None]:
# Columns removed so the model generalizes
col_drop = [
    "frame.time", "ip.src_host", "ip.dst_host",
    "arp.src.proto_ipv4", "arp.dst.proto_ipv4",
    "http.file_data", "http.request.full_uri", "icmp.transmit_timestamp",
    "http.request.uri.query", "tcp.options", "tcp.payload", "tcp.srcport",
    "tcp.dstport", "udp.port", "mqtt.msg",
]
data_set = data_set.drop(columns=col_drop)

# Give the binary target readable names: 0 -> 'Normal', 1 -> 'Attack'
binary_label_names = {0: 'Normal', 1: 'Attack'}
data_set['Attack_label'] = data_set['Attack_label'].replace(binary_label_names)

# Choose target column
target_column = 'Attack_label'  # Multiclass: Attack_type | Binary: Attack_label

# Class names in value_counts() order; reused later as the one-hot column order
class_counts = data_set[target_column].value_counts()
unique_values = class_counts.index.to_numpy()
print(unique_values)
data_set

In [None]:
# Columns converted to integer category codes
categorical_columns = (
    'http.request.method', 'http.referer', 'http.request.version',
    'dns.qry.name.len', 'mqtt.conack.flags', 'mqtt.protoname', 'mqtt.topic',
)
for col in categorical_columns:
    as_category = data_set[col].astype('category')
    data_set[col] = as_category.cat.codes

data_set

# 2 | Data Analysis and Basic preprocessing 


## 3. **Checking Whether There Are Any Duplicate Records**

In [None]:
# Number of rows that exactly repeat an earlier row
duplicate_rows = data_set.duplicated().sum()
print("Duplicates in dataset: ", duplicate_rows)

In [None]:
# Remove the repeated rows, then re-count to confirm none are left
data_set = data_set.drop_duplicates()
remaining_duplicates = data_set.duplicated().sum()
print("Duplicates in dataset: ", remaining_duplicates)

## 4. **Computing Total No. of Missing Values and the Percentage of Missing Values**

In [None]:
# Per-column count of missing values and their share of all rows (in %)
missing_counts = data_set.isnull().sum()
missing_data = pd.DataFrame({"Total No. of Missing Values": missing_counts})
missing_share = (missing_data["Total No. of Missing Values"] / len(data_set)) * 100
missing_data["% of Missing Values"] = missing_share.round(2)
missing_data

# 3 | Exploratory Data Analysis (EDA)

## **3. Visualising Class Distribution**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def pie_bar_plot(df, col):
    """Draw a donut chart of the class distribution of ``df[col]``.

    The legend lists each class with its count and percentage share. Labels
    are taken from the column's own value counts, so every label is paired
    with its own count for any column and at any stage of preprocessing.
    The figure is saved to 'Class distribution.png' and then shown.

    Args:
        df: DataFrame containing the column to summarize.
        col: Name of the column whose distinct values are counted.
    """
    # Extract value counts for the specified column
    value_counts = df[col].value_counts()
    total = value_counts.sum()

    fig, ax = plt.subplots(figsize=(9, 9))
    ax.set_title(f"Distribution by {col}", fontweight="black", size=14, pad=15)

    colors = sns.color_palette('Set1', len(value_counts))
    wedges = ax.pie(value_counts.values, labels=None, startangle=90, colors=colors)[0]

    # A white disc over the middle turns the pie into a donut
    ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))

    # Create a legend with labels and values
    legend_labels = [
        f"{label}: {value} ({round(100 * value / total, 2)}%)"
        for label, value in value_counts.items()
    ]
    ax.legend(wedges, legend_labels, loc="lower right", fontsize=8)

    fig.savefig('Class distribution.png')
    plt.show()


pie_bar_plot(data_set, target_column)

In [None]:
# Print class distribution before resampling.
# Uses the configured target column rather than a column name that this
# notebook never creates or references anywhere else.
print("Before resampling:", data_set[target_column].value_counts())

# 4 | Preprocessing


## **4. Splitting the Data into Independent Features (x) and the Dependent Target (y)**


In [None]:
# Features: every column except the two label columns
label_columns = ['Attack_label', 'Attack_type']
x = data_set.drop(columns=label_columns)

# Target: whichever label column was selected through target_column
y = data_set[target_column]

# Get feature names
feature_names = x.columns.tolist()

y

## **2. Target feature Encoding**

In [None]:
reset_random_seeds()

label_encoder = LabelEncoder()

# One-hot encoder whose output columns follow the order of unique_values.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# keyword, so try the new name first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, categories=[unique_values])
except TypeError:
    ohe = OneHotEncoder(sparse=False, categories=[unique_values])

used_encoder = ohe

In [None]:
reset_random_seeds()

# Fit on the target column and one-hot encode it in one step
target_frame = data_set[[target_column]]
encoded = used_encoder.fit_transform(target_frame)

# Record the custom class order on the encoder so later cells can read it back
used_encoder.classes_ = np.array(unique_values)  # Custom order

# One integer 0/1 column per class, aligned with the rows of data_set
y = pd.DataFrame(
    encoded,
    columns=unique_values,
    index=data_set.index,
    dtype='int',
)

y

In [None]:
# Display the class order that was attached to the encoder in the previous cell
used_encoder.classes_

In [None]:
# How many rows carry each one-hot label combination (whole dataset)
pf = y.value_counts()
pdd = pf.to_frame()
pdd

## Splitting into Train, Validation, and Test Sets


In [None]:
reset_random_seeds()

# 70% of the rows go to training; the remaining 30% are held out ...
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=0.3, random_state=42
)
# ... and the held-out part is cut in half into test and validation sets
x_test, x_valid, y_test, y_valid = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

for split_name, split_x, split_y in (
    ('Train:', x_train, y_train),
    ('Validation:', x_valid, y_valid),
    ('Test:', x_test, y_test),
):
    print(split_name, split_x.shape, split_y.shape)

In [None]:
# How many rows carry each one-hot label combination (training split only)
pf = y_train.value_counts()
pdd = pf.to_frame()
pdd

## **7. Feature Scaling**


In [None]:
reset_random_seeds()

# scaler = StandardScaler()
scaler = MinMaxScaler()

# The scaler is fitted on the training split only; validation and test are
# transformed with the fitted scaler.
scaler.fit(x_train)
x_train_scaled, x_valid_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_valid, x_test)
)


## Feature Selection

In [None]:
# Feature Selection without normalization
# x_train=x_train.to_numpy()
# x_valid=x_valid.to_numpy()
# x_test=x_test.to_numpy()

# selected_ids=[8,32,34,35,37,38,39,40]
# x_train_selected=x_train[:,selected_ids]
# x_valid_selected=x_valid[:,selected_ids]
# x_test_selected=x_test[:,selected_ids]

In [None]:
# Feature Selection with normalization
# Column positions to keep; exactly one `selected_ids` line should be active.
#
# Multilabel Classification
# selected_ids=[7,8,16,18,19,22,31,34,39,40]  # 10 features based on histogram (frequency of repetition)
# selected_ids=[7,8,19,22,31,34,39,40] # 8 features
#
# Binary Classification
# selected_ids=[8,32,34,35,37,38,39,40] # 8 features
selected_ids = [6, 8, 15, 32, 34, 35, 37, 38, 39, 40]  # 10 features

x_train_selected, x_valid_selected, x_test_selected = (
    scaled_split[:, selected_ids]
    for scaled_split in (x_train_scaled, x_valid_scaled, x_test_scaled)
)

In [None]:
# Choose which version of the features feeds the model by keeping exactly one
# of the assignments below active.

# Option 1 - raw (unscaled) features: leave x_train / x_valid / x_test untouched.

# Option 2 - all features, scaled (active)
x_train, x_valid, x_test = x_train_scaled, x_valid_scaled, x_test_scaled

# Option 3 - scaled and reduced to the selected feature columns
# x_train, x_valid, x_test = x_train_selected, x_valid_selected, x_test_selected

# Deep Learning Modeling

In [None]:
# VGG1D
reset_random_seeds()

# Each sample is fed as a sequence of features with a single channel;
# the output layer has one unit per one-hot label column.
input_shape = (x_train.shape[1], 1)
num_classes = y_train.shape[1]


def create_model(input_shape=input_shape, num_classes=num_classes,
                 learning_rate=0.001, seed=42):
    """Build and compile the VGG-style 1D CNN classifier.

    The network stacks three blocks of two Conv1D layers followed by max
    pooling (64, 128 and 256 filters), then two Dense(512) + Dropout(0.5)
    layers and a softmax output. A fourth 512-filter block was tried
    previously and is intentionally left out.

    Defined as a function so that the cross-validation section can obtain a
    freshly initialised model for every fold by calling ``create_model()``.

    Args:
        input_shape: Shape of one sample, ``(n_features, 1)``. Defaults to the
            shape derived from ``x_train`` when this cell is run.
        num_classes: Number of output units. Defaults to the number of label
            columns in ``y_train`` when this cell is run.
        learning_rate: Learning rate of the Adam optimizer.
        seed: Seed given to every kernel initializer. By default, layers like
            Dense and Conv use random initializations; a seeded initializer
            makes the initial weights repeatable.

    Returns:
        A compiled ``tf.keras`` Model.
    """
    def conv_block(tensor, filters):
        # Two same-padded 3-wide convolutions followed by 2x down-sampling
        for _ in range(2):
            tensor = Conv1D(filters=filters, kernel_size=3, padding='same',
                            activation='relu',
                            kernel_initializer=GlorotUniform(seed=seed))(tensor)
        return MaxPooling1D(pool_size=2)(tensor)

    inputs = Input(shape=input_shape)

    # Convolutional feature extractor
    xx = inputs
    for filters in (64, 128, 256):
        xx = conv_block(xx, filters)

    # Flatten and Fully Connected Layers
    xx = Flatten()(xx)
    for _ in range(2):
        xx = Dense(512, activation='relu',
                   kernel_initializer=GlorotUniform(seed=seed))(xx)
        xx = Dropout(0.5)(xx)
    outputs = Dense(num_classes, activation='softmax',
                    kernel_initializer=GlorotUniform(seed=seed))(xx)  # softmax  sigmoid

    # Compile the model
    built_model = Model(inputs, outputs)
    built_model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',  # binary_crossentropy   categorical_crossentropy
        metrics=['accuracy', 'Precision', 'Recall', F1Score(average='weighted')],
        jit_compile=False,
    )
    return built_model


# Initialize the model
model = create_model()

In [None]:
# Render the layer graph with tensor shapes and layer names
tf.keras.utils.plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
)

In [None]:
# For tensorflow 2.15.0: the one-hot labels are cast to float32 before training
y_train, y_valid, y_test = (
    labels.astype('float32') for labels in (y_train, y_valid, y_test)
)


In [None]:
# Step 3: Define callbacks
reset_random_seeds()

# 1. Save the best model during training (highest validation F1 score)
checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_f1_score',  # val_accuracy
    mode='max',
    save_best_only=True,
    verbose=1,
)

# 2. Early stopping to avoid overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# 3. Reduce learning rate when validation loss plateaus
# NOTE(review): this uses the same monitor and patience as early stopping, so
# the learning-rate reduction may rarely take effect before training stops —
# confirm that this is intended.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)

training_callbacks = [checkpoint, early_stopping, reduce_lr]
fit_options = dict(
    epochs=20,
    batch_size=64,  # 32
    validation_data=(x_valid, y_valid),
    callbacks=training_callbacks,
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
# Step 5: Plot the accuracy and loss growth graph
curves = history.history
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: accuracy per epoch
accuracy_ax.plot(curves['accuracy'], label='Train Accuracy')
accuracy_ax.plot(curves['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Accuracy Growth over Epochs')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

# Right panel: loss per epoch
loss_ax.plot(curves['loss'], label='Train Loss')
loss_ax.plot(curves['val_loss'], label='Validation Loss')
loss_ax.set_title('Loss Growth over Epochs')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

fig.savefig('accuracy_loss.png')

# plt.show() is intentionally not called here

# Step 6: Evaluate on the test set
# Reload the checkpoint written by ModelCheckpoint during training
best_model = load_model('best_model.keras') #best_model.keras
model1 = best_model
#model1 = model

model1.evaluate(x_test, y_test, verbose=1)

# Step 7: Save the final model and training history
# model1.save('final_model.h5')  # Save the final model


# Step 8: Confusion Matrix
# Predict on the test set
y_pred = model1.predict(x_test)

# Convert one-hot / probability rows to class ids. y_test is a DataFrame, so it
# is converted to an ndarray first: the result is then always a plain array
# that later cells can index by row position.
Y_pred_classes = np.argmax(y_pred, axis=1)
Y_true_classes = np.argmax(np.asarray(y_test), axis=1)


baseline_score = accuracy_score(Y_true_classes, Y_pred_classes)

# Create the confusion matrix. Passing every class id keeps the matrix the same
# size as unique_values even when a class is absent from the test predictions.
class_ids = np.arange(len(unique_values))
cm = confusion_matrix(Y_true_classes, Y_pred_classes, labels=class_ids)

# Row-normalise (share of each true class); rows without samples stay at 0
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
cm_array_df = pd.DataFrame(cm_normalized, index=unique_values, columns=unique_values)

# Plot the confusion matrix
fig, ax = plt.subplots(figsize=(15,11))#figsize=(9,7) figsize=(15,11)
sns.heatmap(cm_array_df, annot=True, ax=ax ,cbar = True, fmt='0.2f', cmap='Blues')
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label',fontsize=14)
ax.set_xlabel('Predicted Label',fontsize=14)

# Save confusion matrix as PNG
plt.savefig('confusion_matrix.png')

plt.show()

In [None]:
# One-hot test labels with a fresh 0..n-1 index, so the displayed row numbers
# are positions in the test set.
aa = pd.DataFrame(y_test).reset_index(drop=True)
# pf=aa.value_counts()
# pdd=pd.DataFrame(pf)
# pdd

# Show the test rows that belong to one class. 'Password' is only a column when
# the multiclass target is used; otherwise fall back to the last class column
# so the cell also runs with the binary target.
class_to_inspect = 'Password' if 'Password' in aa.columns else aa.columns[-1]
aa.loc[aa[class_to_inspect] == 1]


## SHAP

In [None]:
reset_random_seeds()

# Background dataset: the first 100 training rows, with a trailing channel axis
# added, i.e. (100, n_features) -> (100, n_features, 1)
background = np.expand_dims(x_train[:100], axis=-1)

# Create a SHAP explainer
explainer = shap.DeepExplainer(model1, background)
# explainer = shap.Explainer(model1, background)
# explainer = shap.Explainer(model1, background)


In [None]:
# Select a single sample to explain

class_id = 1  # from 0 to num_classes-1

# Number of classes follows the one-hot label columns, so the same cell works
# for the binary and the multiclass target.
num_classes = y_test.shape[1]
if not 0 <= class_id < num_classes:
    raise ValueError(f"class_id must be between 0 and {num_classes - 1}, got {class_id}")

# Explain the first test row whose true class is class_id. Looking the row up
# (instead of hard-coding a row number per class) keeps the choice valid when
# the split, the random seed or the target column changes.
true_class_per_row = np.argmax(np.asarray(y_test), axis=1)
candidate_rows = np.flatnonzero(true_class_per_row == class_id)
if candidate_rows.size == 0:
    raise ValueError(f"No test sample with true class {class_id} to explain")
sample_index = int(candidate_rows[0])

# Keep the batch axis (slice, not index) and append the channel axis
sample_to_explain = x_test[sample_index:sample_index + 1]
sample_to_explain = sample_to_explain[..., np.newaxis]
# Compute SHAP values
shap_values = explainer.shap_values(sample_to_explain)

In [None]:
reset_random_seeds()

# SHAP values of the explained sample for the class selected by class_id
data = shap_values[class_id][0]

N = 10

# Rank feature positions from the largest to the smallest SHAP value
# (a stable sort, so equal values keep their original order)
flat_values = np.asarray(data).reshape(len(data), -1)[:, 0]
sorted_indices = np.argsort(-flat_values, kind='stable').tolist()

# Select the first N indices
indices = sorted_indices[:N]

top_names = [feature_names[i] for i in indices]
top_values = [data[i].item() for i in indices]

print("Indices of N largest values:", indices)
print("Name of N largest values:", top_names)
print("Values of N largest values:", top_values)


In [None]:
# True vs. predicted class id of the explained sample
print("True Label")
print(Y_true_classes[sample_index])
print("Predicted Label")
print(Y_pred_classes[sample_index])


# One curve per class: SHAP value of every feature for the explained sample
fig, ax = plt.subplots(figsize=(14, 10))

for i in range(num_classes):  # num_classes
    ax.plot(shap_values[i][0], label=unique_values[i])

tick_positions = np.arange(len(feature_names))
ax.set_xticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=90)
ax.set_xlabel("Features")
ax.set_ylabel("SHAP Values")
ax.legend(loc='best') #lower right

fig.savefig('SHAP_plot.png')
plt.show()

In [None]:
# True vs. predicted class id of the explained sample
print("True Label")
print(Y_true_classes[sample_index])
print("Predicted Label")
print(Y_pred_classes[sample_index])

# A separate figure per class with that class's SHAP values
for i in range(num_classes):
    fig, ax = plt.subplots(figsize=(12, 4))
    # ax.plot(sample_to_explain[0], label="Input Signal", alpha=0.6)
    ax.plot(shap_values[i][0], label="SHAP Values", alpha=0.8)
    ax.set_title(f"Class:{unique_values[i]}")
    ax.set_xlabel("Feature")
    ax.set_ylabel("Value")
    ax.legend()
    plt.show()

## Cross validation

In [None]:
#define a function to fit the model
def fit_and_evaluate(tr_x, ts_x, tr_y, ts_y, epochs=20, batch_size=64,
                     checkpoint_path='best_model.keras'):
    """Train a fresh model on one fold and evaluate its best checkpoint.

    New callback objects are created on every call. A ModelCheckpoint that is
    reused across ``fit`` calls keeps the best score of the previous run, so a
    later fold might never write its checkpoint; fresh callbacks avoid that.
    The callback settings mirror the main training run.

    Args:
        tr_x: Training features of the fold.
        ts_x: Held-out features of the fold.
        tr_y: One-hot training labels.
        ts_y: One-hot held-out labels.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.
        checkpoint_path: File the best model of this fold is written to; any
            existing file at this path is deleted first.

    Returns:
        The Keras History object returned by ``model.fit``.
    """
    # Remove a stale checkpoint so the model loaded below belongs to this fold
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

    fold_callbacks = [
        ModelCheckpoint(filepath=checkpoint_path, monitor='val_f1_score',
                        save_best_only=True, verbose=1, mode='max'),
        EarlyStopping(monitor='val_loss', patience=5,
                      restore_best_weights=True, verbose=1),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5,
                          min_lr=0.00001, verbose=1),
    ]

    # create_model() must already be defined in the notebook and return a compiled model
    model = create_model()
    results = model.fit(tr_x, tr_y,
                        epochs=epochs,
                        batch_size=batch_size, #32
                        validation_split=0.2,
                        callbacks=fold_callbacks,
                        verbose=1)
    print("Val Score: ")
    model = load_model(checkpoint_path)
    model.evaluate(ts_x, ts_y, verbose=1)
    return results


n_folds=5
epochs=20
batch_size=64

#save the model history in a list after fitting so that we can plot later
model_history = []
reset_random_seeds()
y1=y.astype('float32')
# Each "fold" is an independent random 85/15 hold-out split, not a K-fold partition
for i in range(n_folds):
    print("Training on Fold: ",i+1)
    random_state=np.random.randint(i,1000)
    print("Random state: ",random_state)
    tr_x_raw, ts_x_raw, tr_y, ts_y = train_test_split(x, y1, test_size=0.15,
                                                       random_state = random_state)
    # Fit the scaler on the fold's training rows only, so the held-out rows do
    # not influence the scaling they are evaluated with
    fold_scaler = MinMaxScaler()
    tr_x = fold_scaler.fit_transform(tr_x_raw)
    ts_x = fold_scaler.transform(ts_x_raw)
    model_history.append(fit_and_evaluate(tr_x, ts_x, tr_y, ts_y,
                                          epochs=epochs, batch_size=batch_size))
    print("======="*12, end="\n\n\n")

In [None]:
# plt.title('Accuracies vs Epochs')
# One colour per fold: solid line = training accuracy, dash-dot = validation accuracy
fold_colors = ['blue', 'darkorange', 'green', 'violet', 'red']
for fold_id, fold_color in enumerate(fold_colors):
    fold_curves = model_history[fold_id].history
    fold_number = fold_id + 1
    plt.plot(fold_curves['accuracy'],
             label=f'Training Fold {fold_number}', color=fold_color)
    plt.plot(fold_curves['val_accuracy'],
             label=f'Validation Fold {fold_number}', color=fold_color,
             linestyle="dashdot")


plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(fontsize=13)
plt.savefig('cross_validation.png')
plt.show()