In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
import pickle
from sklearn.preprocessing import  LabelEncoder

In [None]:
DATASET_DIRECTORY = '/kaggle/input/cic-iot-2023'
# Every shard shares one file name apart from a zero-padded 5-digit part index.
PART_FILENAME = 'part-{index:05d}-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'
TRAIN_PART_INDICES = range(0, 20)  # shards collected in df_sets
TEST_PART_INDEX = 30               # shard loaded into df_test


def _read_part(index):
    """Read the dataset shard with the given part index into a DataFrame."""
    return pd.read_csv(os.path.join(DATASET_DIRECTORY, PART_FILENAME.format(index=index)))


# One DataFrame per shard, in part-index order.
df_sets = [_read_part(i) for i in TRAIN_PART_INDICES]
# Read once, outside the loop (it used to be re-read on every iteration).
# NOTE(review): df_test is not referenced by the later cells visible here.
df_test = _read_part(TEST_PART_INDEX)


In [None]:
# Feature columns fed to the scaler and the models, listed in a fixed order.
# The names are copied verbatim (including odd spellings such as 'Magnitue'),
# because they are used as DataFrame column keys.
X_columns = (
    # flow-level measurements
    ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate']
    # flag indicators
    + ['fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
       'ack_flag_number', 'ece_flag_number', 'cwr_flag_number']
    # flag counters
    + ['ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count']
    # protocol indicators
    + ['HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC']
    # aggregate statistics
    + ['Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight']
)
# Name of the target column.
y_column = 'label'

In [None]:
# MinMaxScaler is imported alongside StandardScaler but is not referenced in the cells visible here.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Feature scaler shared by the cells below: it is fitted on X_columns and then
# used to transform every loaded shard.
scaler = StandardScaler()

In [None]:
# Train the scaler on every shard.
# `fit` discards any previous fit, so calling it in a loop leaves the scaler
# fitted on the last shard only; `partial_fit` accumulates the statistics
# across all shards instead.
# The scaler is re-created first so that re-running this cell starts from
# scratch rather than adding the same shards a second time.
scaler = StandardScaler()
for part in tqdm(df_sets):  # `part`, not `set`, to avoid shadowing the builtin
    scaler.partial_fit(part[X_columns])

In [None]:
# Lookup table from each fine-grained traffic label to its coarse family
# (seven attack families plus 'Benign'). Families and their members are listed
# in the order in which the entries are inserted into the dictionary.
dict_7classes = {
    fine_label: family
    for family, fine_labels in (
        ('DDoS', (
            'DDoS-RSTFINFlood',
            'DDoS-PSHACK_Flood',
            'DDoS-SYN_Flood',
            'DDoS-UDP_Flood',
            'DDoS-TCP_Flood',
            'DDoS-ICMP_Flood',
            'DDoS-SynonymousIP_Flood',
            'DDoS-ACK_Fragmentation',
            'DDoS-UDP_Fragmentation',
            'DDoS-ICMP_Fragmentation',
            'DDoS-SlowLoris',
            'DDoS-HTTP_Flood',
        )),
        ('DoS', (
            'DoS-UDP_Flood',
            'DoS-SYN_Flood',
            'DoS-TCP_Flood',
            'DoS-HTTP_Flood',
        )),
        ('Mirai', (
            'Mirai-greeth_flood',
            'Mirai-greip_flood',
            'Mirai-udpplain',
        )),
        ('Recon', (
            'Recon-PingSweep',
            'Recon-OSScan',
            'Recon-PortScan',
            'VulnerabilityScan',
            'Recon-HostDiscovery',
        )),
        ('Spoofing', (
            'DNS_Spoofing',
            'MITM-ArpSpoofing',
        )),
        ('Benign', (
            'BenignTraffic',
        )),
        ('Web', (
            'BrowserHijacking',
            'Backdoor_Malware',
            'XSS',
            'Uploading_Attack',
            'SqlInjection',
            'CommandInjection',
        )),
        ('BruteForce', (
            'DictionaryBruteForce',
        )),
    )
    for fine_label in fine_labels
}

In [None]:
# Scale the feature columns of every shard and collapse its fine-grained labels
# into the coarse families of dict_7classes.
# Each shard is copied before it is modified, so df_sets keeps the raw values
# and this cell can be re-run safely. Mutating the shards in place would scale
# the features a second time and fail on the already-mapped labels.
# The copies cost extra memory; if that is a problem, df_sets can be deleted
# once pd_list has been built.
pd_list = []
for part in tqdm(df_sets):  # `part`, not `set`, to avoid shadowing the builtin
    processed = part.copy()
    processed[X_columns] = scaler.transform(part[X_columns])
    mapped_labels = part[y_column].map(dict_7classes)
    # Series.map yields NaN for labels missing from the dict; fail loudly
    # instead of letting unlabeled rows slip through.
    if mapped_labels.isna().any():
        unknown = sorted(part.loc[mapped_labels.isna(), y_column].astype(str).unique())
        raise KeyError(f'labels missing from dict_7classes: {unknown}')
    processed[y_column] = mapped_labels
    pd_list.append(processed)

In [None]:
# Stack all processed shards row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the
# per-shard indices are kept as-is, so shards that share index values produce
# repeated row labels, which makes label-based selection and alignment ambiguous.
ciciot_data = pd.concat(pd_list, axis=0, ignore_index=True)

In [None]:
# Report the number of exact duplicate rows. A bare expression in the middle
# of a cell is never displayed, so the count is printed explicitly.
n_duplicates = ciciot_data.duplicated().sum()
print(f'Duplicate rows: {n_duplicates}')
# removing duplicates
ciciot_data = ciciot_data.drop_duplicates()
# remove Web and BruteForce (one pass over the frame instead of two)
ciciot_data = ciciot_data[~ciciot_data[y_column].isin(['Web', 'BruteForce'])]

In [None]:
# Bar chart of the number of rows per label in the filtered dataset.
count_fig, count_ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=ciciot_data, x='label', ax=count_ax)
plt.xticks(rotation=45)  # tilt the class names on the x axis
plt.show()

In [None]:
# balance the dataset: draw the same number of rows from every class
from sklearn.utils import resample  # not used by this cell; import kept so nothing relying on it breaks
SAMPLES_PER_CLASS = 30000
balanced_parts = []
# groupby(sort=False) visits the classes in order of first appearance and
# selects each class's rows once, instead of re-masking the whole frame
# several times per label.
for _, class_rows in ciciot_data.groupby(y_column, sort=False):
    # Sample with replacement only when the class has fewer rows than the
    # target; a class with exactly SAMPLES_PER_CLASS rows is kept as is.
    oversample = len(class_rows) < SAMPLES_PER_CLASS
    balanced_parts.append(class_rows.sample(SAMPLES_PER_CLASS, replace=oversample))
# Concatenate once at the end: growing a frame inside the loop re-copies all
# earlier rows on each iteration.
if balanced_parts:
    ciciot_data_balanced = pd.concat(balanced_parts, axis=0)
else:
    # No classes at all: keep the columns but no rows.
    ciciot_data_balanced = ciciot_data.iloc[:0].copy()

In [None]:
# Bar chart of the number of rows per label after balancing.
balanced_fig, balanced_ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=ciciot_data_balanced, x='label', ax=balanced_ax)
plt.xticks(rotation=45)  # tilt the class names on the x axis
plt.show()

In [None]:
# correlation matrix without showing the values inside the cells
# Only features whose column sum is non-zero are included.
to_show = [x for x in X_columns if ciciot_data[x].sum() != 0]
corrmat = ciciot_data[to_show].corr()
top_corr_features = corrmat.index  # kept for compatibility; same labels as to_show
plt.figure(figsize=(20,20))
plt.title('Correlation Matrix')
# Reuse corrmat rather than computing the same correlation a second time, and
# draw the heatmap before calling plt.show(): plt.show() does not take the
# Axes as an argument.
sns.heatmap(corrmat, annot=False, cmap="YlGnBu")
plt.show()


In [None]:

from sklearn.model_selection import train_test_split

# 70/30 train/test split of the scaled features and coarse labels.
# - random_state pins the shuffle so the split, and every metric computed from
#   it, is reproducible across kernel restarts.
# - stratify keeps the class proportions the same in both splits.
# NOTE(review): the split is taken from ciciot_data; ciciot_data_balanced built
# earlier is not used by any cell visible here. Confirm which frame is
# intended. If the balanced one is used, oversampled duplicates can land on
# both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    ciciot_data[X_columns],
    ciciot_data[y_column],
    test_size=0.3,
    stratify=ciciot_data[y_column],
    random_state=42,
)

In [None]:
def evaluate_model(model,title, X_test, y_test):
    """Print accuracy and a classification report for a fitted classifier and plot its confusion matrix.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``classes_``.
    title : str
        Name used as a prefix in the printed output and in the plot title.
    X_test : features to predict on.
    y_test : true labels matching ``X_test``.

    Returns
    -------
    None. Results are printed and shown as a figure.
    """
    # Predict once and reuse the result. Calling model.score() as well would
    # run a second full prediction pass; for scikit-learn classifiers, score()
    # is the same accuracy as computed here.
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f'{title} Accuracy: {score}')
    print(f'{title} Classification Report: \n', classification_report(y_test, y_pred))
    # Pin rows/columns to model.classes_ so the matrix order matches the tick labels.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    cm = pd.DataFrame(cm, index=model.classes_, columns=model.classes_)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, cmap='coolwarm', fmt='d')
    plt.title(f'{title} Confusion Matrix')
    plt.show()

In [None]:
# SVM model: RBF-kernel support vector classifier fitted on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
svm_model = SVC(C=1, gamma='scale', kernel='rbf').fit(X_train, y_train)
# report accuracy, per-class metrics and the confusion matrix on the test split
evaluate_model(model=svm_model, title='SVM', X_test=X_test, y_test=y_test)

In [None]:
# random forest model: 100 trees, each limited to depth 10
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(max_depth=10, n_estimators=100).fit(X_train, y_train)
# report accuracy, per-class metrics and the confusion matrix on the test split
evaluate_model(model=rf_model, title='Random Forest', X_test=X_test, y_test=y_test)

In [None]:
# Prepare inputs for the deep learning models: append a trailing axis of size 1,
# turning (rows, features) into (rows, features, 1).
X_train_reshaped = np.expand_dims(X_train.values, axis=-1)
X_test_reshaped = np.expand_dims(X_test.values, axis=-1)

# Encode the labels as integers; the encoder learns its classes from the
# training labels only and is then applied to both splits.
encoder = LabelEncoder().fit(y_train)
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)
# class index -> class name
encoder_mapping = dict(enumerate(encoder.classes_))
def evaluate_deep_model(model, title, X_test, y_test):
    """Print accuracy and a classification report for a trained network and plot its confusion matrix.

    Class names come from the module-level ``encoder_mapping``
    (class index -> class name).

    Parameters
    ----------
    model : object exposing ``predict(X_test)``; the result is treated as
        per-class scores when it has more than one column, otherwise as
        class indices.
    title : str
        Name used as a prefix in the printed output and in the plot title.
    X_test : inputs passed to ``model.predict``.
    y_test : true labels, either class indices or one-hot rows.

    Returns
    -------
    None. Results are printed and shown as a figure.
    """
    # Fix the class order once. Passing it explicitly below keeps the report
    # rows and matrix axes aligned with the names even when a class never
    # appears in y_test or in the predictions.
    class_ids = list(encoder_mapping.keys())
    class_names = list(encoder_mapping.values())

    # Predict the labels for the test set
    y_pred = np.asarray(model.predict(X_test))
    y_test = np.asarray(y_test)

    # Check if y_test is one-hot encoded and convert if necessary
    if y_test.ndim > 1 and y_test.shape[-1] > 1:
        y_test = np.argmax(y_test, axis=1)

    # Convert per-class scores to class indices. The ndim guard lets
    # predictions that are already a 1-D vector of class indices pass through,
    # instead of failing on argmax(axis=1).
    if y_pred.ndim > 1 and y_pred.shape[-1] > 1:
        y_pred = np.argmax(y_pred, axis=1)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{title} Accuracy: {accuracy}')

    # Generate and print classification report
    cls_report = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)
    print(f'{title} Classification Report: \n{cls_report}')

    # Compute confusion matrix with one row/column per known class
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    # Plot confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'{title} Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

In [None]:
def VGGNet_1D(input_shape, num_classes):
    """Build and compile a VGG-style 1-D convolutional classifier.

    The network has three stages of two ``Conv1D`` layers (64, 128 and 256
    filters, kernel size 3, 'same' padding, ReLU), each followed by
    ``MaxPooling1D(2)``. These feed a flattened head of two ``Dense(256)`` +
    ``Dropout(0.5)`` pairs and a softmax output with ``num_classes`` units.

    Parameters
    ----------
    input_shape : shape passed as ``input_shape`` to the first Conv1D layer.
    num_classes : number of units in the softmax output layer.

    Returns
    -------
    The compiled ``Sequential`` model (Adam optimizer, sparse categorical
    cross-entropy loss, accuracy metric).
    """
    stack = []
    # convolutional stages
    for filters in (64, 128, 256):
        for _ in range(2):
            # only the very first layer of the network declares the input shape
            first_layer_kwargs = {} if stack else {'input_shape': input_shape}
            stack.append(Conv1D(filters, 3, padding='same', activation='relu', **first_layer_kwargs))
        stack.append(MaxPooling1D(2))
    # classifier head
    stack.append(Flatten())
    for _ in range(2):
        stack.extend([Dense(256, activation='relu'), Dropout(0.5)])
    stack.append(Dense(num_classes, activation='softmax'))

    model = Sequential(stack)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Model dimensions derived from the prepared data.
n_features = X_train_reshaped.shape[1]
input_shape = (n_features, 1)            # one channel per feature position
num_classes = len(encoder.classes_)      # one output unit per encoded class
# The test split is also used as validation data during training.
val_data = (X_test_reshaped, y_test_encoded)
vgg_model = VGGNet_1D(input_shape=input_shape, num_classes=num_classes)

In [None]:
# Fit the VGG-style network for 10 epochs in mini-batches of 64, validating on
# val_data after each epoch, then report its metrics on the test split.
history = vgg_model.fit(
    x=X_train_reshaped,
    y=y_train_encoded,
    batch_size=64,
    epochs=10,
    validation_data=val_data,
)
evaluate_deep_model(
    model=vgg_model,
    title='VGGNet',
    X_test=X_test_reshaped,
    y_test=y_test_encoded,
)

In [None]:
def DNN(input_shape, num_classes):
    """Build and compile a fully connected classifier.

    Parameters
    ----------
    input_shape : shape passed as ``input_shape`` to the first Dense layer.
    num_classes : number of units in the softmax output layer.

    Returns
    -------
    The compiled ``Sequential`` model (Adam optimizer, sparse categorical
    cross-entropy loss, accuracy metric).
    """
    # NOTE(review): Flatten sits after the hidden Dense layers. With an input
    # of shape (features, 1), the Dense layers then act on the last axis (size
    # 1) at every feature position, not on the whole feature vector. If a
    # plain MLP over all features is intended, Flatten probably belongs first.
    # Left unchanged here; please confirm.
    model = Sequential([
        Dense(256, activation='relu', input_shape=input_shape),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Flatten(),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# Model dimensions derived from the prepared data.
# `input_shape` used to be indented under the function's `return`, where it
# could never run, so this cell silently depended on a value left over from an
# earlier cell. It is now set here at top level.
input_shape = (X_train_reshaped.shape[1], 1)
num_classes = len(encoder.classes_)
# The test split is also used as validation data during training.
val_data = (X_test_reshaped, y_test_encoded)
dnn_model = DNN(input_shape, num_classes)
dnn_model.summary()

In [None]:
# Fit the dense network for 10 epochs in mini-batches of 64, validating on
# val_data after each epoch, then report its metrics on the test split.
history = dnn_model.fit(
    x=X_train_reshaped,
    y=y_train_encoded,
    batch_size=64,
    epochs=10,
    validation_data=val_data,
)
evaluate_deep_model(
    model=dnn_model,
    title='DNN',
    X_test=X_test_reshaped,
    y_test=y_test_encoded,
)