In [18]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

In [19]:
!ls CICIoMT2024

[34mWiFI_and_MQTT[m[m


In [20]:

# Define the main directory path
main_folder_path = "CICIoMT2024/WiFI_and_MQTT/"

In [21]:

# Function to load CSV files into DataFrames with labels
def load_and_label_csv(file_path, label):
    # print(f"Reading file: {file_path}")
    df = pd.read_csv(file_path)
    df["Label"] = label
    return df


# Function to traverse directories and load CSV files with labels
def load_csv_files_with_labels(folder_path, label_mapping):
    all_data = pd.DataFrame()
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                label = label_mapping.get(file)
                if label is not None:
                    print(f"Loading file: {file_path} with label: {label}")
                    df = load_and_label_csv(file_path, label)
                    all_data = pd.concat([all_data, df], ignore_index=True)
    return all_data




In [22]:

# Define label mappings for 19 classes
label_mapping = {}

# Benign
for file in os.listdir(os.path.join(main_folder_path, "profiling/CSV")):
    label_mapping[file] = 0

# Spoofing - ARP
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "Spoofing" in file:
        label_mapping[file] = 1
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "Spoofing" in file:
        label_mapping[file] = 1

# DDoS - ICMP
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DDoS-ICMP" in file:
        label_mapping[file] = 2
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DDoS-ICMP" in file:
        label_mapping[file] = 2

# DDoS - SYN
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DDoS-SYN" in file:
        label_mapping[file] = 3
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DDoS-SYN" in file:
        label_mapping[file] = 3

# DDoS - TCP
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DDoS-TCP" in file:
        label_mapping[file] = 4
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DDoS-TCP" in file:
        label_mapping[file] = 4

# DDoS - UDP
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DDoS-UDP" in file:
        label_mapping[file] = 5
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DDoS-UDP" in file:
        label_mapping[file] = 5

# DoS - ICMP
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DoS-ICMP" in file:
        label_mapping[file] = 6
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DoS-ICMP" in file:
        label_mapping[file] = 6

# DoS - SYN
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DoS-SYN" in file:
        label_mapping[file] = 7
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DoS-SYN" in file:
        label_mapping[file] = 7

# DoS - TCP
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DoS-TCP" in file:
        label_mapping[file] = 8
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DoS-TCP" in file:
        label_mapping[file] = 8

# DoS - UDP
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "DoS-UDP" in file:
        label_mapping[file] = 9
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "DoS-UDP" in file:
        label_mapping[file] = 9

# MQTT DDoS - Connect Flood
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "MQTT-DDoS-Connect_Flood" in file:
        label_mapping[file] = 10
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "MQTT-DDoS-Connect_Flood" in file:
        label_mapping[file] = 10

# MQTT DDoS - Publish Flood
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "MQTT-DDoS-Publish_Flood" in file:
        label_mapping[file] = 11
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "MQTT-DDoS-Publish_Flood" in file:
        label_mapping[file] = 11

# MQTT DoS - Connect Flood
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "MQTT-DoS-Connect_Flood" in file:
        label_mapping[file] = 12
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "MQTT-DoS-Connect_Flood" in file:
        label_mapping[file] = 12

# MQTT DoS - Publish Flood
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "MQTT-DoS-Publish_Flood" in file:
        label_mapping[file] = 13
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "MQTT-DoS-Publish_Flood" in file:
        label_mapping[file] = 13

# MQTT Malformed Data
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "Malformed_Data" in file:
        label_mapping[file] = 14
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "Malformed_Data" in file:
        label_mapping[file] = 14

# Reconnaissance - OS Scan
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "OS_Scan" in file:
        label_mapping[file] = 15
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "OS_Scan" in file:
        label_mapping[file] = 15

# Reconnaissance - Ping Sweep
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "Ping_Sweep" in file:
        label_mapping[file] = 16
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "Ping_Sweep" in file:
        label_mapping[file] = 16

# Reconnaissance - Port Scan
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "Port_Scan" in file:
        label_mapping[file] = 17
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "Port_Scan" in file:
        label_mapping[file] = 17

# Reconnaissance - VulScan
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test")):
    if "VulScan" in file:
        label_mapping[file] = 18
for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train")):
    if "VulScan" in file:
        label_mapping[file] = 18


In [25]:
# # Combine all label mappings
# label_mapping = {
#     **label_mapping_profiling,
#     **label_mapping_attacks_test,
#     **label_mapping_attacks_train,
# }


print("Loading profiling data (benign)...")
# Load the profiling CSV files (benign)
profiling_folder_path = os.path.join(main_folder_path, "profiling/CSV")
profiling_data = load_csv_files_with_labels(profiling_folder_path, label_mapping)

print("Loading attack data (malicious)...")
# Load the attack CSV files (malicious)
attack_folder_path_test = os.path.join(main_folder_path, "attacks/csv/test")
attack_folder_path_train = os.path.join(main_folder_path, "attacks/csv/train")



Loading profiling data (benign)...
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/SenseUBaby_Power.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Singcall_WAN_PHYSICAL.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Owltron_Camera_LAN_MIC.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Owltron_Camera_LAN_WATCH.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Blink_Camera_WAN_MIC.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/M1T_Camera_WAN_PHOTO.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Ecobee_Camera_LAN_MIC.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Owltron_Camera_WAN_WATCH.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/M1T_Camera_LAN_PHOTO.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Multifunctional_Pager

In [26]:
attack_data_test = load_csv_files_with_labels(attack_folder_path_test, label_mapping)
attack_data_train = load_csv_files_with_labels(attack_folder_path_train, label_mapping)


print("Combining data...")
# Combine the data
combined_data = pd.concat([profiling_data, attack_data_test, attack_data_train], ignore_index=True)
attack_data_test = load_csv_files_with_labels(
    attack_folder_path_test, label_mapping_attacks_test
)
attack_data_train = load_csv_files_with_labels(
    attack_folder_path_train, label_mapping_attacks_train
)


print("Combining data...")
# Combine the data
combined_data = pd.concat([profiling_data, attack_data_test, attack_data_train], ignore_index=True)


Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-ICMP1_test.pcap.csv with label: 2
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-UDP2_test.pcap.csv with label: 2
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/MQTT-DDoS-Connect_Flood_test.pcap.csv with label: 2
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/Recon-Port_Scan_test.pcap.csv with label: 3
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/MQTT-DoS-Publish_Flood_test.pcap.csv with label: 1
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-UDP1_test.pcap.csv with label: 2
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-ICMP2_test.pcap.csv with label: 2
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DoS-UDP_test.pcap.csv with label: 1
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DoS-SYN_test.pcap.csv with label: 1
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-SYN

In [None]:

print("Preprocessing data...")
# Preprocess the data
# Handling missing values by filling them with the mean of the column
combined_data.fillna(combined_data.mean(), inplace=True)

# Feature Selection: Drop columns that are not useful for classification
irrelevant_columns = ["Label"]  # Add any other non-numeric columns if needed
X = combined_data.drop(columns=irrelevant_columns)
y = combined_data["Label"]



Preprocessing data...


In [None]:
print("Standardizing data...")
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Standardizing data...


In [None]:

# %%

print("Splitting data into training and testing sets...")
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


print(X_train, X_test)
print(y_train, y_test)



Splitting data into training and testing sets...
[[-1.29868625e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31304873e-01
  -1.76946424e-01  8.99889466e-04]
 [-3.78474230e-02  1.41192453e+00 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 [-1.30060753e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 ...
 [-8.52838942e-02  1.41192453e+00 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 [-1.30060753e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 [-1.30188870e-01 -1.12745936e+00 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]] [[-8.87929451e-02  1.41192453e+00 -9.99581778e-02 ... -1.28239491e-01
  -1.76946424e-01  8.99889466e-04]
 [-4.48993809e-02 -3.33901893e-01 -9.99581778e-02 ...  2.01993883e+00
   2.10183339e+00  8.99889466e-04]
 [-1.30060753e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]


In [None]:

# %%

print("Training Random Forest Classifier...")
# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


Training Random Forest Classifier...


In [None]:


# %%

print("Predicting on the test set...")
# Predict on the test set
y_pred = clf.predict(X_test)



Predicting on the test set...


In [None]:
# %%
print("Printing classification report and accuracy...")
# Print classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

print("Finished!")

Printing classification report and accuracy...
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     77918
           1       1.00      1.00      1.00    458852
           2       1.00      1.00      1.00   1218901
           3       0.99      0.98      0.99     26387
           4       0.92      0.83      0.87      3535
           5       0.95      0.84      0.89      1333

    accuracy                           1.00   1786926
   macro avg       0.97      0.94      0.96   1786926
weighted avg       1.00      1.00      1.00   1786926

Accuracy: 0.9990296184620964
Finished!
