In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
import tqdm

In [3]:
!ls CICIoMT2024

[34mBluetooth[m[m     [31mREADME.pdf[m[m    [34mWiFI_and_MQTT[m[m


In [4]:
# Define the main directory path
main_folder_path = "CICIoMT2024/WiFI_and_MQTT/"


# Function to load CSV files into DataFrames with labels
def load_and_label_csv(file_path, label):
    df = pd.read_csv(file_path)
    df["Label"] = label
    return df

In [5]:
# Function to traverse directories and load CSV files with labels
def load_csv_files_with_labels(folder_path, label_mapping):
    all_data = pd.DataFrame()
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                label = label_mapping.get(file)
                if label is not None:
                    print(f"Loading file: {file_path} with label: {label}")
                    df = load_and_label_csv(file_path, label)
                    all_data = pd.concat([all_data, df], ignore_index=True)
    return all_data

In [6]:
# Define label mappings for 19 classes
label_mapping = {
    **{file: 0 for file in os.listdir(os.path.join(main_folder_path, "profiling/CSV"))},
    **{
        file: 1
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "Spoofing" in file
    },
    **{
        file: 1
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "Spoofing" in file
    },
    **{
        file: 2
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DDoS-ICMP" in file
    },
    **{
        file: 2
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DDoS-ICMP" in file
    },
    **{
        file: 3
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DDoS-SYN" in file
    },
    **{
        file: 3
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DDoS-SYN" in file
    },
    **{
        file: 4
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DDoS-TCP" in file
    },
    **{
        file: 4
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DDoS-TCP" in file
    },
    **{
        file: 5
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DDoS-UDP" in file
    },
    **{
        file: 5
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DDoS-UDP" in file
    },
    **{
        file: 6
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DoS-ICMP" in file
    },
    **{
        file: 6
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DoS-ICMP" in file
    },
    **{
        file: 7
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DoS-SYN" in file
    },
    **{
        file: 7
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DoS-SYN" in file
    },
    **{
        file: 8
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DoS-TCP" in file
    },
    **{
        file: 8
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DoS-TCP" in file
    },
    **{
        file: 9
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "DoS-UDP" in file
    },
    **{
        file: 9
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "DoS-UDP" in file
    },
    **{
        file: 10
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "MQTT-DDoS-Connect_Flood" in file
    },
    **{
        file: 10
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "MQTT-DDoS-Connect_Flood" in file
    },
    **{
        file: 11
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "MQTT-DDoS-Publish_Flood" in file
    },
    **{
        file: 11
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "MQTT-DDoS-Publish_Flood" in file
    },
    **{
        file: 12
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "MQTT-DoS-Connect_Flood" in file
    },
    **{
        file: 12
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "MQTT-DoS-Connect_Flood" in file
    },
    **{
        file: 13
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "MQTT-DoS-Publish_Flood" in file
    },
    **{
        file: 13
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "MQTT-DoS-Publish_Flood" in file
    },
    **{
        file: 14
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "Malformed_Data" in file
    },
    **{
        file: 14
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "Malformed_Data" in file
    },
    **{
        file: 15
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "OS_Scan" in file
    },
    **{
        file: 15
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "OS_Scan" in file
    },
    **{
        file: 16
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "Ping_Sweep" in file
    },
    **{
        file: 16
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "Ping_Sweep" in file
    },
    **{
        file: 17
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "Port_Scan" in file
    },
    **{
        file: 17
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "Port_Scan" in file
    },
    **{
        file: 18
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/test"))
        if "VulScan" in file
    },
    **{
        file: 18
        for file in os.listdir(os.path.join(main_folder_path, "attacks/csv/train"))
        if "VulScan" in file
    },
}

In [7]:
print("Loading profiling data (benign)...")

# Load the profiling CSV files (benign)
profiling_folder_path = os.path.join(main_folder_path, "profiling/CSV")
profiling_data = load_csv_files_with_labels(profiling_folder_path, label_mapping)

Loading profiling data (benign)...
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/SenseUBaby_Power.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Singcall_WAN_PHYSICAL.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Owltron_Camera_LAN_MIC.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Owltron_Camera_LAN_WATCH.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Blink_Camera_WAN_MIC.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/M1T_Camera_WAN_PHOTO.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Ecobee_Camera_LAN_MIC.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Owltron_Camera_WAN_WATCH.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/M1T_Camera_LAN_PHOTO.pcap.csv with label: 0
Loading file: CICIoMT2024/WiFI_and_MQTT/profiling/CSV/Multifunctional_Pager

In [8]:
print("Loading attack data (malicious)...")

# Load the attack CSV files (malicious)
attack_folder_path_test = os.path.join(main_folder_path, "attacks/csv/test")
attack_folder_path_train = os.path.join(main_folder_path, "attacks/csv/train")

# Load the attack CSV files (malicious)
attack_data_test = load_csv_files_with_labels(attack_folder_path_test, label_mapping)
attack_data_train = load_csv_files_with_labels(attack_folder_path_train, label_mapping)

Loading attack data (malicious)...
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-ICMP1_test.pcap.csv with label: 6
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-UDP2_test.pcap.csv with label: 9
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/MQTT-DDoS-Connect_Flood_test.pcap.csv with label: 10
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/Recon-Port_Scan_test.pcap.csv with label: 17
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/MQTT-DoS-Publish_Flood_test.pcap.csv with label: 13
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-UDP1_test.pcap.csv with label: 9
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DDoS-ICMP2_test.pcap.csv with label: 6
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DoS-UDP_test.pcap.csv with label: 9
Loading file: CICIoMT2024/WiFI_and_MQTT/attacks/csv/test/TCP_IP-DoS-SYN_test.pcap.csv with label: 7
Loading file: CICIoMT2024/WiFI_and

In [9]:
print("Combining data...")
# Combine the data
combined_data = pd.concat(
    [profiling_data, attack_data_test, attack_data_train], ignore_index=True
)

Combining data...


In [10]:
print("Preprocessing data...")
# Preprocess the data
# Handling missing values by filling them with the mean of the column
combined_data.fillna(combined_data.mean(), inplace=True)

# Feature Selection: Drop columns that are not useful for classification
irrelevant_columns = ["Label"]  # Add any other non-numeric columns if needed
X = combined_data.drop(columns=irrelevant_columns)
y = combined_data["Label"]

Preprocessing data...


In [11]:
print("Standardizing data...")
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# # Address class imbalance using SMOTE
# print("Applying SMOTE for class imbalance...")
# smote = SMOTE(random_state=42)
# X_scaled, y = smote.fit_resample(X_scaled, y)

Standardizing data...


In [12]:
print("Splitting data into training and testing sets...")
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print(X_train, X_test)
print(y_train, y_test)

Splitting data into training and testing sets...
[[-1.29868625e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31304873e-01
  -1.76946424e-01  8.99889466e-04]
 [-3.78474230e-02  1.41192453e+00 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 [-1.30060753e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 ...
 [-8.52838942e-02  1.41192453e+00 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 [-1.30060753e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]
 [-1.30188870e-01 -1.12745936e+00 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]] [[-8.87929451e-02  1.41192453e+00 -9.99581778e-02 ... -1.28239491e-01
  -1.76946424e-01  8.99889466e-04]
 [-4.48993809e-02 -3.33901893e-01 -9.99581778e-02 ...  2.01993883e+00
   2.10183339e+00  8.99889466e-04]
 [-1.30060753e-01 -3.33901893e-01 -9.99581778e-02 ... -1.31309209e-01
  -4.12682267e-01  8.99889466e-04]


In [13]:
print("Training Random Forest Classifier...")
# Train a Random Forest Classifier with class weights
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="sqrt",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight="balanced",
    ccp_alpha=0.0,
    max_samples=None,
)
clf.fit(X_train, y_train)

# print("Training Balanced Random Forest Classifier...")
# clf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train, y_train)

Training Random Forest Classifier...


In [14]:
print("Predicting on the test set...")
# Predict on the test set
y_pred = clf.predict(X_test)

Predicting on the test set...


In [15]:
# %%
print("Printing classification report and accuracy...")
# Print classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

print("Finished!")

Printing classification report and accuracy...
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     77918
           1       0.91      0.84      0.87      3535
           6       1.00      1.00      1.00    480723
           7       1.00      1.00      1.00    303025
           8       1.00      1.00      1.00    289764
           9       1.00      1.00      1.00    540296
          10       1.00      1.00      1.00     42939
          11       1.00      1.00      1.00      7279
          12       1.00      1.00      1.00      3207
          13       1.00      1.00      1.00     10520
          14       0.95      0.84      0.89      1333
          15       0.87      0.74      0.80      4049
          16       0.83      0.82      0.83       198
          17       0.96      0.97      0.96     21497
          18       0.87      0.70      0.78       643

    accuracy                           1.00   1786926
   macro a

In [17]:
import tqdm
# Alternative 1
print("\nTraining Random Forest Classifier with Alternative 1...")
clf_1 = RandomForestClassifier(
    n_estimators=500,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="log2",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight="balanced",
    ccp_alpha=0.0,
    max_samples=None,
)


# Number of trees to add incrementally
n_estimators_increment = 10
total_estimators = 500

# Create a progress bar
with tqdm(total=total_estimators) as pbar:
    for i in range(int(total_estimators / n_estimators_increment)):
        clf_1.set_params(n_estimators=clf_1.n_estimators + n_estimators_increment)
        clf_1.fit(X_train, y_train)
        pbar.update(n_estimators_increment)



Training Random Forest Classifier with Alternative 1...


In [None]:
y_pred_1 = clf_1.predict(X_test)

In [None]:
print("Classification Report for Alternative 1:")
print(classification_report(y_test, y_pred_1))
print("Accuracy for Alternative 1:", accuracy_score(y_test, y_pred_1))

In [None]:
# Alternative 2
print("\nTraining Random Forest Classifier with Alternative 2...")
clf_2 = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="sqrt",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
clf_2.fit(X_train, y_train)

In [None]:
y_pred_2 = clf_2.predict(X_test)

In [None]:
print("Classification Report for Alternative 2:")
print(classification_report(y_test, y_pred_2))
print("Accuracy for Alternative 2:", accuracy_score(y_test, y_pred_2))

In [None]:
# Alternative 3
print("\nTraining Random Forest Classifier with Alternative 3...")
clf_3 = RandomForestClassifier(
    n_estimators=300,
    criterion="gini",
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=4,
    min_weight_fraction_leaf=0.0,
    max_features="log2",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
clf_3.fit(X_train, y_train)

In [None]:
y_pred_3 = clf_3.predict(X_test)

In [None]:
print("Classification Report for Alternative 3:")
print(classification_report(y_test, y_pred_3))
print("Accuracy for Alternative 3:", accuracy_score(y_test, y_pred_3))

In [None]:
# Alternative 4
print("\nTraining Random Forest Classifier with Alternative 4...")
clf_4 = RandomForestClassifier(
    n_estimators=150,
    criterion="gini",
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="sqrt",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight="balanced_subsample",
    ccp_alpha=0.0,
    max_samples=None,
)
clf_4.fit(X_train, y_train)

In [None]:
y_pred_4 = clf_4.predict(X_test)

In [None]:
print("Classification Report for Alternative 4:")
print(classification_report(y_test, y_pred_4))
print("Accuracy for Alternative 4:", accuracy_score(y_test, y_pred_4))

In [None]:
# Alternative 5
print("\nTraining Random Forest Classifier with Alternative 5...")
clf_5 = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="sqrt",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    warm_start=True,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
clf_5.fit(X_train, y_train)

In [None]:
clf_5.set_params(n_estimators=200)
clf_5.fit(X_train, y_train)

In [None]:
clf_5.set_params(n_estimators=300)
clf_5.fit(X_train, y_train)

In [None]:
y_pred_5 = clf_5.predict(X_test)

In [None]:
print("Classification Report for Alternative 5:")
print(classification_report(y_test, y_pred_5))
print("Accuracy for Alternative 5:", accuracy_score(y_test, y_pred_5))