In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('iot_intrusion_dataset.csv')
df.head()

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat,Sub_Cat
0,192.168.0.13-192.168.0.16-10000-10101-17,192.168.0.13,10000,192.168.0.16,10101,17,25/07/2019 03:25:53 AM,75,1,1,...,0.0,0.0,0.0,75.0,0.0,75.0,75.0,Anomaly,Mirai,Mirai-Ackflooding
1,192.168.0.13-222.160.179.132-554-2179-6,222.160.179.132,2179,192.168.0.13,554,6,26/05/2019 10:11:06 PM,5310,1,2,...,0.0,0.0,0.0,2655.0,2261.327486,4254.0,1056.0,Anomaly,DoS,DoS-Synflooding
2,192.168.0.13-192.168.0.16-9020-52727-6,192.168.0.16,52727,192.168.0.13,9020,6,11/07/2019 01:24:48 AM,141,0,3,...,0.0,0.0,0.0,70.5,0.707107,71.0,70.0,Anomaly,Scan,Scan Port OS
3,192.168.0.13-192.168.0.16-9020-52964-6,192.168.0.16,52964,192.168.0.13,9020,6,04/09/2019 03:58:17 AM,151,0,2,...,0.0,0.0,0.0,151.0,0.0,151.0,151.0,Anomaly,Mirai,Mirai-Hostbruteforceg
4,192.168.0.1-239.255.255.250-36763-1900-17,192.168.0.1,36763,239.255.255.250,1900,17,10/09/2019 01:41:18 AM,153,2,1,...,0.0,0.0,0.0,76.5,0.707107,77.0,76.0,Anomaly,Mirai,Mirai-Hostbruteforceg


In [3]:


# List of columns to drop
columns_to_drop = [
    'Flow_ID',        # Unique identifier, not useful for prediction
    'Src_IP',         # Source IP address (irrelevant unless engineered)
    'Dst_IP',         # Destination IP address (irrelevant unless engineered)
    'Timestamp',      # Timestamp (not useful unless doing time-series analysis)
    'Cat',            # Related to the target variable (Label)
    'Sub_Cat',        # Related to the target variable (Label)
    'Src_Port',       # Source port (usually not useful for intrusion detection)
    'Dst_Port',       # Destination port (usually not useful for intrusion detection)
    'Protocol',       # Protocol (can be kept if you want to encode it, but dropping for simplicity)
    'Fwd_URG_Flags',  # Rarely used in modern networks
    'Bwd_URG_Flags',  # Rarely used in modern networks
    'CWE_Flag_Count', # Rarely used in modern networks
    'ECE_Flag_Cnt',   # Rarely used in modern networks
    'Down/Up_Ratio',  # Redundant with other flow features
    'Fwd_Blk_Rate_Avg', # Redundant with other flow features
    'Bwd_Blk_Rate_Avg', # Redundant with other flow features
    'Init_Fwd_Win_Byts', # Redundant with other flow features
    'Init_Bwd_Win_Byts', # Redundant with other flow features
    'Fwd_Seg_Size_Min',  # Redundant with other packet length features
    'Active_Min',        # Redundant with other timing features
    'Idle_Min',          # Redundant with other timing features
]

# Drop the unnecessary columns
df_updated = df.drop(columns=columns_to_drop)

# Display the remaining columns
print("Remaining columns after dropping unnecessary ones:")
print(df.columns)

Remaining columns after dropping unnecessary ones:
Index(['Flow_ID', 'Src_IP', 'Src_Port', 'Dst_IP', 'Dst_Port', 'Protocol',
       'Timestamp', 'Flow_Duration', 'Tot_Fwd_Pkts', 'Tot_Bwd_Pkts',
       'TotLen_Fwd_Pkts', 'TotLen_Bwd_Pkts', 'Fwd_Pkt_Len_Max',
       'Fwd_Pkt_Len_Min', 'Fwd_Pkt_Len_Mean', 'Fwd_Pkt_Len_Std',
       'Bwd_Pkt_Len_Max', 'Bwd_Pkt_Len_Min', 'Bwd_Pkt_Len_Mean',
       'Bwd_Pkt_Len_Std', 'Flow_Byts/s', 'Flow_Pkts/s', 'Flow_IAT_Mean',
       'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min', 'Fwd_IAT_Tot',
       'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max', 'Fwd_IAT_Min',
       'Bwd_IAT_Tot', 'Bwd_IAT_Mean', 'Bwd_IAT_Std', 'Bwd_IAT_Max',
       'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags', 'Fwd_URG_Flags',
       'Bwd_URG_Flags', 'Fwd_Header_Len', 'Bwd_Header_Len', 'Fwd_Pkts/s',
       'Bwd_Pkts/s', 'Pkt_Len_Min', 'Pkt_Len_Max', 'Pkt_Len_Mean',
       'Pkt_Len_Std', 'Pkt_Len_Var', 'FIN_Flag_Cnt', 'SYN_Flag_Cnt',
       'RST_Flag_Cnt', 'PSH_Flag_Cnt', 'ACK_Flag_C

In [4]:
df_updated.head()

Unnamed: 0,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,TotLen_Fwd_Pkts,TotLen_Bwd_Pkts,Fwd_Pkt_Len_Max,Fwd_Pkt_Len_Min,Fwd_Pkt_Len_Mean,Fwd_Pkt_Len_Std,Bwd_Pkt_Len_Max,...,Subflow_Bwd_Pkts,Subflow_Bwd_Byts,Fwd_Act_Data_Pkts,Active_Mean,Active_Std,Active_Max,Idle_Mean,Idle_Std,Idle_Max,Label
0,75,1,1,982.0,1430.0,982.0,982.0,982.0,0.0,1430.0,...,1,1430,1,0.0,0.0,0.0,75.0,0.0,75.0,Anomaly
1,5310,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0,0,0.0,0.0,0.0,2655.0,2261.327486,4254.0,Anomaly
2,141,0,3,0.0,2806.0,0.0,0.0,0.0,0.0,1388.0,...,3,2806,0,0.0,0.0,0.0,70.5,0.707107,71.0,Anomaly
3,151,0,2,0.0,2776.0,0.0,0.0,0.0,0.0,1388.0,...,2,2776,0,0.0,0.0,0.0,151.0,0.0,151.0,Anomaly
4,153,2,1,886.0,420.0,452.0,434.0,443.0,12.727922,420.0,...,1,420,2,0.0,0.0,0.0,76.5,0.707107,77.0,Anomaly


In [5]:

numerical_features = [
    'Flow_Duration', 'Tot_Fwd_Pkts', 'Tot_Bwd_Pkts', 'TotLen_Fwd_Pkts',
    'TotLen_Bwd_Pkts', 'Fwd_Pkt_Len_Max', 'Fwd_Pkt_Len_Min', 'Fwd_Pkt_Len_Mean',
    'Fwd_Pkt_Len_Std', 'Bwd_Pkt_Len_Max', 'Bwd_Pkt_Len_Min', 'Bwd_Pkt_Len_Mean',
    'Bwd_Pkt_Len_Std', 'Flow_Byts/s', 'Flow_Pkts/s', 'Flow_IAT_Mean',
    'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min', 'Fwd_IAT_Tot', 'Fwd_IAT_Mean',
    'Fwd_IAT_Std', 'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Tot', 'Bwd_IAT_Mean',
    'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Len', 'Bwd_Header_Len',
    'Fwd_Pkts/s', 'Bwd_Pkts/s', 'Pkt_Len_Min', 'Pkt_Len_Max', 'Pkt_Len_Mean',
    'Pkt_Len_Std', 'Pkt_Len_Var', 'FIN_Flag_Cnt', 'SYN_Flag_Cnt', 'RST_Flag_Cnt',
    'PSH_Flag_Cnt', 'ACK_Flag_Cnt', 'URG_Flag_Cnt', 'Subflow_Fwd_Pkts',
    'Subflow_Fwd_Byts', 'Subflow_Bwd_Pkts', 'Subflow_Bwd_Byts', 'Active_Mean',
    'Active_Std', 'Active_Max', 'Idle_Mean', 'Idle_Std', 'Idle_Max'
]

# # Convert numerical columns to float32 to save memory
# for col in numerical_features:
#     df[col] = df[col].astype('float32')

# # Initialize the StandardScaler
# scaler = StandardScaler()

# # Normalize in batches (if the dataset is too large)
# batch_size = 50000  # Adjust based on your system's memory
# for i in range(0, len(df), batch_size):
#     batch = df[numerical_features].iloc[i:i + batch_size]
#     df[numerical_features].iloc[i:i + batch_size] = scaler.fit_transform(batch)

# # Display the first few rows of the normalized dataset
# print("Dataset after normalization:")
# print(df.head())

In [6]:
import dask.dataframe as dd

# Load the dataset using Dask
df = dd.read_csv('iot_intrusion_dataset.csv')

# Normalize numerical features
df[numerical_features] = df[numerical_features].map_partitions(
    lambda x: scaler.fit_transform(x), meta=df[numerical_features]
)

# Compute the result (this will execute the operations)
df = df.compute()

: 