In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the dataset folder on Drive the working directory so that relative
# paths used later resolve against it.
import os
project_path = '/content/drive/MyDrive/Labelled_Multiclass_Dataset'
os.chdir(project_path)

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset CSV into a DataFrame. The path is absolute, so this read
# does not depend on the current working directory.
data = pd.read_csv('/content/drive/MyDrive/Labelled_Multiclass_Dataset/combined_data_without_okpVacc_modified.csv')

In [None]:
# Give the unnamed first CSV column (presumably a saved row index) a proper
# name, then discard it; errors='ignore' keeps this safe when it is absent.
drop_columns = ["id"]
data = (
    data
    .rename(columns={'Unnamed: 0': 'id'})
    .drop(columns=drop_columns, errors='ignore')
)

In [None]:
# Parse 'timestamp' as day-month-year hour:minute. Values that do not match
# the format become NaT (errors='coerce'). The format has no seconds field,
# so the parsed timestamps have minute resolution.
data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce', format='%d-%m-%Y %H:%M')

In [None]:
# Count NaN entries per column and report only the columns that have any
missing_val = data.isnull().sum(axis=0)
print(missing_val[missing_val.gt(0)])

Series([], dtype: int64)


In [None]:
# Count +/-inf entries in every numeric column and report the affected columns
numeric_cols = data.select_dtypes(include=[np.number]).columns
inf_count = np.isinf(data.loc[:, numeric_cols]).sum(axis=0)
print(inf_count.loc[inf_count.gt(0)])

Series([], dtype: int64)


In [None]:
# Replacing any infinite values (positive or negative) with NaN (not a number).
# The total NaN count is printed before and after so the number of converted
# values is visible in the cell output.
print(f'Initial missing values: {data.isna().sum().sum()}')

# Mutates `data` in place.
data.replace([np.inf, -np.inf], np.nan, inplace = True)

print(f'Missing values after processing infinite values: {data.isna().sum().sum()}')

Initial missing values: 0
Missing values after processing infinite values: 0


In [None]:
# Collect rows that exactly repeat an earlier row (first occurrence is kept
# out of the result) and report how many there are
duplicates = data.loc[data.duplicated(keep='first')]
print(f"Number of duplicate rows: {len(duplicates)}")

Number of duplicate rows: 0


In [None]:
# Define window size and step size for the sliding-window aggregation
window_size = pd.Timedelta('5s')  # 5-second window
step_size = pd.Timedelta('1s')   # 1-second sliding step

In [None]:
# Build one row per session, where a session is the 5-tuple
# (src_ip, dst_ip, src_port, dst_port, protocol).

# Ensure 'timestamp' is datetime before taking per-session min/max
# (expected to be a no-op when the column was already parsed).
data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce', format='%d-%m-%Y %H:%M')

session_keys = ['src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol']

# Aggregate session-based features and add timestamp information
session_data = data.groupby(session_keys).agg(
    # Session-based aggregations
    flow_duration=('flow_duration', 'sum'),
    total_forward_packets=('tot_fwd_pkts', 'sum'),
    total_backward_packets=('tot_bwd_pkts', 'sum'),
    total_bytes_forward=('totlen_fwd_pkts', 'sum'),
    total_bytes_backward=('totlen_bwd_pkts', 'sum'),
    mean_packet_length_forward=('fwd_pkt_len_mean', 'mean'),
    mean_packet_length_backward=('bwd_pkt_len_mean', 'mean'),
    packet_size_mean=('pkt_len_mean', 'mean'),
    flow_iat_mean=('flow_iat_mean', 'mean'),
    down_up_ratio=('down_up_ratio', 'mean'),
    subflow_fwd_pkts=('subflow_fwd_pkts', 'sum'),
    subflow_bwd_pkts=('subflow_bwd_pkts', 'sum'),
    subflow_fwd_byts=('subflow_fwd_byts', 'sum'),
    subflow_bwd_byts=('subflow_bwd_byts', 'sum'),

    # Direction-specific aggregations (Forward flows)
    fwd_pkt_len_mean=('fwd_pkt_len_mean', 'mean'),
    fwd_pkt_len_max=('fwd_pkt_len_max', 'max'),
    fwd_pkt_len_min=('fwd_pkt_len_min', 'min'),
    # 'std' is NaN for sessions that consist of a single row
    fwd_pkt_len_std=('fwd_pkt_len_std', 'std'),
    fwd_iat_mean=('fwd_iat_mean', 'mean'),
    fwd_iat_max=('fwd_iat_max', 'max'),
    fwd_iat_min=('fwd_iat_min', 'min'),
    fwd_iat_tot=('fwd_iat_tot', 'sum'),
    fwd_blk_rate_avg=('fwd_blk_rate_avg', 'mean'),

    # Direction-specific aggregations (Backward flows)
    bwd_pkt_len_mean=('bwd_pkt_len_mean', 'mean'),
    bwd_pkt_len_max=('bwd_pkt_len_max', 'max'),
    bwd_pkt_len_min=('bwd_pkt_len_min', 'min'),
    # 'std' is NaN for sessions that consist of a single row
    bwd_pkt_len_std=('bwd_pkt_len_std', 'std'),
    bwd_iat_mean=('bwd_iat_mean', 'mean'),
    bwd_iat_max=('bwd_iat_max', 'max'),
    bwd_iat_min=('bwd_iat_min', 'min'),
    bwd_iat_tot=('bwd_iat_tot', 'sum'),
    bwd_blk_rate_avg=('bwd_blk_rate_avg', 'mean'),

    # Timestamp info: min and max timestamp per session
    start_time=('timestamp', 'min'),
    end_time=('timestamp', 'max')
).reset_index()

# Total packets and bytes per session.
# These are derived from the per-session forward/backward sums. Adding
# data[...].sum() inside a groupby lambda would add the backward total of the
# WHOLE dataset to every session instead of the session's own backward total.
# They are inserted just before 'start_time' so the column order stays:
# ..., total_packets, total_bytes, start_time, end_time.
totals_position = session_data.columns.get_loc('start_time')
session_data.insert(
    totals_position,
    'total_packets',
    session_data['total_forward_packets'] + session_data['total_backward_packets'],
)
session_data.insert(
    totals_position + 1,
    'total_bytes',
    session_data['total_bytes_forward'] + session_data['total_bytes_backward'],
)

In [None]:
# Show the column names of the per-session aggregate table
session_data.columns

Index(['src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'flow_duration',
       'total_forward_packets', 'total_backward_packets',
       'total_bytes_forward', 'total_bytes_backward',
       'mean_packet_length_forward', 'mean_packet_length_backward',
       'packet_size_mean', 'flow_iat_mean', 'down_up_ratio',
       'subflow_fwd_pkts', 'subflow_bwd_pkts', 'subflow_fwd_byts',
       'subflow_bwd_byts', 'fwd_pkt_len_mean', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_std', 'fwd_iat_mean', 'fwd_iat_max',
       'fwd_iat_min', 'fwd_iat_tot', 'fwd_blk_rate_avg', 'bwd_pkt_len_mean',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_std', 'bwd_iat_mean',
       'bwd_iat_max', 'bwd_iat_min', 'bwd_iat_tot', 'bwd_blk_rate_avg',
       'total_packets', 'total_bytes', 'start_time', 'end_time'],
      dtype='object')

In [None]:
# Time-Based Sliding Window Aggregation
def sliding_window_aggregation(data, window_size, step_size):
    """Aggregate traffic counters over overlapping time windows.

    Windows start every ``step_size`` from the earliest to the latest index
    value of ``data`` and cover the half-open interval
    ``[start_time, start_time + window_size)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by timestamp. It must contain the columns
        'tot_fwd_pkts', 'tot_bwd_pkts', 'totlen_fwd_pkts', 'totlen_bwd_pkts',
        'fwd_pkt_len_mean', 'bwd_pkt_len_mean' and 'flow_duration'.
        Rows whose index is NaT are ignored. The input is not modified.
    window_size : pandas.Timedelta
        Length of each window.
    step_size : pandas.Timedelta
        Distance between consecutive window starts.

    Returns
    -------
    pandas.DataFrame
        One row per window with its bounds and aggregates. Windows that
        contain no rows have sums of 0 and NaN averages. An empty frame with
        the same columns is returned when ``data`` has no usable rows.
    """
    output_columns = [
        'start_time',
        'end_time',
        'total_forward_packets_window',
        'total_backward_packets_window',
        'total_forward_bytes_window',
        'total_backward_bytes_window',
        'average_packet_size_fwd_window',
        'average_packet_size_bwd_window',
        'flow_duration_window',
        'packet_count_window',
    ]

    # Sort once so each window is a contiguous slice. A stable sort keeps the
    # original relative order of rows that share a timestamp.
    ordered = data.loc[data.index.notna()].sort_index(kind='stable')
    if len(ordered) == 0:
        return pd.DataFrame(columns=output_columns)

    timeline = ordered.index

    # Generate the window bounds using pd.date_range
    start_times = pd.date_range(start=timeline[0], end=timeline[-1], freq=step_size)
    end_times = start_times + window_size

    # Locate every window's slice with binary search instead of scanning the
    # whole frame per window: rows with start <= index < end.
    first_rows = timeline.searchsorted(start_times, side='left')
    stop_rows = timeline.searchsorted(end_times, side='left')

    window_aggregates = []
    for start_time, end_time, lo, hi in zip(start_times, end_times, first_rows, stop_rows):
        window = ordered.iloc[lo:hi]

        # Perform aggregation for each window
        window_aggregates.append({
            'start_time': start_time,
            'end_time': end_time,
            'total_forward_packets_window': window['tot_fwd_pkts'].sum(),
            'total_backward_packets_window': window['tot_bwd_pkts'].sum(),
            'total_forward_bytes_window': window['totlen_fwd_pkts'].sum(),
            'total_backward_bytes_window': window['totlen_bwd_pkts'].sum(),
            'average_packet_size_fwd_window': window['fwd_pkt_len_mean'].mean(),
            'average_packet_size_bwd_window': window['bwd_pkt_len_mean'].mean(),
            'flow_duration_window': window['flow_duration'].sum(),
            # Number of rows of `data` in the window (not a packet count)
            'packet_count_window': len(window),
        })

    return pd.DataFrame(window_aggregates, columns=output_columns)

In [None]:
# Set the timestamp as the index so windows can be selected by time.
# Guarded so that re-running this cell is a no-op; without the guard a second
# run raises KeyError because 'timestamp' is no longer a column.
if data.index.name != 'timestamp':
    data.set_index('timestamp', inplace=True)

In [None]:
# Apply sliding window aggregation to the timestamp-indexed frame, using the
# window_size and step_size defined earlier in the notebook
sliding_windows_data = sliding_window_aggregation(data, window_size, step_size)

In [None]:
# Show the column names of the per-window aggregate table
sliding_windows_data.columns

Index(['start_time', 'end_time', 'total_forward_packets_window',
       'total_backward_packets_window', 'total_forward_bytes_window',
       'total_backward_bytes_window', 'average_packet_size_fwd_window',
       'average_packet_size_bwd_window', 'flow_duration_window',
       'packet_count_window'],
      dtype='object')

In [None]:
# Attach session features to every window with a backward as-of join:
# each window row is paired with the single session whose start_time is the
# latest one not after the window's start_time (this is not an
# interval-overlap join). Both inputs must be sorted on the join key.
aggregated_data = pd.merge_asof(
    left=sliding_windows_data.sort_values(by='start_time'),
    right=session_data.sort_values(by='start_time'),
    on='start_time',
    direction='backward',
)

In [None]:
# Show the column names of the merged window + session table
aggregated_data.columns

Index(['start_time', 'end_time_x', 'total_forward_packets_window',
       'total_backward_packets_window', 'total_forward_bytes_window',
       'total_backward_bytes_window', 'average_packet_size_fwd_window',
       'average_packet_size_bwd_window', 'flow_duration_window',
       'packet_count_window', 'src_ip', 'dst_ip', 'src_port', 'dst_port',
       'protocol', 'flow_duration', 'total_forward_packets',
       'total_backward_packets', 'total_bytes_forward', 'total_bytes_backward',
       'mean_packet_length_forward', 'mean_packet_length_backward',
       'packet_size_mean', 'flow_iat_mean', 'down_up_ratio',
       'subflow_fwd_pkts', 'subflow_bwd_pkts', 'subflow_fwd_byts',
       'subflow_bwd_byts', 'fwd_pkt_len_mean', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_std', 'fwd_iat_mean', 'fwd_iat_max',
       'fwd_iat_min', 'fwd_iat_tot', 'fwd_blk_rate_avg', 'bwd_pkt_len_mean',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_std', 'bwd_iat_mean',
       'bwd_iat_

In [None]:
# Select the session key columns (src_ip, dst_ip, src_port, dst_port, protocol)
# plus Label from the original DataFrame, keeping one row per distinct
# combination of those six values
original_subset = data[['src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'Label']].drop_duplicates()

In [None]:
# Perform the merge: left-join Label onto every aggregated row using the five
# session key columns.
# NOTE(review): original_subset is de-duplicated on key + Label, so a key that
# carries more than one Label would match several rows here and duplicate the
# aggregated row. Confirm each key maps to a single Label.
aggregated_data = aggregated_data.merge(original_subset, on=['src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol'], how='left')

In [None]:
# Count NaN entries per column of the merged table and report only the
# columns that have any
missing_val = aggregated_data.isnull().sum(axis=0)
print(missing_val[missing_val.gt(0)])

average_packet_size_fwd_window    266420
average_packet_size_bwd_window    266420
fwd_pkt_len_std                   143521
bwd_pkt_len_std                   143521
dtype: int64


In [None]:
# Names of all numeric columns in aggregated_data, as a plain list
num_col = aggregated_data.select_dtypes(include='number').columns.tolist()
print(num_col)

['total_forward_packets_window', 'total_backward_packets_window', 'total_forward_bytes_window', 'total_backward_bytes_window', 'average_packet_size_fwd_window', 'average_packet_size_bwd_window', 'flow_duration_window', 'packet_count_window', 'src_port', 'dst_port', 'protocol', 'flow_duration', 'total_forward_packets', 'total_backward_packets', 'total_bytes_forward', 'total_bytes_backward', 'mean_packet_length_forward', 'mean_packet_length_backward', 'packet_size_mean', 'flow_iat_mean', 'down_up_ratio', 'subflow_fwd_pkts', 'subflow_bwd_pkts', 'subflow_fwd_byts', 'subflow_bwd_byts', 'fwd_pkt_len_mean', 'fwd_pkt_len_max', 'fwd_pkt_len_min', 'fwd_pkt_len_std', 'fwd_iat_mean', 'fwd_iat_max', 'fwd_iat_min', 'fwd_iat_tot', 'fwd_blk_rate_avg', 'bwd_pkt_len_mean', 'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_std', 'bwd_iat_mean', 'bwd_iat_max', 'bwd_iat_min', 'bwd_iat_tot', 'bwd_blk_rate_avg', 'total_packets', 'total_bytes']


In [None]:
# Turn +/-inf into NaN, then fill the NaNs of each numeric column with that
# column's median. Columns without NaN are left untouched.
aggregated_data.replace([np.inf, -np.inf], np.nan, inplace=True)
for col in (name for name in num_col if aggregated_data[name].isna().any()):
    median_value = aggregated_data[col].median()
    # Assign the filled column back rather than filling in place
    aggregated_data[col] = aggregated_data[col].fillna(median_value)

In [None]:
# Re-check NaN counts per column after imputation; report only the columns
# that still have any
missing_val = aggregated_data.isnull().sum(axis=0)
print(missing_val[missing_val.gt(0)])

Series([], dtype: int64)


In [None]:
# Order the merged rows by window start time (returns a new frame;
# aggregated_data itself is left unchanged)
data_sorted = aggregated_data.sort_values(by='start_time')

In [None]:
# Make sure both time bounds are datetime before the time arithmetic and
# comparisons that follow. 'end_time_y' is presumably the session-side
# end_time, suffixed by the earlier merge because the window table also has
# an 'end_time' column.
data_sorted['start_time'] = pd.to_datetime(data_sorted['start_time'])
data_sorted['end_time_y'] = pd.to_datetime(data_sorted['end_time_y'])

In [None]:
# Set window_size and step_size (which are Timedelta).
# These repeat the values assigned earlier in the notebook.
window_size = pd.Timedelta('5s')  # 5 seconds
step_size = pd.Timedelta('1s')    # 1 second

In [None]:
# Calculate total time range: from the earliest start_time to the latest
# end_time_y in data_sorted
total_time = data_sorted['end_time_y'].max() - data_sorted['start_time'].min()

In [None]:
# Calculate the number of windows based on total time range and window_size:
# (steps that fit in total_time) - (steps spanned by one window) + 1,
# all in whole multiples of step_size
n_windows = (total_time // step_size) - (window_size // step_size) + 1

In [None]:
# Create empty list to hold sliding windows.
# The windowing loop appends to this list, so re-run this cell before
# re-running the loop; otherwise the windows are appended a second time.
sliding_windows = []

In [None]:
# Get the start time of the dataset: the earliest start_time in data_sorted,
# used as the origin of the first window
start_time = data_sorted['start_time'].min()

In [None]:
# Show how many windows the loop below will generate
print(n_windows)

287996


In [None]:
# Build one feature matrix per sliding window.
#
# A row of data_sorted belongs to window i when
#     start_time >= window_start  and  end_time_y <= window_end
# with window_start = start_time + i * step_size and
# window_end = window_start + window_size.

# Columns exported for every window
feature_columns = ['total_forward_packets_window', 'total_backward_packets_window', 'total_forward_bytes_window', 'total_backward_bytes_window', 'average_packet_size_fwd_window', 'average_packet_size_bwd_window', 'flow_duration_window', 'packet_count_window', 'src_port', 'dst_port', 'protocol', 'flow_duration', 'total_forward_packets', 'total_backward_packets', 'total_bytes_forward', 'total_bytes_backward', 'mean_packet_length_forward', 'mean_packet_length_backward', 'packet_size_mean', 'flow_iat_mean', 'down_up_ratio', 'subflow_fwd_pkts', 'subflow_bwd_pkts', 'subflow_fwd_byts', 'subflow_bwd_byts', 'fwd_pkt_len_mean', 'fwd_pkt_len_max', 'fwd_pkt_len_min', 'fwd_pkt_len_std', 'fwd_iat_mean', 'fwd_iat_max', 'fwd_iat_min', 'fwd_iat_tot', 'fwd_blk_rate_avg', 'bwd_pkt_len_mean', 'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_std', 'bwd_iat_mean', 'bwd_iat_max', 'bwd_iat_min', 'bwd_iat_tot', 'bwd_blk_rate_avg', 'total_packets', 'total_bytes']

# Loop-invariant work is done once: select the feature columns and convert
# them to a single array instead of repeating it for every window.
feature_matrix = data_sorted[feature_columns].to_numpy()
row_starts = data_sorted['start_time']
row_ends = data_sorted['end_time_y']

# When rows are ordered by start_time, every row before the first one with
# start_time >= window_start can be skipped via binary search. If they are
# not ordered, fall back to scanning from row 0 (the mask below is still
# applied in full, so the result is the same either way).
rows_sorted_by_start = row_starts.is_monotonic_increasing

# Start from an empty list so re-running this cell does not append the same
# windows a second time.
sliding_windows = []

for i in range(n_windows):
    # Define window start and end times
    window_start = start_time + i * step_size
    window_end = window_start + window_size

    first_row = row_starts.searchsorted(window_start, side='left') if rows_sorted_by_start else 0

    # Filter data within this window
    in_window = (
        (row_starts.iloc[first_row:] >= window_start)
        & (row_ends.iloc[first_row:] <= window_end)
    ).to_numpy()

    # Extract the relevant features for this window (boolean indexing copies)
    window_features = feature_matrix[first_row:][in_window]

    # Add this window's features to the list
    sliding_windows.append(window_features)