In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

In [3]:
# Read the synthetic traffic capture from disk into a DataFrame.
df = pd.read_csv("synthetic_network_traffic.csv")

# Ensure the timestamp column holds datetimes (read_csv yields strings unless told otherwise).
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['timestamp'] = parsed_timestamps

In [4]:
def engineer_traffic_features(frame, window=timedelta(seconds=60),
                              syn_probability=0.1, seed=42):
    """Derive per-packet features from a traffic DataFrame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide a datetime ``timestamp`` column and numeric
        ``packet_size`` / ``connection_duration`` columns.
        NOTE(review): rows are processed in their existing order; the data is
        presumably chronological, nothing here sorts it -- confirm upstream.
    window : datetime.timedelta, default 60 seconds
        Minimum time that must elapse before the SYN frequency is recomputed.
    syn_probability : float, default 0.1
        Probability with which each packet is *simulated* to be a SYN packet.
    seed : int or None, default 42
        Seed for the private random generator used for the SYN simulation.
        Pass ``None`` for non-deterministic draws.

    Returns
    -------
    tuple of three lists, each ``len(frame)`` long
        ``(inter_arrival_times, duration_ratios, syn_frequencies)``.

    Raises
    ------
    ValueError
        If ``window`` is not strictly positive (it is used as a divisor bound).
    """
    if window <= timedelta(0):
        raise ValueError("window must be a positive duration")

    timestamps = frame['timestamp']
    row_count = len(timestamps)

    # Empty input: nothing to compute, and .iloc[0] below would raise.
    if row_count == 0:
        return [], [], []

    # Inter-arrival time in seconds; the first packet has no predecessor -> 0.
    inter_arrival_times = timestamps.diff().dt.total_seconds().tolist()
    inter_arrival_times[0] = 0

    # packet_size / connection_duration, with 0 wherever the duration is not
    # strictly positive (this also covers NaN durations, since NaN > 0 is False).
    sizes = frame['packet_size'].to_numpy(dtype=float)
    durations = frame['connection_duration'].to_numpy(dtype=float)
    ratio_values = np.zeros(row_count, dtype=float)
    np.divide(sizes, durations, out=ratio_values, where=durations > 0)
    duration_ratios = ratio_values.tolist()

    # SYN frequency is stateful (running window), so it stays an explicit loop.
    # A dedicated seeded generator makes reruns reproducible without touching
    # the global ``random`` state.
    syn_rng = random.Random(seed)
    syn_frequencies = []
    window_start = timestamps.iloc[0]
    syn_count = 0
    current_frequency = 0  # reported until the first window has elapsed

    for current_timestamp in timestamps:
        # One draw per packet decides whether it counts as a simulated SYN.
        if syn_rng.random() < syn_probability:
            syn_count += 1

        elapsed = current_timestamp - window_start
        if elapsed >= window:
            # Window complete: SYNs per second over the time actually elapsed,
            # then start a fresh window at this packet.
            current_frequency = syn_count / elapsed.total_seconds()
            syn_count = 0
            window_start = current_timestamp

        # Between window boundaries the last computed frequency is carried forward.
        syn_frequencies.append(current_frequency)

    return inter_arrival_times, duration_ratios, syn_frequencies


# Length of the SYN-frequency aggregation window.
time_window_duration = timedelta(seconds=60)  # 60-second window

# Feature Engineering: compute the three new feature columns (one value per row of df).
(packet_inter_arrival_times,
 packet_duration_ratios,
 syn_packet_frequencies) = engineer_traffic_features(df, window=time_window_duration)


In [5]:
# Attach each engineered feature list to the DataFrame under its column name.
engineered_columns = {
    'packet_inter_arrival_time': packet_inter_arrival_times,
    'packet_duration_ratio': packet_duration_ratios,
    'syn_packet_frequency': syn_packet_frequencies,
}
for column_name, column_values in engineered_columns.items():
    df[column_name] = column_values

# Persist the enriched dataset (without the index column) and confirm to the reader.
df.to_csv("synthetic_network_traffic_with_features.csv", index=False)
print("Updated dataset with engineered features saved as 'synthetic_network_traffic_with_features.csv'.")

Updated dataset with engineered features saved as 'synthetic_network_traffic_with_features.csv'.
