In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

In [2]:
# Dataset dimensions for the synthetic traffic data.
num_samples = 10_000  # number of rows to generate
anomaly_ratio = 0.05  # fraction of rows to turn into anomalies (0.05 == 5%)
num_anomalies = int(anomaly_ratio * num_samples)  # whole number of anomalous rows

# Seed both the stdlib and NumPy global generators so that every random draw
# made later in the notebook is repeatable from a fresh kernel.
random.seed(42)
np.random.seed(42)

In [3]:
# Fixed reference instant for the first sample. A constant is used instead of
# datetime.now() so that the timestamp column is identical on every run, in
# line with the seeded random columns. The specific date is arbitrary.
start_time = datetime(2024, 1, 1)
# One timestamp per sample, spaced exactly one second apart.
timestamps = [start_time + timedelta(seconds=i) for i in range(num_samples)]

# Generate normal data
# The dict entries are evaluated top to bottom, so the key order below is also
# the order in which the global NumPy random stream is consumed; keep it stable
# if the exact generated values matter.
data = {
    "timestamp": timestamps,
    "packet_size": np.random.normal(loc=500, scale=100, size=num_samples),  # Average packet size of 500 bytes
    "connection_duration": np.random.normal(loc=2, scale=0.5, size=num_samples),  # Average duration of 2 seconds
    # randint's upper bound is exclusive, so the last octet falls in 1..254.
    "src_ip": [f"192.168.1.{np.random.randint(1, 255)}" for _ in range(num_samples)],
    "dst_ip": [f"10.0.0.{np.random.randint(1, 255)}" for _ in range(num_samples)],
    "label": [0] * num_samples  # Start with all normal labels
}

In [4]:
# Turn the generated columns into a DataFrame (one row per sample).
df = pd.DataFrame(data)

# Pick, without replacement, the rows that will be converted into anomalies.
anomaly_indices = np.random.choice(df.index, size=num_anomalies, replace=False)

# Draw the replacement values for those rows: packet sizes first, then
# durations, so the random stream is consumed in a fixed order.
anomalous_packet_sizes = np.random.normal(1500, 300, num_anomalies)  # larger packets
anomalous_durations = np.random.normal(10, 3, num_anomalies)  # longer connections

# Overwrite the selected rows and flag them with label 1.
df.loc[anomaly_indices, "packet_size"] = anomalous_packet_sizes
df.loc[anomaly_indices, "connection_duration"] = anomalous_durations
df.loc[anomaly_indices, "label"] = 1

# A normal draw can be negative; floor both numeric columns at zero.
for column in ("packet_size", "connection_duration"):
    df[column] = df[column].clip(lower=0)

# Write the dataset to disk (no index column) and report where it went.
df.to_csv("synthetic_network_traffic.csv", index=False)
print("Synthetic network traffic dataset saved as 'synthetic_network_traffic.csv'.")

Synthetic network traffic dataset saved as 'synthetic_network_traffic.csv'.
