<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalies/blob/main/port_ip_activity_spikes_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

### Helper Functions

In [None]:
def random_ip(subnet_prefix='192.168.1.'):
    """Return ``subnet_prefix`` followed by a random final octet.

    Args:
        subnet_prefix: Leading part of the address, including the trailing
            dot (for example ``'192.168.1.'``).

    Returns:
        The prefix with an integer drawn uniformly from 1 to 254 (inclusive)
        appended to it.
    """
    host_octet = random.randint(1, 254)
    return "{}{}".format(subnet_prefix, host_octet)

In [None]:
def random_port(common_ports=None):
    """Return a port number, favouring ``common_ports`` when they are given.

    Args:
        common_ports: Optional non-empty sequence of well-known ports. When
            it is ``None`` or empty, the result is always a random high port.

    Returns:
        With probability 0.7 (and only if ``common_ports`` is truthy) a random
        element of ``common_ports``; otherwise an integer drawn uniformly from
        1024 to 65535 inclusive.
    """
    # The random draw happens only when there are common ports to pick from.
    use_common = bool(common_ports) and random.random() < 0.7
    if not use_common:
        return random.randint(1024, 65535)
    return random.choice(common_ports)

In [None]:
def random_protocol():
    """Return one of ``'TCP'``, ``'UDP'`` or ``'ICMP'``, chosen uniformly at random."""
    protocols = ('TCP', 'UDP', 'ICMP')
    return random.choice(protocols)

### Main Function

In [None]:
def generate_activity_spikes_dataset_enhanced(num_records=1000, spike_ratio=0.05, burst_length=5):
    """Build a synthetic network-traffic table containing bursts of spike traffic.

    One record is generated per second, starting at the current local time.
    Records that fall inside a planned burst share that burst's source IP and
    destination port and carry much larger byte and packet counts. All other
    records are normal traffic, roughly 2% of which is a milder "benign burst".

    Args:
        num_records: Total number of rows to generate.
        spike_ratio: Approximate fraction of rows that should belong to spike
            bursts. Bursts may overlap, so the actual fraction can be lower.
        burst_length: Number of consecutive rows in each spike burst. Bursts
            longer than the dataset are truncated at the last row.

    Returns:
        A pandas DataFrame with the columns ``Timestamp`` (ISO-8601 string),
        ``Source IP``, ``Destination IP``, ``Source Port``,
        ``Destination Port``, ``Protocol``, ``Bytes`` and ``Packets``.

    Raises:
        ValueError: If ``burst_length`` is less than 1.
    """
    if burst_length < 1:
        raise ValueError(f"burst_length must be a positive integer, got {burst_length!r}")

    base_time = datetime.now()

    # Common service ports for realism
    common_service_ports = [80, 443, 22, 21, 25, 53, 110, 143, 3389]

    # Small pool of hosts so the same addresses recur across records
    ip_pool = [f'192.168.1.{i}' for i in range(1, 21)]  # 20 IPs

    # Spikes are clustered on a subset of hosts and service ports
    spike_ips = random.sample(ip_pool, k=max(1, int(len(ip_pool) * 0.2)))  # 20% of IPs
    spike_ports = random.sample(
        common_service_ports, k=max(1, int(len(common_service_ports) * 0.3))
    )  # 30% of ports

    # Plan the bursts up front and index them by row so the main loop can look
    # up a row's burst directly instead of scanning every burst.
    num_bursts = int(num_records * spike_ratio / burst_length)
    # Clamp so a burst longer than the dataset still gets a valid start row.
    last_start = max(0, num_records - burst_length)
    burst_by_index = {}
    for _ in range(num_bursts):
        start_idx = random.randint(0, last_start)
        burst = (random.choice(spike_ips), random.choice(spike_ports))
        for idx in range(start_idx, start_idx + burst_length):
            # When bursts overlap, the burst planned first owns the row.
            burst_by_index.setdefault(idx, burst)

    data = []
    for i in range(num_records):
        timestamp = base_time + timedelta(seconds=i)
        burst = burst_by_index.get(i)

        if burst is not None:
            # Spike traffic: fixed source host and destination port, heavy volume
            src_ip, dst_port = burst
            dst_ip = random.choice(ip_pool)
            src_port = random_port(common_service_ports)
            bytes_sent = random.randint(10000, 50000)
            packets = random.randint(100, 500)
        else:
            # Normal traffic between random hosts in the pool
            src_ip = random.choice(ip_pool)
            dst_ip = random.choice(ip_pool)
            src_port = random_port(common_service_ports)
            dst_port = random_port(common_service_ports)
            if random.random() < 0.02:
                # Occasional benign burst: elevated but well below spike volume
                bytes_sent = random.randint(2000, 8000)
                packets = random.randint(20, 80)
            else:
                bytes_sent = random.randint(40, 1500)
                packets = random.randint(1, 10)

        protocol = random_protocol()

        data.append({
            'Timestamp': timestamp.isoformat(),
            'Source IP': src_ip,
            'Destination IP': dst_ip,
            'Source Port': src_port,
            'Destination Port': dst_port,
            'Protocol': protocol,
            'Bytes': bytes_sent,
            'Packets': packets,
        })

    return pd.DataFrame(data)

In [None]:
# Generate a 1000-record sample and write it to CSV without the index column.
output_path = 'port_ip_activity_spikes_dataset.csv'
sample_df = generate_activity_spikes_dataset_enhanced(1000)
sample_df.to_csv(output_path, index=False)
# Build the message from output_path so it always names the file actually written.
print(f"Activity spikes dataset saved to {output_path}")

Activity spikes dataset saved to port_ip_activity_spikes_dataset_enhanced.csv
