<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/IoT_IPDR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pytz

In [2]:
# Set timezone for India
IST = pytz.timezone('Asia/Kolkata')

def get_iot_time_pattern(start_dt, duration_days, device_type):
    """
    Generate realistic IoT device time patterns based on device type:
    - Sensors: Periodic updates every few minutes/hours
    - Smart home: Event-driven + periodic (peak evening usage)
    - Industrial: 24/7 with maintenance windows
    """
    day_offset = random.randint(0, duration_days - 1)
    base = start_dt + timedelta(days=day_offset)

    if device_type in ['temperature_sensor', 'humidity_sensor', 'pressure_sensor']:
        # Periodic sensors: uniform distribution across 24/7
        hour = random.randint(0, 23)

    elif device_type in ['smart_light', 'smart_plug', 'smart_camera', 'motion_sensor']:
        # Smart home: peak evening usage, some daytime activity
        if random.random() < 0.50:  # 50% evening/night
            hour = random.choice(list(range(18, 24)) + list(range(0, 2)))
        elif random.random() < 0.75:  # 25% daytime
            hour = random.choice(list(range(8, 18)))
        else:  # 25% other times
            hour = random.choice(list(range(2, 8)))

    elif device_type in ['industrial_sensor', 'gateway', 'actuator']:
        # Industrial: 24/7 with slight reduction during maintenance (2-5 AM)
        if random.random() < 0.10:  # 10% maintenance window
            hour = random.randint(2, 5)
        else:  # 90% normal operation
            hour = random.randint(0, 23)
    else:
        # Default: uniform distribution
        hour = random.randint(0, 23)

    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    naive = base.replace(hour=hour, minute=minute, second=second)
    return IST.localize(naive)

In [3]:
def generate_iot_device_ip(network_type):
    """
    Generate realistic IoT device IP addresses based on network type
    """
    if network_type == 'home_wifi':
        # Home WiFi networks (private ranges)
        return f"192.168.{random.randint(1, 10)}.{random.randint(10, 254)}"

    elif network_type == 'industrial':
        # Industrial networks (larger private ranges)
        return f"10.{random.randint(0, 50)}.{random.randint(1, 254)}.{random.randint(1, 254)}"

    elif network_type == 'cellular_iot':
        # Cellular IoT (carrier-grade NAT)
        carriers = ['100.64', '100.65', '100.66', '100.67']  # CGNAT ranges
        prefix = random.choice(carriers)
        return f"{prefix}.{random.randint(1, 254)}.{random.randint(1, 254)}"

    else:
        # Campus/enterprise
        return f"172.16.{random.randint(1, 31)}.{random.randint(1, 254)}"

In [4]:
def generate_cloud_dest_ip(service_type):
    """
    Generate realistic cloud service destination IPs for IoT communication
    """
    # Real IP ranges for major IoT cloud services
    ip_ranges = {
        "aws_iot": [
            "52.{}.{}.{}",       # AWS IoT Core
            "54.{}.{}.{}",       # AWS IoT Device Management
            "13.{}.{}.{}",       # AWS IoT Greengrass
        ],
        "azure_iot": [
            "40.{}.{}.{}",       # Azure IoT Hub
            "52.{}.{}.{}",       # Azure IoT Central
            "13.{}.{}.{}",       # Azure IoT Edge
        ],
        "google_iot": [
            "74.125.{}.{}",      # Google Cloud IoT Core
            "142.250.{}.{}",     # Google Cloud Platform
            "216.58.{}.{}",      # Google Services
        ],
        "industrial_scada": [
            "203.{}.{}.{}",      # Industrial control systems
            "149.{}.{}.{}",      # SCADA networks
        ]
    }

    if service_type in ip_ranges:
        ip_template = random.choice(ip_ranges[service_type])
        return ip_template.format(
            random.randint(1, 254),
            random.randint(1, 254),
            random.randint(1, 254)
        )
    else:
        # Generic cloud service
        return f"52.{random.randint(1, 254)}.{random.randint(1, 254)}.{random.randint(1, 254)}"

In [5]:
def generate_iot_device_identifiers(device_type):
    """
    Generate realistic IoT device identifiers (MAC addresses and device IDs)
    """
    # Common IoT device manufacturer MAC prefixes
    mac_prefixes = {
        'temperature_sensor': ['B8:27:EB', '00:16:3E', 'DC:A6:32'],  # Raspberry Pi, Arduino, ESP32
        'smart_light': ['D8:F1:5B', '50:02:91', '24:6F:28'],        # Philips Hue, TP-Link, Xiaomi
        'smart_camera': ['00:62:6E', 'AC:BC:32', '2C:AA:8E'],       # Hikvision, Dahua, Nest
        'smart_plug': ['50:C7:BF', '24:6F:28', '68:C6:3A'],         # TP-Link, Xiaomi, Amazon
        'motion_sensor': ['28:CC:CD', '00:15:8D', 'E0:E2:E6'],      # Samsung SmartThings, Bosch
        'industrial_sensor': ['00:0C:29', '00:50:56', '00:1B:21'],  # VMware, Siemens, ABB
        'gateway': ['00:11:32', 'B8:AE:ED', '00:04:A3'],            # Synopsys, Intel, Cisco
    }

    if device_type in mac_prefixes:
        prefix = random.choice(mac_prefixes[device_type])
    else:
        # Generic IoT device prefixes
        prefix = random.choice(['B8:27:EB', '00:16:3E', 'DC:A6:32'])

    # Generate remaining 3 bytes
    suffix = f"{random.randint(0, 255):02X}:{random.randint(0, 255):02X}:{random.randint(0, 255):02X}"
    mac_address = f"{prefix}:{suffix}"

    # Generate device ID
    device_id = f"{device_type}_{random.randint(1000, 9999)}"

    return device_id, mac_address

def inject_iot_anomalies(records, anomaly_ratio=0.05):
    """
    Inject realistic IoT device anomalies:
    - Botnet communication (C&C, DDoS)
    - Device malfunction (sensor drift, network issues)
    - Unauthorized access attempts
    - Firmware update anomalies
    """
    num_anomalies = int(len(records) * anomaly_ratio)
    anomaly_indices = random.sample(range(len(records)), num_anomalies)

    for idx in anomaly_indices:
        anomaly_type = random.choice([
            'botnet_cc_communication',      # Command & control communication
            'ddos_participation',           # Participating in DDoS attack
            'device_malfunction',           # Sensor/device malfunction
            'unauthorized_access',          # Brute force, unauthorized connections
            'firmware_update_anomaly'       # Suspicious firmware activity
        ])

        if anomaly_type == 'botnet_cc_communication':
            # Small, frequent communications to suspicious IPs
            records[idx]['dest_ip'] = f"45.{random.randint(32, 63)}.{random.randint(1, 254)}.{random.randint(1, 254)}"
            records[idx]['dest_port'] = random.choice([6667, 8080, 443, 53])  # IRC, HTTP, DNS over HTTPS
            records[idx]['session_duration'] = random.randint(1, 30)  # Very short sessions
            records[idx]['uplink_volume'] = random.randint(100, 1000)    # Small commands
            records[idx]['downlink_volume'] = random.randint(50, 500)     # Small responses
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'botnet_cc_communication'

        elif anomaly_type == 'ddos_participation':
            # High volume, short duration attacks
            records[idx]['dest_ip'] = f"203.{random.randint(1, 254)}.{random.randint(1, 254)}.{random.randint(1, 254)}"
            records[idx]['dest_port'] = random.choice([80, 443, 53, 25])   # Common targets
            records[idx]['uplink_volume'] *= 50   # Massive outbound traffic
            records[idx]['downlink_volume'] = random.randint(0, 100)  # Little response
            records[idx]['session_duration'] = random.randint(1, 10)   # Very short bursts
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'ddos_participation'

        elif anomaly_type == 'device_malfunction':
            # Unusual data patterns - too frequent or no communication
            if random.choice([True, False]):
                # Too frequent communication (sensor malfunction)
                records[idx]['session_duration'] = random.randint(1, 5)
                records[idx]['uplink_volume'] *= 10
            else:
                # Very long periods of no communication then sudden burst
                records[idx]['session_duration'] = random.randint(3600, 7200)  # 1-2 hours
                records[idx]['uplink_volume'] *= 20
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'device_malfunction'

        elif anomaly_type == 'unauthorized_access':
            # Multiple connection attempts, port scanning
            records[idx]['dest_port'] = random.choice([22, 23, 21, 3389])  # SSH, Telnet, FTP, RDP
            records[idx]['session_duration'] = random.randint(1, 5)  # Quick attempts
            records[idx]['uplink_volume'] = random.randint(50, 200)   # Small probes
            records[idx]['downlink_volume'] = random.randint(0, 50)   # Failed connections
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'unauthorized_access'

        elif anomaly_type == 'firmware_update_anomaly':
            # Unexpected large downloads from suspicious sources
            records[idx]['dest_ip'] = f"185.{random.randint(1, 254)}.{random.randint(1, 254)}.{random.randint(1, 254)}"
            records[idx]['dest_port'] = random.choice([80, 443, 8080])
            records[idx]['downlink_volume'] *= 100  # Large download
            records[idx]['session_duration'] = random.randint(300, 1800)  # 5-30 minutes
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'firmware_update_anomaly'

    return records

def generate_iot_device_network_ipdr_dataset(
    num_records=10000,
    start_date="2024-01-01",
    duration_days=30,
    seed=42
):
    """
    Generate realistic IoT device network IPDR dataset:
    - Only IPDR-valid columns
    - Realistic IoT communication patterns (periodic, event-driven, small volumes)
    - Multiple device types (sensors, smart home, industrial)
    - Real IoT protocols and cloud service IPs
    - Injected anomalies for botnet, malfunction, and security issues
    """
    random.seed(seed)
    np.random.seed(seed)
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")

    # IoT device types with realistic communication patterns
    iot_device_types = [
        {
            "device_type": "temperature_sensor",
            "network_type": "home_wifi",
            "cloud_service": "aws_iot",
            "protocol": random.choice(["TCP", "UDP"]),
            "ports": [1883, 8883, 443],  # MQTT, MQTTS, HTTPS
            "session_duration_range": (5, 30),     # Very short sessions
            "uplink_range": (50, 500),             # 50B - 500B (sensor readings)
            "downlink_range": (20, 200),           # 20B - 200B (acknowledgments)
            "usage_weight": 0.30,  # 30% of IoT traffic
            "update_interval": 300   # Every 5 minutes
        },
        {
            "device_type": "smart_light",
            "network_type": "home_wifi",
            "cloud_service": "google_iot",
            "protocol": "TCP",
            "ports": [443, 1883, 5683],  # HTTPS, MQTT, CoAP
            "session_duration_range": (2, 15),     # Very short control commands
            "uplink_range": (30, 300),             # 30B - 300B (on/off, dimming)
            "downlink_range": (20, 150),           # 20B - 150B (status)
            "usage_weight": 0.20,  # 20% of IoT traffic
            "update_interval": 600   # Every 10 minutes or event-driven
        },
        {
            "device_type": "smart_camera",
            "network_type": "home_wifi",
            "cloud_service": "aws_iot",
            "protocol": "TCP",
            "ports": [443, 554, 1935],  # HTTPS, RTSP, RTMP
            "session_duration_range": (30, 300),   # Longer for video streaming
            "uplink_range": (100000, 5000000),     # 100KB - 5MB (video upload)
            "downlink_range": (1000, 50000),       # 1KB - 50KB (commands)
            "usage_weight": 0.15,  # 15% of IoT traffic
            "update_interval": 60    # Continuous streaming
        },
        {
            "device_type": "motion_sensor",
            "network_type": "home_wifi",
            "cloud_service": "azure_iot",
            "protocol": random.choice(["TCP", "UDP"]),
            "ports": [1883, 5683, 443],  # MQTT, CoAP, HTTPS
            "session_duration_range": (1, 10),     # Very quick event triggers
            "uplink_range": (20, 100),             # 20B - 100B (motion detected)
            "downlink_range": (10, 80),            # 10B - 80B (acknowledgment)
            "usage_weight": 0.10,  # 10% of IoT traffic
            "update_interval": 1800  # Event-driven + periodic heartbeat
        },
        {
            "device_type": "industrial_sensor",
            "network_type": "industrial",
            "cloud_service": "industrial_scada",
            "protocol": "TCP",
            "ports": [502, 44818, 443],  # Modbus, EtherNet/IP, HTTPS
            "session_duration_range": (10, 60),    # Regular monitoring
            "uplink_range": (100, 2000),           # 100B - 2KB (sensor data)
            "downlink_range": (50, 1000),          # 50B - 1KB (control commands)
            "usage_weight": 0.15,  # 15% of IoT traffic
            "update_interval": 60    # Every minute
        },
        {
            "device_type": "smart_plug",
            "network_type": "home_wifi",
            "cloud_service": "google_iot",
            "protocol": "TCP",
            "ports": [443, 1883],  # HTTPS, MQTT
            "session_duration_range": (3, 20),     # Quick on/off commands
            "uplink_range": (40, 400),             # 40B - 400B (power status)
            "downlink_range": (30, 300),           # 30B - 300B (control)
            "usage_weight": 0.10,  # 10% of IoT traffic
            "update_interval": 900   # Every 15 minutes
        }
    ]

    # Generate unique device pool
    num_devices = 200
    devices = []
    for i in range(num_devices):
        device_profile = random.choice(iot_device_types)
        device_id, mac_address = generate_iot_device_identifiers(device_profile["device_type"])
        src_ip = generate_iot_device_ip(device_profile["network_type"])

        devices.append({
            'device_id': device_id,
            'mac_address': mac_address,
            'src_ip': src_ip,
            'device_type': device_profile["device_type"],
            'network_type': device_profile["network_type"],
            'cloud_service': device_profile["cloud_service"]
        })

    records = []
    for i in range(num_records):
        # Weight device selection based on usage patterns
        weights = [d["usage_weight"] for d in iot_device_types]
        device_profile = random.choices(iot_device_types, weights=weights)[0]

        # Select device from pool
        matching_devices = [d for d in devices if d['device_type'] == device_profile["device_type"]]
        device = random.choice(matching_devices)

        # Generate session parameters based on device type
        dest_port = random.choice(device_profile["ports"])
        session_duration = random.randint(*device_profile["session_duration_range"])
        uplink_volume = random.randint(*device_profile["uplink_range"])
        downlink_volume = random.randint(*device_profile["downlink_range"])

        # Generate timing based on device behavior
        start_time = get_iot_time_pattern(start_dt, duration_days, device_profile["device_type"])
        end_time = start_time + timedelta(seconds=session_duration)

        # Generate destination IP based on cloud service
        dest_ip = generate_cloud_dest_ip(device_profile["cloud_service"])

        # Create IoT IPDR record
        record = {
            # Mandatory IPDR fields for IoT networks
            "device_id": device['device_id'],          # IoT device identifier
            "src_ip": device['src_ip'],                # Device IP address
            "src_port": random.randint(1024, 65535),   # Source port
            "dest_ip": dest_ip,                        # Cloud service IP
            "dest_port": dest_port,                    # Service port
            "protocol": device_profile["protocol"],    # TCP/UDP
            "start_time": start_time.strftime("%Y-%m-%d %H:%M:%S"),    # Session start
            "end_time": end_time.strftime("%Y-%m-%d %H:%M:%S"),        # Session end
            "session_duration": session_duration,      # Duration in seconds
            "uplink_volume": uplink_volume,            # Bytes uploaded (sensor data)
            "downlink_volume": downlink_volume,        # Bytes downloaded (commands)
            "total_volume": uplink_volume + downlink_volume,  # Total bytes
            "mac_address": device['mac_address'],      # Device MAC address
            "device_type": device_profile["device_type"],  # For analysis (not real IPDR)
            "network_type": device_profile["network_type"], # For analysis

            # Labels for anomaly detection testing (not in real IPDR)
            "is_anomaly": 0,                          # Anomaly label
            "anomaly_type": "normal"                   # Anomaly type
        }

        records.append(record)

    # Inject realistic IoT anomalies
    records = inject_iot_anomalies(records, anomaly_ratio=0.05)

    # Convert to DataFrame
    df = pd.DataFrame(records)

    # Add derived features for IoT analysis
    df['bytes_per_second'] = df['total_volume'] / df['session_duration']
    df['uplink_ratio'] = df['uplink_volume'] / df['total_volume']
    df['hour'] = pd.to_datetime(df['start_time']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['start_time']).dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_night_hours'] = ((df['hour'] >= 22) | (df['hour'] <= 6)).astype(int)
    df['is_micro_session'] = (df['session_duration'] <= 10).astype(int)  # Very short IoT sessions
    df['data_efficiency'] = df['total_volume'] / df['session_duration']   # Bytes per second efficiency

    return df

# Generate the dataset
if __name__ == "__main__":
    # Generate IoT device network IPDR dataset
    df_iot_network = generate_iot_device_network_ipdr_dataset(
        num_records=10000,
        start_date="2024-01-01",
        duration_days=30,
        seed=42
    )

    print("=== IOT DEVICE NETWORK IPDR DATASET ===")
    print(f"Total records: {len(df_iot_network)}")
    print(f"Anomalies: {df_iot_network['is_anomaly'].sum()} ({df_iot_network['is_anomaly'].mean()*100:.1f}%)")
    print(f"Columns: {list(df_iot_network.columns)}")
    print(f"Date range: {df_iot_network['start_time'].min()} to {df_iot_network['start_time'].max()}")

    print("\nDevice type distribution:")
    print(df_iot_network['device_type'].value_counts())

    print("\nNetwork type distribution:")
    print(df_iot_network['network_type'].value_counts())

    print("\nAnomaly breakdown:")
    print(df_iot_network['anomaly_type'].value_counts())

    print("\nIoT traffic characteristics:")
    print(f"Average session duration: {df_iot_network['session_duration'].mean():.1f} seconds")
    print(f"Average data volume: {df_iot_network['total_volume'].mean():.0f} bytes")
    print(f"Micro-sessions (≤10s): {df_iot_network['is_micro_session'].sum()}")

    print("\nData volume statistics:")
    df_iot_network['total_kb'] = df_iot_network['total_volume'] / 1024
    print(f"Average per session: {df_iot_network['total_kb'].mean():.2f} KB")
    print(f"95th percentile: {df_iot_network['total_kb'].quantile(0.95):.2f} KB")
    print(f"Max session: {df_iot_network['total_kb'].max():.2f} KB")

    print("\nSample IoT device records:")
    sample = df_iot_network[['device_id', 'device_type', 'src_ip', 'dest_ip', 'total_kb']].head(3)
    for _, row in sample.iterrows():
        print(f"Device: {row['device_id']} ({row['device_type']}), "
              f"Data: {row['total_kb']:.2f}KB, Route: {row['src_ip']} -> {row['dest_ip']}")

    # Save the dataset
    df_iot_network.to_csv("iot_device_network_ipdr_dataset.csv", index=False)
    print(f"\nDataset saved as 'iot_device_network_ipdr_dataset.csv'")

=== IOT DEVICE NETWORK IPDR DATASET ===
Total records: 10000
Anomalies: 500 (5.0%)
Columns: ['device_id', 'src_ip', 'src_port', 'dest_ip', 'dest_port', 'protocol', 'start_time', 'end_time', 'session_duration', 'uplink_volume', 'downlink_volume', 'total_volume', 'mac_address', 'device_type', 'network_type', 'is_anomaly', 'anomaly_type', 'bytes_per_second', 'uplink_ratio', 'hour', 'day_of_week', 'is_weekend', 'is_night_hours', 'is_micro_session', 'data_efficiency']
Date range: 2024-01-01 00:07:54 to 2024-01-30 23:56:56

Device type distribution:
device_type
temperature_sensor    2993
smart_light           1945
industrial_sensor     1534
smart_camera          1496
smart_plug            1021
motion_sensor         1011
Name: count, dtype: int64

Network type distribution:
network_type
home_wifi     8466
industrial    1534
Name: count, dtype: int64

Anomaly breakdown:
anomaly_type
normal                     9500
unauthorized_access         105
firmware_update_anomaly     103
botnet_cc_commun