<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/office_broadband_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pytz

In [2]:
# Set timezone for India
IST = pytz.timezone('Asia/Kolkata')

def get_ist_time(start_dt, duration_days):
    """Generate realistic office time patterns - 8am-8pm weighted for work hours"""
    work_hours = list(range(9, 18))  # Core work hours 9am-6pm
    extended_hours = list(range(8, 9)) + list(range(18, 21))  # Extended hours

    if random.random() < 0.7:  # 70% chance of core work hours
        hour = random.choice(work_hours)
    elif random.random() < 0.9:  # 20% chance of extended hours
        hour = random.choice(extended_hours)
    else:  # 10% chance of odd hours (anomalous)
        hour = random.choice(list(range(0, 8)) + list(range(21, 24)))

    day_offset = random.randint(0, duration_days - 1)
    base = start_dt + timedelta(days=day_offset)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    naive = base.replace(hour=hour, minute=minute, second=second)
    return IST.localize(naive)

In [3]:
def generate_realistic_dest_ip(service_type):
    """Generate realistic destination IPs based on service type and real IP ranges"""
    # Real IP ranges for major services (based on research)
    ip_ranges = {
        "web_browsing": [
            "74.125.{}.{}",    # Google
            "173.194.{}.{}",   # Google
            "208.80.{}.{}",    # Wikipedia
            "151.101.{}.{}"    # Reddit/Stack Overflow CDN
        ],
        "email": [
            "74.125.{}.{}",    # Gmail
            "40.96.{}.{}",     # Outlook
            "173.194.{}.{}"    # Gmail
        ],
        "office_suite": [
            "40.126.{}.{}",    # Microsoft Office 365
            "52.96.{}.{}",     # Microsoft
            "74.125.{}.{}",    # Google Docs
            "173.194.{}.{}"    # Google Drive
        ],
        "messaging": [
            "54.230.{}.{}",    # Slack (AWS)
            "52.84.{}.{}",     # Teams
            "149.154.{}.{}"    # Telegram
        ],
        "video_meetings": [
            "8.5.{}.{}",       # Zoom
            "162.255.{}.{}",   # Zoom
            "40.114.{}.{}",    # Teams
            "74.125.{}.{}"     # Google Meet
        ],
        "source_code": [
            "140.82.{}.{}",    # GitHub
            "192.30.{}.{}",    # GitHub
            "185.199.{}.{}"    # GitHub
        ],
        "cloud_storage": [
            "162.125.{}.{}",   # Dropbox
            "40.126.{}.{}",    # OneDrive
            "74.125.{}.{}"     # Google Drive
        ]
    }
    if service_type in ip_ranges:
        ip_template = random.choice(ip_ranges[service_type])
        return ip_template.format(
            random.randint(0, 255),
            random.randint(1, 254)
        )
    else:
        # Generic internet IP
        return f"{random.choice([74, 173, 162, 151])}.{random.randint(1, 254)}.{random.randint(1, 254)}.{random.randint(1, 254)}"

In [4]:
def generate_broadband_src_ip():
    """Generate realistic Indian broadband user IP ranges"""
    # Based on major Indian ISP ranges
    indian_ranges = [
        "49.{}.{}.{}",      # Maharashtra/Mumbai region
        "14.{}.{}.{}",      # Various Indian ISPs
        "117.{}.{}.{}",     # Airtel/BSNL ranges
        "203.{}.{}.{}",     # TATA/VSNL ranges
        "182.{}.{}.{}"      # Reliance/Jio ranges
    ]

    ip_template = random.choice(indian_ranges)
    return ip_template.format(
        random.randint(1, 254),
        random.randint(1, 254),
        random.randint(1, 254)
    )

In [5]:
def generate_mac_address():
    """Generate realistic MAC address for broadband modem/router"""
    # Common vendor prefixes for broadband equipment
    vendor_prefixes = [
        "00:1B:2F",  # Cisco
        "00:50:56",  # VMware
        "00:0C:29",  # VMware
        "08:00:27",  # VirtualBox
        "52:54:00",  # QEMU
        "00:15:5D",  # Microsoft Hyper-V
        "00:1C:42",  # Parallels
        "00:03:FF"   # Microsoft
    ]

    prefix = random.choice(vendor_prefixes)
    suffix = f"{random.randint(0, 255):02X}:{random.randint(0, 255):02X}:{random.randint(0, 255):02X}"
    return f"{prefix}:{suffix}"

In [8]:
def inject_anomalies(records, anomaly_ratio=0.05):
    """Inject realistic anomalies for testing model performance"""
    num_anomalies = int(len(records) * anomaly_ratio)
    anomaly_indices = random.sample(range(len(records)), num_anomalies)

    for idx in anomaly_indices:
        anomaly_type = random.choice(['data_spike', 'very_short_session', 'very_long_session', 'odd_timing', 'rare_ip'])

        if anomaly_type == 'data_spike':
            # Unusual data volume spike (10x normal)
            records[idx]['uplink_volume'] *= 10
            records[idx]['downlink_volume'] *= 15
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'data_spike'

        elif anomaly_type == 'very_short_session':
            # Unusually short session (< 10 seconds)
            records[idx]['session_duration'] = random.randint(1, 9)
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'very_short_session'

        elif anomaly_type == 'very_long_session':
            # Unusually long session (> 4 hours)
            records[idx]['session_duration'] = random.randint(14400, 28800)  # 4-8 hours
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'very_long_session'

        elif anomaly_type == 'odd_timing':
            # Access during very unusual hours (2am-5am)
            odd_hour = random.randint(2, 5)
            original_time = datetime.strptime(records[idx]['start_time'], "%Y-%m-%d %H:%M:%S")
            new_time = original_time.replace(hour=odd_hour)
            records[idx]['start_time'] = new_time.strftime("%Y-%m-%d %H:%M:%S")
            records[idx]['end_time'] = (new_time + timedelta(seconds=records[idx]['session_duration'])).strftime("%Y-%m-%d %H:%M:%S")
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'odd_timing'

        elif anomaly_type == 'rare_ip':
            # Connection to suspicious/rare IP ranges
            suspicious_ips = [
                "185.220.{}.{}",     # 2 placeholders - Tor exit nodes
                "198.96.{}.{}",      # 2 placeholders - Suspicious ranges
                "45.33.{}.{}",       # 2 placeholders - Suspicious ranges
                "104.244.{}.{}",     # 2 placeholders - Suspicious ranges
                "192.42.{}.{}"       # 2 placeholders - Suspicious ranges
            ]
            ip_template = random.choice(suspicious_ips)
            records[idx]['dest_ip'] = ip_template.format(
                random.randint(1, 254),
                random.randint(1, 254)
            )
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'rare_ip'

    return records

In [9]:
def generate_broadband_office_ipdr_dataset(
    num_records=10000,
    start_date="2024-01-01",
    duration_days=30,
    seed=42
):
    """
    Generate realistic broadband office IPDR dataset with:
    - Only IPDR-valid columns
    - Realistic IP ranges for Indian broadband and destination services
    - Injected anomalies for model testing
    - Proper time patterns for office environment
    """
    random.seed(seed)
    np.random.seed(seed)
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")

    # Office network services with realistic patterns
    office_services = [
        {
            "service_name": "Web Browsing",
            "service_type": "web_browsing",
            "protocol": "TCP",
            "ports": [443, 80],
            "duration_range_sec": (60, 600),
            "uplink_range": (50000, 500000),      # 50KB - 500KB
            "downlink_range": (100000, 5000000)   # 100KB - 5MB
        },
        {
            "service_name": "Email Access",
            "service_type": "email",
            "protocol": "TCP",
            "ports": [443, 993, 587],
            "duration_range_sec": (60, 300),
            "uplink_range": (100000, 1000000),    # 100KB - 1MB
            "downlink_range": (200000, 2000000)   # 200KB - 2MB
        },
        {
            "service_name": "Office Suite",
            "service_type": "office_suite",
            "protocol": "TCP",
            "ports": [443],
            "duration_range_sec": (300, 1800),
            "uplink_range": (200000, 10000000),   # 200KB - 10MB
            "downlink_range": (500000, 20000000)  # 500KB - 20MB
        },
        {
            "service_name": "Video Meetings",
            "service_type": "video_meetings",
            "protocol": random.choice(["TCP", "UDP"]),
            "ports": [443, 3478, 19302],
            "duration_range_sec": (600, 3600),
            "uplink_range": (5000000, 50000000),   # 5MB - 50MB
            "downlink_range": (10000000, 100000000) # 10MB - 100MB
        },
        {
            "service_name": "Messaging",
            "service_type": "messaging",
            "protocol": "TCP",
            "ports": [443, 5222],
            "duration_range_sec": (60, 900),
            "uplink_range": (100000, 2000000),    # 100KB - 2MB
            "downlink_range": (300000, 5000000)   # 300KB - 5MB
        },
        {
            "service_name": "Source Code Management",
            "service_type": "source_code",
            "protocol": "TCP",
            "ports": [443, 22],
            "duration_range_sec": (120, 900),
            "uplink_range": (500000, 5000000),    # 500KB - 5MB
            "downlink_range": (1000000, 15000000) # 1MB - 15MB
        },
        {
            "service_name": "Cloud Storage",
            "service_type": "cloud_storage",
            "protocol": "TCP",
            "ports": [443],
            "duration_range_sec": (300, 1200),
            "uplink_range": (1000000, 20000000),  # 1MB - 20MB
            "downlink_range": (1000000, 50000000) # 1MB - 50MB
        }
    ]

    # Generate unique user identifiers (account IDs for broadband)
    user_accounts = [f"broadband_user_{i:04d}" for i in range(1, 101)]

    records = []
    for i in range(num_records):
        # Select service and generate session
        service = random.choice(office_services)
        dest_port = random.choice(service["ports"])
        session_duration = random.randint(*service["duration_range_sec"])
        uplink_volume = random.randint(*service["uplink_range"])
        downlink_volume = random.randint(*service["downlink_range"])

        # Generate timing
        start_time = get_ist_time(start_dt, duration_days)
        end_time = start_time + timedelta(seconds=session_duration)

        # Generate IPs and identifiers
        src_ip = generate_broadband_src_ip()
        dest_ip = generate_realistic_dest_ip(service["service_type"])
        user_account = random.choice(user_accounts)
        mac_address = generate_mac_address()

        # Create IPDR record with only standard IPDR fields
        record = {
            # Mandatory IPDR fields always present
            "user_account_id": user_account,           # Broadband account ID
            "src_ip": src_ip,                          # Source IP (customer)
            "src_port": random.randint(1024, 65535),   # Source port
            "dest_ip": dest_ip,                        # Destination IP (service)
            "dest_port": dest_port,                    # Destination port
            "protocol": service["protocol"],            # Protocol (TCP/UDP)
            "start_time": start_time.strftime("%Y-%m-%d %H:%M:%S"),    # Session start
            "end_time": end_time.strftime("%Y-%m-%d %H:%M:%S"),        # Session end
            "session_duration": session_duration,      # Duration in seconds
            "uplink_volume": uplink_volume,            # Bytes uploaded
            "downlink_volume": downlink_volume,        # Bytes downloaded
            "total_volume": uplink_volume + downlink_volume,  # Total bytes
            "mac_address": mac_address,                # Device MAC address

            # Labels for anomaly detection testing (not in real IPDR)
            "is_anomaly": 0,                          # Label for model testing
            "anomaly_type": "normal"                   # Type of anomaly
        }

        records.append(record)

    # Inject realistic anomalies for model testing
    records = inject_anomalies(records, anomaly_ratio=0.05)  # 5% anomalies

    # Convert to DataFrame
    df = pd.DataFrame(records)

    # Add some derived features common in IPDR analysis
    df['bytes_per_second'] = df['total_volume'] / df['session_duration']
    df['uplink_ratio'] = df['uplink_volume'] / df['total_volume']
    df['hour'] = pd.to_datetime(df['start_time']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['start_time']).dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    return df

# Generate the dataset
if __name__ == "__main__":
    # Generate broadband office IPDR dataset
    df_office = generate_broadband_office_ipdr_dataset(
        num_records=10000,
        start_date="2024-01-01",
        duration_days=30,
        seed=42
    )

    print("=== BROADBAND OFFICE IPDR DATASET ===")
    print(f"Total records: {len(df_office)}")
    print(f"Anomalies: {df_office['is_anomaly'].sum()} ({df_office['is_anomaly'].mean()*100:.1f}%)")
    print(f"Columns: {list(df_office.columns)}")
    print(f"Date range: {df_office['start_time'].min()} to {df_office['start_time'].max()}")
    print("\nFirst 5 records:")
    print(df_office.head())

    print("\nAnomaly breakdown:")
    print(df_office['anomaly_type'].value_counts())

    print("\nIP ranges sample:")
    print("Source IPs (Indian broadband):", df_office['src_ip'].head().tolist())
    print("Dest IPs (Services):", df_office['dest_ip'].head().tolist())

    # Save the dataset
    df_office.to_csv("broadband_office_ipdr_dataset.csv", index=False)
    print(f"\nDataset saved as 'broadband_office_ipdr_dataset.csv'")

=== BROADBAND OFFICE IPDR DATASET ===
Total records: 10000
Anomalies: 500 (5.0%)
Columns: ['user_account_id', 'src_ip', 'src_port', 'dest_ip', 'dest_port', 'protocol', 'start_time', 'end_time', 'session_duration', 'uplink_volume', 'downlink_volume', 'total_volume', 'mac_address', 'is_anomaly', 'anomaly_type', 'bytes_per_second', 'uplink_ratio', 'hour', 'day_of_week', 'is_weekend']
Date range: 2024-01-01 02:38:07 to 2024-01-30 22:39:55

First 5 records:
       user_account_id          src_ip  src_port          dest_ip  dest_port  \
0  broadband_user_0084     49.24.56.60     54070   74.125.101.184         80   
1  broadband_user_0045    117.27.24.98     64798  162.125.183.217        443   
2  broadband_user_0030    49.12.170.59     42684    40.114.40.219        443   
3  broadband_user_0060   49.156.163.44     56261    74.125.125.42        443   
4  broadband_user_0034  117.55.168.128     59863    40.126.234.37        443   

  protocol           start_time             end_time  session_