<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/Enterprise_secuity_ipdr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pytz

In [3]:
# India Standard Time zone object; used to localize generated session timestamps.
IST = pytz.timezone('Asia/Kolkata')

def get_enterprise_time_pattern(start_dt, duration_days):
    """
    Draw one random, IST-localized session start timestamp.

    The hour of day is sampled from one of three bands:
    - Core business hours: 08:00–19:59 (80% of draws)
    - Off-hours maintenance/updates: 20:00–23:59 and 05:00–07:59 (15%)
    - Odd hours: 00:00–04:59 (5%)

    The calendar day is picked uniformly among the `duration_days` days that
    begin at `start_dt`; minute and second are each uniform in 0–59.

    Args:
        start_dt: datetime marking the first day of the window (expected to
            be naive, since the result is passed to IST.localize).
        duration_days: number of days in the window; must be at least 1,
            otherwise random.randint raises ValueError.

    Returns:
        A timezone-aware datetime localized with IST.
    """
    # Pick the hour band first, then a concrete hour inside that band.
    roll = random.random()
    if roll < 0.80:
        candidate_hours = list(range(8, 20))
    elif roll < 0.95:
        candidate_hours = list(range(20, 24)) + list(range(5, 8))
    else:
        candidate_hours = list(range(0, 5))
    chosen_hour = random.choice(candidate_hours)

    # Day, minute and second are drawn in this order to keep seeded runs stable.
    chosen_day = start_dt + timedelta(days=random.randint(0, duration_days - 1))
    chosen_minute = random.randint(0, 59)
    chosen_second = random.randint(0, 59)

    stamped = chosen_day.replace(
        hour=chosen_hour, minute=chosen_minute, second=chosen_second
    )
    return IST.localize(stamped)

In [4]:
def generate_enterprise_src_ip(department):
    """
    Return a random private IPv4 address inside a department's subnet.

    Subnet prefixes:
    - Dept A: 10.10.x.x, Dept B: 10.20.x.x, Dept C: 10.30.x.x, Dept D: 10.40.x.x

    The third and fourth octets are drawn independently from 1–254.

    Args:
        department: one of "A", "B", "C", "D".

    Returns:
        Dotted-quad address string, e.g. "10.20.17.203".

    Raises:
        KeyError: if `department` is not one of the four known codes.
    """
    subnet_prefixes = {
        "A": "10.10",
        "B": "10.20",
        "C": "10.30",
        "D": "10.40",
    }
    prefix = subnet_prefixes[department]
    third_octet = random.randint(1, 254)
    fourth_octet = random.randint(1, 254)
    return ".".join((prefix, str(third_octet), str(fourth_octet)))

In [5]:
def generate_public_dest_ip(service_type):
    """
    Return a random destination IP for an external enterprise service.

    Address templates per service type:
    - "saas" (Office365, Salesforce): 52.x.x.x or 40.x.x.x
    - "cloud" (cloud infra): 34.x.x.x or 13.x.x.x
    - "vpn_partner" (Partner/VPN): 172.27.2.x
      (note: this range lies inside RFC 1918 private space, not public space)

    One template is chosen at random, then every placeholder octet is filled
    with an independent draw from 1–254.

    Args:
        service_type: "saas", "cloud" or "vpn_partner".

    Returns:
        Dotted-quad address string.

    Raises:
        KeyError: if `service_type` is not a known key.
    """
    templates_by_service = {
        "saas": ["52.{}.{}.{}", "40.{}.{}.{}"],
        "cloud": ["34.{}.{}.{}", "13.{}.{}.{}"],
        "vpn_partner": ["172.27.2.{}"],
    }
    template = random.choice(templates_by_service[service_type])

    # Walk the octets left to right, replacing each placeholder with a draw.
    filled = []
    for piece in template.split("."):
        if piece == "{}":
            filled.append(str(random.randint(1, 254)))
        else:
            filled.append(piece)
    return ".".join(filled)

In [6]:
def inject_enterprise_anomalies(records, anomaly_ratio=0.05):
    """
    Label a random subset of records as anomalies and mutate them in place.

    Each selected record gets `is_anomaly = 1` and one randomly chosen
    `anomaly_type`:
    - lateral_movement: src_ip is replaced by an address from a department
      subnet other than the record's own `department`
    - privilege_escalation: dest_ip is moved to the partner/VPN range and
      dest_port is set to 1194 or 443
    - data_exfiltration: uplink_volume is multiplied by 20 and total_volume
      is recomputed so it stays equal to uplink + downlink
    - insider_threat: the session start hour is moved into 00:00–04:59 and
      end_time is recomputed from session_duration

    Args:
        records: list of record dicts as built by the dataset generator
            (string timestamps in "%Y-%m-%d %H:%M:%S" format).
        anomaly_ratio: fraction of records to mark; int(len * ratio) records
            are selected.

    Returns:
        The same `records` list (records are modified in place).

    Raises:
        ValueError: from random.sample if the computed count is negative or
            larger than the number of records.
    """
    time_fmt = "%Y-%m-%d %H:%M:%S"
    all_departments = ['A', 'B', 'C', 'D']
    anomaly_types = [
        'lateral_movement',
        'privilege_escalation',
        'data_exfiltration',
        'insider_threat'
    ]

    n = int(len(records) * anomaly_ratio)
    idxs = random.sample(range(len(records)), n)
    for i in idxs:
        rec = records[i]
        rec['is_anomaly'] = 1
        typ = random.choice(anomaly_types)
        rec['anomaly_type'] = typ
        if typ == 'lateral_movement':
            # Same account, new src_ip: exclude the record's own department so
            # the address really comes from a different subnet.
            other_depts = [d for d in all_departments if d != rec.get('department')]
            rec['src_ip'] = generate_enterprise_src_ip(random.choice(other_depts))
        elif typ == 'privilege_escalation':
            rec['dest_ip'] = generate_public_dest_ip('vpn_partner')
            rec['dest_port'] = random.choice([1194, 443])
        elif typ == 'data_exfiltration':
            rec['uplink_volume'] *= 20
            # Keep the aggregate consistent; features derived from
            # total_volume would otherwise not reflect the inflated uplink.
            if 'total_volume' in rec:
                rec['total_volume'] = rec['uplink_volume'] + rec.get('downlink_volume', 0)
        elif typ == 'insider_threat':
            # Shift the session into the 00:00–04:59 window, keeping the
            # date, minute and second, then rebuild end_time to match.
            t = datetime.strptime(rec['start_time'], time_fmt)
            new = t.replace(hour=random.choice(range(0, 5)))
            rec['start_time'] = new.strftime(time_fmt)
            rec['end_time'] = (new + timedelta(seconds=rec['session_duration'])).strftime(time_fmt)
    return records

In [7]:
def generate_enterprise_security_ipdr_dataset(
    num_records=10000,
    start_date="2024-01-01",
    duration_days=30,
    seed=42
):
    """
    Generate Enterprise Security Monitoring IPDR dataset:
    - Mix of departments (A–D)
    - Mix of internal (intranet) and external (SaaS, cloud, VPN) sessions,
      weighted 50% / 20% / 20% / 10%
    - Standard IPDR fields + department, user_account, device_id
    - 5% labeled anomalies of four types
    - Derived columns: bytes_per_second, hour, is_core_hours, is_weekend,
      internal_flag

    Args:
        num_records: number of session records to generate.
        start_date: first day of the window, "YYYY-MM-DD".
        duration_days: length of the window in days.
        seed: seed applied to both the `random` and `numpy.random` global
            generators (note: this resets global RNG state).

    Returns:
        pandas.DataFrame with one row per session.
    """
    random.seed(seed)
    np.random.seed(seed)
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    time_fmt = "%Y-%m-%d %H:%M:%S"

    departments = ['A','B','C','D']
    service_map = {
        "internal": {"type":"internal","ports":[80,443],"service":"intranet"},
        "saas": {"type":"saas","ports":[443],"service":"saas"},
        "cloud": {"type":"cloud","ports":[443],"service":"cloud"},
        "vpn_partner": {"type":"vpn_partner","ports":[1194,443],"service":"vpn"}
    }
    # Weights are positional and follow the insertion order of service_map.
    service_keys = list(service_map.keys())
    service_weights = [0.5,0.2,0.2,0.1]

    records = []
    for _ in range(num_records):
        dept = random.choice(departments)
        src_ip = generate_enterprise_src_ip(dept)
        account = f"user_{dept}_{random.randint(1,200)}"
        device = f"device_{dept}_{random.randint(1,500)}"

        svc_key = random.choices(service_keys, weights=service_weights)[0]
        svc = service_map[svc_key]
        if svc_key != "internal":
            dest_ip = generate_public_dest_ip(svc_key)
        else:
            # Intranet traffic stays inside the department's own 10.x subnet.
            # Building the address from the department letter directly would
            # yield an invalid octet such as "10.A0.x.x".
            dest_ip = generate_enterprise_src_ip(dept)
        start_time = get_enterprise_time_pattern(start_dt,duration_days)
        dur = random.randint(60,7200)  # 1 min–2 h
        end_time = start_time + timedelta(seconds=dur)
        uplink = random.randint(1000,200000)    # 1 KB–200 KB
        downlink = random.randint(1000,500000)  # 1 KB–500 KB

        rec = {
            "department": dept,
            "user_account": account,
            "device_id": device,
            "src_ip": src_ip,
            "src_port": random.randint(1024,65535),
            "dest_ip": dest_ip,
            "dest_port": random.choice(svc["ports"]),
            "protocol": "TCP",
            "start_time": start_time.strftime(time_fmt),
            "end_time": end_time.strftime(time_fmt),
            "session_duration": dur,
            "uplink_volume": uplink,
            "downlink_volume": downlink,
            "total_volume": uplink+downlink,
            "is_anomaly": 0,
            "anomaly_type": "normal"
        }
        records.append(rec)

    records = inject_enterprise_anomalies(records, anomaly_ratio=0.05)
    df = pd.DataFrame(records)
    # Derived features
    # replace(0,1) guards against division by zero for zero-length sessions.
    df['bytes_per_second'] = df['total_volume']/df['session_duration'].replace(0,1)
    # Parse the timestamp strings once and reuse for all time-based features.
    start_ts = pd.to_datetime(df['start_time'])
    df['hour'] = start_ts.dt.hour
    df['is_core_hours'] = df['hour'].between(8,19).astype(int)
    df['is_weekend'] = (start_ts.dt.dayofweek>=5).astype(int)
    df['internal_flag'] = (df['dest_ip'].str.startswith("10.")).astype(int)
    return df

if __name__ == "__main__":
    # Build the dataset with the generator's default parameters.
    df_ent = generate_enterprise_security_ipdr_dataset()

    # Report the record count, anomaly share and schema, then persist to CSV.
    summary = (
        "=== ENTERPRISE SECURITY MONITORING IPDR DATASET ===",
        f"Records: {len(df_ent)}, Anomalies: {df_ent['is_anomaly'].sum()} ({df_ent['is_anomaly'].mean()*100:.1f}%)",
    )
    print(*summary, sep="\n")
    print("Columns:", df_ent.columns.tolist())

    df_ent.to_csv("enterprise_security_ipdr_dataset.csv", index=False)

=== ENTERPRISE SECURITY MONITORING IPDR DATASET ===
Records: 10000, Anomalies: 500 (5.0%)
Columns: ['department', 'user_account', 'device_id', 'src_ip', 'src_port', 'dest_ip', 'dest_port', 'protocol', 'start_time', 'end_time', 'session_duration', 'uplink_volume', 'downlink_volume', 'total_volume', 'is_anomaly', 'anomaly_type', 'bytes_per_second', 'hour', 'is_core_hours', 'is_weekend', 'internal_flag']
