<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/VPN_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pytz

In [2]:
# Timezone for India (Asia/Kolkata). get_vpn_time_pattern() uses this object
# to turn the naive datetimes it builds into timezone-aware ones.
IST = pytz.timezone('Asia/Kolkata')

def get_vpn_time_pattern(start_dt, duration_days):
    """
    Pick one random, IST-localized session start time for a remote-work VPN user.

    The hour is drawn from one of three buckets:
    - Core office hours, 09:00–17:59 (70% of sessions)
    - Shoulder hours, 07:00–08:59 and 18:00–20:59 (20%)
    - Off hours, 00:00–06:59 (10%)

    The day is start_dt plus a uniform offset of 0..duration_days-1 days;
    minute and second are each uniform over 0–59.

    Args:
        start_dt: naive datetime marking the first day of the window.
        duration_days: number of days in the window (must be >= 1).

    Returns:
        A timezone-aware datetime localized with the module-level IST zone.
    """
    office_hours = list(range(9, 18))
    shoulder_hours = [7, 8, 18, 19, 20]
    off_hours = list(range(0, 7))

    # One uniform draw selects the bucket, a second draw selects the hour in it.
    roll = random.random()
    if roll >= 0.9:
        bucket = off_hours
    elif roll >= 0.7:
        bucket = shoulder_hours
    else:
        bucket = office_hours
    chosen_hour = random.choice(bucket)

    chosen_day = start_dt + timedelta(days=random.randint(0, duration_days - 1))
    chosen_minute = random.randint(0, 59)
    chosen_second = random.randint(0, 59)

    naive_start = chosen_day.replace(
        hour=chosen_hour, minute=chosen_minute, second=chosen_second
    )
    return IST.localize(naive_start)

In [3]:
def generate_home_src_ip():
    """
    Return a random home-office source IP inside the private 10.0.0.0/8 block.

    The second octet is limited to 16–239 (the low and high ends of the block
    are skipped, intended to stay clear of commonly used hotel/AP subnets);
    the third and fourth octets are each drawn from 1–254.
    """
    # (low, high) bounds for the 2nd, 3rd and 4th octets, drawn in that order.
    octet_bounds = ((16, 239), (1, 254), (1, 254))
    octets = [str(random.randint(low, high)) for low, high in octet_bounds]
    return ".".join(["10", *octets])

In [4]:
def generate_vpn_server_ip():
    """
    Return a random corporate VPN server IP from 172.27.224.0/20.

    That range is used here as a dedicated private server subnet (e.g. the
    default for OpenVPN Access Server): the third octet is drawn from 224–239
    and the host octet from 1–254.
    """
    subnet_octet = random.randint(224, 239)
    host_octet = random.randint(1, 254)
    return "172.27.{}.{}".format(subnet_octet, host_octet)

In [5]:
def inject_vpn_anomalies(records, anomaly_ratio=0.05):
    """
    Inject realistic VPN anomalies into a random subset of records (in place).

    Anomaly types (one is chosen uniformly for every selected record):
    - session_drop: abrupt end (zero duration, zero downlink)
    - long_idle: very long session (1–4 h) with zero traffic
    - split_tunnel_breach: 10x downlink to an unusual (52.x.x.x) dest_ip
    - geo_change: sudden src_ip jump to the 10.240–254.x.x range

    After a record is modified its dependent fields are re-synchronised:
    `end_time` always equals `start_time + session_duration`, and
    `total_volume` (when the record carries it) always equals
    `uplink_volume + downlink_volume`.

    Args:
        records: list of session dicts. `start_time` is expected to be either
            a "%Y-%m-%d %H:%M:%S" string or a datetime object.
        anomaly_ratio: fraction of records to turn into anomalies.

    Returns:
        The same `records` list, with the selected entries mutated and
        labelled via `is_anomaly` = 1 and `anomaly_type`.

    Raises:
        ValueError: from random.sample if the requested sample size is
            negative or larger than the number of records.
    """
    ts_format = "%Y-%m-%d %H:%M:%S"
    n = int(len(records) * anomaly_ratio)
    idxs = random.sample(range(len(records)), n)
    for i in idxs:
        rec = records[i]
        typ = random.choice(['session_drop','long_idle','split_tunnel_breach','geo_change'])
        rec['is_anomaly'] = 1
        rec['anomaly_type'] = typ
        if typ == 'session_drop':
            rec['end_time'] = rec['start_time']  # zero duration
            rec['session_duration'] = 0
            rec['downlink_volume'] = 0
        elif typ == 'long_idle':
            idle_seconds = random.randint(3600, 14400)  # 1–4 h
            rec['session_duration'] = idle_seconds
            # Keep end_time consistent with the new duration; previously it
            # still reflected the original (shorter) session.
            start = rec['start_time']
            if isinstance(start, str):
                new_end = datetime.strptime(start, ts_format) + timedelta(seconds=idle_seconds)
                rec['end_time'] = new_end.strftime(ts_format)
            else:
                rec['end_time'] = start + timedelta(seconds=idle_seconds)
            rec['uplink_volume'] = 0
            rec['downlink_volume'] = 0
        elif typ == 'split_tunnel_breach':
            # traffic to non-VPN dest (e.g., public cloud)
            rec['dest_ip'] = f"52.{random.randint(1,254)}.{random.randint(1,254)}.{random.randint(1,254)}"
            rec['downlink_volume'] *= 10
        elif typ == 'geo_change':
            # simulate change to a different /16
            rec['src_ip'] = f"10.{random.randint(240,254)}.{random.randint(1,254)}.{random.randint(1,254)}"

        # The volumes may have changed above; refresh the stored total so it
        # never contradicts its components. Records without the field are
        # left untouched.
        if 'total_volume' in rec:
            rec['total_volume'] = rec.get('uplink_volume', 0) + rec.get('downlink_volume', 0)
    return records

In [6]:
def generate_remote_vpn_ipdr_dataset(
    num_records=10000,
    start_date="2024-01-01",
    duration_days=30,
    seed=42,
    anomaly_ratio=0.05
):
    """
    Generate realistic remote-work VPN IPDR dataset:
    - Fields: src_ip, src_port, dest_ip, dest_port, protocol,
      start_time, end_time, session_duration, uplink_volume,
      downlink_volume, total_volume, is_anomaly, anomaly_type
    - Derived columns: bytes_per_second, hour, is_peak_hour, weekday
      (boolean), is_weekday (0/1)
    - Realistic time patterns and IP ranges
    - Labeled anomalies (5% by default)

    Args:
        num_records: number of session records to generate.
        start_date: first day of the window, formatted "YYYY-MM-DD".
        duration_days: length of the window in days.
        seed: seed applied to both `random` and `numpy.random`.
        anomaly_ratio: fraction of records passed to inject_vpn_anomalies
            for labelling as anomalies.

    Returns:
        pandas.DataFrame with one row per session.
    """
    random.seed(seed)
    np.random.seed(seed)
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    ts_format = "%Y-%m-%d %H:%M:%S"

    records = []
    for _ in range(num_records):
        start_time = get_vpn_time_pattern(start_dt, duration_days)
        duration = random.randint(300, 14400)  # 5 min–4 h
        end_time = start_time + timedelta(seconds=duration)
        uplink = random.randint(1000, 500000)   # 1 KB–500 KB
        downlink = random.randint(5000, 5000000)  # 5 KB–5 MB

        rec = {
            "src_ip": generate_home_src_ip(),
            "src_port": random.randint(1024,65535),
            "dest_ip": generate_vpn_server_ip(),
            "dest_port": random.choice([1194, 443, 500, 4500]),  # OpenVPN, SSL VPN, IPSec
            "protocol": random.choice(["UDP","TCP"]),
            "start_time": start_time.strftime(ts_format),
            "end_time": end_time.strftime(ts_format),
            "session_duration": duration,
            "uplink_volume": uplink,
            "downlink_volume": downlink,
            "total_volume": uplink + downlink,
            "is_anomaly": 0,
            "anomaly_type": "normal"
        }
        records.append(rec)

    records = inject_vpn_anomalies(records, anomaly_ratio=anomaly_ratio)
    df = pd.DataFrame(records)

    # Anomaly injection rewrites volumes and durations, so re-derive the
    # dependent columns; otherwise total_volume (and bytes_per_second below)
    # would still describe the pre-anomaly session.
    df['total_volume'] = df['uplink_volume'] + df['downlink_volume']
    start_ts = pd.to_datetime(df['start_time'])  # parsed once, reused below
    df['end_time'] = (
        start_ts + pd.to_timedelta(df['session_duration'], unit='s')
    ).dt.strftime(ts_format)

    # Derived features
    # Zero-length sessions use a 1 s denominator to avoid division by zero.
    df['bytes_per_second'] = df['total_volume'] / df['session_duration'].replace(0,1)
    df['hour'] = start_ts.dt.hour
    df['is_peak_hour'] = df['hour'].between(9,17).astype(int)
    df['weekday'] = start_ts.dt.dayofweek < 5
    df['is_weekday'] = df['weekday'].astype(int)
    return df

if __name__ == "__main__":
    # Build the dataset with the default settings, print a short summary,
    # then persist it as CSV in the current working directory.
    df_vpn = generate_remote_vpn_ipdr_dataset()
    anomaly_flags = df_vpn['is_anomaly']
    summary_lines = [
        "=== REMOTE WORK VPN IPDR DATASET ===",
        f"Total records: {len(df_vpn)}",
        f"Anomalies: {anomaly_flags.sum()} ({anomaly_flags.mean()*100:.1f}%)",
    ]
    for summary_line in summary_lines:
        print(summary_line)
    print("Columns:", df_vpn.columns.tolist())
    print(df_vpn.head())
    df_vpn.to_csv("remote_work_vpn_ipdr_dataset.csv", index=False)

=== REMOTE WORK VPN IPDR DATASET ===
Total records: 10000
Anomalies: 500 (5.0%)
Columns: ['src_ip', 'src_port', 'dest_ip', 'dest_port', 'protocol', 'start_time', 'end_time', 'session_duration', 'uplink_volume', 'downlink_volume', 'total_volume', 'is_anomaly', 'anomaly_type', 'bytes_per_second', 'hour', 'is_peak_hour', 'weekday', 'is_weekday']
           src_ip  src_port         dest_ip  dest_port protocol  \
0  10.189.190.229     36765  172.27.226.152       4500      UDP   
1  10.199.167.180     36737   172.27.237.57       4500      TCP   
2     10.87.40.56     63755   172.27.234.27       1194      TCP   
3   10.153.32.250     61467   172.27.236.21        500      TCP   
4   10.234.60.222      7643   172.27.236.72       4500      TCP   

            start_time             end_time  session_duration  uplink_volume  \
0  2024-01-24 09:17:15  2024-01-24 10:23:12              3957          74158   
1  2024-01-08 12:32:38  2024-01-08 12:44:52               734         295254   
2  2024-01-2