<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/Student_Campus_IPDR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pytz

In [None]:
# Timezone object for India Standard Time (Asia/Kolkata); used below to
# attach a timezone to generated session start times.
IST = pytz.timezone('Asia/Kolkata')

def get_student_time_pattern(start_dt, duration_days):
    """Pick a random, timezone-aware session start time with student-like hours.

    A day is drawn uniformly from the ``duration_days`` days beginning at
    ``start_dt``. An hour bucket is then selected with probabilities that
    depend on whether that day falls on a weekend:

    * Weekend: 60% leisure (10:00-13:59, 16:00-23:59), 25% night
      (00:00-02:59), 15% any hour from 03:00 to 23:59.
    * Weekday: 45% class (09:00-16:59), 25% study (19:00-22:59), 15% break
      (07:00-08:59, 17:00-18:59), 15% night (23:00-23:59, 00:00-01:59).

    Minute and second are uniform in 0-59.

    Args:
        start_dt: Naive ``datetime`` for the first day of the window.
        duration_days: Number of days in the window; must be at least 1.

    Returns:
        The chosen ``datetime`` localized with the module-level ``IST``
        timezone.

    Raises:
        ValueError: If ``duration_days`` is less than 1 (raised by
            ``random.randint`` on an empty range).
    """
    day_offset = random.randint(0, duration_days - 1)
    base = start_dt + timedelta(days=day_offset)
    is_weekend = base.weekday() >= 5

    # A single draw is compared against cumulative thresholds. Drawing a new
    # random number in every ``elif`` would make the later branches
    # conditional on fresh samples and skew the documented percentages.
    roll = random.random()

    if is_weekend:
        # Weekend pattern: later start, more evening/night usage
        leisure_hours = list(range(10, 14)) + list(range(16, 24))  # 10am-2pm, 4pm-12am
        night_hours = list(range(0, 3))  # 12am-3am

        if roll < 0.60:      # 60% leisure time
            hour = random.choice(leisure_hours)
        elif roll < 0.85:    # 25% night usage
            hour = random.choice(night_hours)
        else:                # 15% other times
            hour = random.randint(3, 23)
    else:
        # Weekday pattern: class hours, study time, limited late night
        class_hours = list(range(9, 17))        # 9am-5pm (peak)
        study_hours = list(range(19, 23))       # 7pm-11pm (secondary peak)
        break_hours = list(range(7, 9)) + list(range(17, 19))  # 7-9am, 5-7pm
        # Hours 0 and 1 land in the early morning of the SAME calendar day as
        # ``base``; the date is not advanced.
        night_hours = list(range(23, 24)) + list(range(0, 2))  # 11pm-2am

        if roll < 0.45:      # 45% class/work hours
            hour = random.choice(class_hours)
        elif roll < 0.70:    # 25% study hours
            hour = random.choice(study_hours)
        elif roll < 0.85:    # 15% break hours
            hour = random.choice(break_hours)
        else:                # 15% night hours
            hour = random.choice(night_hours)

    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    naive = base.replace(hour=hour, minute=minute, second=second)
    return IST.localize(naive)

In [None]:
def generate_realistic_educational_dest_ip(service_type):
    """Return a random IPv4 destination address for a service category.

    A two-octet prefix is picked at random from the list registered for
    ``service_type`` and the last two octets are drawn uniformly from 1-254.
    Categories that are not registered fall back to the ``14.139.x.x`` prefix.

    Args:
        service_type: Category key such as ``"video_streaming"`` or
            ``"campus_systems"``.

    Returns:
        Dotted-quad IPv4 address as a string.
    """
    # Prefix templates per category. The trailing labels are the original
    # author's attributions and are not verified here.
    prefixes_by_service = {
        "educational_platforms": [
            "74.125.{}.{}",      # Google Classroom, Google Meet
            "142.250.{}.{}",     # Google services
            "172.217.{}.{}",     # YouTube (educational content)
            "13.107.{}.{}",      # Microsoft Teams, Office 365
            "52.96.{}.{}",       # Microsoft education services
        ],
        "research_databases": [
            "157.240.{}.{}",     # Research portals
            "104.16.{}.{}",      # Cloudflare (hosting academic sites)
            "151.101.{}.{}",     # Fastly CDN (academic content)
            "185.199.{}.{}",     # Academic repositories
        ],
        "online_libraries": [
            "216.58.{}.{}",      # Google Scholar
            "208.80.{}.{}",      # Wikipedia
            "54.230.{}.{}",      # Amazon CloudFront (e-books)
            "199.232.{}.{}",     # Archive.org
        ],
        "video_streaming": [
            "172.217.{}.{}",     # YouTube
            "74.125.{}.{}",      # YouTube
            "13.107.{}.{}",      # Stream.microsoft (educational videos)
        ],
        "social_educational": [
            "157.240.{}.{}",     # Facebook groups (study groups)
            "151.101.{}.{}",     # Reddit (educational subreddits)
            "104.244.{}.{}",     # Twitter (academic accounts)
        ],
        "campus_systems": [
            "14.139.{}.{}",      # NKN (National Knowledge Network)
            "164.100.{}.{}",     # NIC (National Informatics Centre)
            "103.27.{}.{}",      # Local campus servers
        ],
    }

    candidates = prefixes_by_service.get(service_type)
    if candidates is None:
        # Unknown category: generic educational prefix, no template draw.
        template = "14.139.{}.{}"
    else:
        template = random.choice(candidates)

    third_octet = random.randint(1, 254)
    fourth_octet = random.randint(1, 254)
    return template.format(third_octet, fourth_octet)

In [None]:
def generate_campus_broadband_src_ip():
    """Return a random IPv4 source address from a campus-style range.

    One range is chosen uniformly from a fixed table of private, education
    network and gateway prefixes, then each open octet is drawn uniformly
    from the bounds listed for that range.

    The ``172.x`` entry covers the whole private block 172.16.0.0-172.31.255.255
    (second octet 16-31), and every template receives exactly as many octets
    as it has placeholders.

    Returns:
        Dotted-quad IPv4 address as a string.
    """
    host = (1, 254)      # avoids .0 and .255 in the drawn octet
    any_octet = (0, 255)

    # (template, inclusive bounds for each placeholder). Trailing labels are
    # the original author's attributions and are not verified here.
    campus_ranges = [
        # Internal campus networks (private ranges)
        ("10.{}.{}.{}", (any_octet, any_octet, host)),      # Large campus networks
        ("172.{}.{}.{}", ((16, 31), any_octet, host)),      # Medium campus networks
        ("192.168.{}.{}", (any_octet, host)),               # Small department networks

        # NKN (National Knowledge Network) style ranges
        ("14.139.{}.{}", (host, host)),                     # NKN core network
        ("103.27.{}.{}", (host, host)),                     # IIT Delhi and similar institutions
        ("164.100.{}.{}", (host, host)),                    # NIC educational institutions

        # Campus NAT gateway IPs (public-facing)
        ("49.{}.{}.{}", (host, host, host)),                # Indian ISP ranges for campus gateways
        ("117.{}.{}.{}", (host, host, host)),               # University gateway IPs
        ("203.{}.{}.{}", (host, host, host)),               # Educational institution gateways
    ]

    template, octet_bounds = random.choice(campus_ranges)
    octets = [random.randint(low, high) for low, high in octet_bounds]
    return template.format(*octets)

In [None]:
def generate_student_identifiers():
    """Return a random ``(student_id, mac_address)`` pair.

    The student ID is a two-digit year, a two-letter department code and a
    roll number from 1 to 200 zero-padded to three digits (for example
    ``"23CS042"``). The MAC address is one of six fixed three-octet prefixes
    followed by three random octets, written as upper-case hex pairs
    separated by colons.

    Returns:
        Tuple ``(student_id, mac_address)`` of two strings.
    """
    # Student ID parts, drawn in the order year -> department -> roll number.
    admission_year = random.choice(['20', '21', '22', '23', '24'])
    department = random.choice(
        ['CS', 'EE', 'ME', 'CE', 'IT', 'EC', 'BT', 'CH', 'PH', 'MA']
    )
    roll_number = random.randint(1, 200)
    student_id = admission_year + department + format(roll_number, "03d")

    # Device prefix; vendor labels are the original author's attributions and
    # are not verified here.
    vendor_prefix = random.choice([
        "00:1B:63",  # Apple (MacBooks, iPhones)
        "00:50:56",  # Dell laptops
        "00:15:5D",  # HP laptops
        "20:68:9D",  # Xiaomi smartphones
        "A4:5E:60",  # Samsung smartphones
        "8C:85:90",  # OnePlus smartphones
    ])
    device_octets = [random.randint(0, 255) for _ in range(3)]
    mac_address = ":".join(
        [vendor_prefix] + [format(octet, "02X") for octet in device_octets]
    )

    return student_id, mac_address

In [None]:
_IPDR_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _resync_derived_fields(record):
    """Recompute ``end_time`` and ``total_volume`` from the fields they derive from."""
    start = datetime.strptime(record['start_time'], _IPDR_TIME_FORMAT)
    end = start + timedelta(seconds=record['session_duration'])
    record['end_time'] = end.strftime(_IPDR_TIME_FORMAT)
    record['total_volume'] = record['uplink_volume'] + record['downlink_volume']


def inject_campus_anomalies(records, anomaly_ratio=0.05):
    """Turn a random subset of session records into labelled anomalies.

    ``int(len(records) * anomaly_ratio)`` distinct records are selected and
    each one is rewritten according to a randomly chosen anomaly type:

    * ``academic_procrastination``: on weekdays the start hour is moved to
      14-17; downlink volume is multiplied by 5.
    * ``exam_period_stress``: duration becomes 3-6 hours; uplink x8,
      downlink x12.
    * ``gaming_addiction``: with 50% chance the start hour is moved to 9-17;
      duration becomes 2-4 hours; uplink is set to half the downlink; the
      destination port is set to one of the listed gaming ports.
    * ``content_downloading_spree``: downlink x20; uplink set to 5% of the
      new downlink; duration raised to at least 5400 seconds.
    * ``suspicious_research_access``: destination IP and port are replaced
      with values from the listed proxy/Tor-style ranges.

    Every affected record gets ``is_anomaly = 1`` and ``anomaly_type`` set.
    After any change to start time, duration or volumes, ``end_time`` and
    ``total_volume`` are recomputed so the record stays internally
    consistent.

    Args:
        records: List of record dicts with the keys written by the dataset
            generator (``start_time``, ``end_time``, ``session_duration``,
            ``uplink_volume``, ``downlink_volume``, ``total_volume``,
            ``dest_ip``, ``dest_port``, ...). Modified in place.
        anomaly_ratio: Fraction of records to convert.

    Returns:
        The same ``records`` list that was passed in.

    Raises:
        ValueError: If the computed anomaly count is negative or larger than
            ``len(records)`` (raised by ``random.sample``).
    """
    num_anomalies = int(len(records) * anomaly_ratio)
    anomaly_indices = random.sample(range(len(records)), num_anomalies)

    for idx in anomaly_indices:
        record = records[idx]
        anomaly_type = random.choice([
            'academic_procrastination',      # Heavy non-academic usage during study hours
            'exam_period_stress',           # Extremely high usage during exam periods
            'gaming_addiction',             # Excessive gaming during class hours
            'content_downloading_spree',    # Massive downloads
            'suspicious_research_access'    # Access to questionable academic sources
        ])

        if anomaly_type == 'academic_procrastination':
            # Non-academic usage during prime study hours (weekday afternoons)
            original_time = datetime.strptime(record['start_time'], _IPDR_TIME_FORMAT)
            if original_time.weekday() < 5:  # Weekday
                new_time = original_time.replace(hour=random.randint(14, 17))
                record['start_time'] = new_time.strftime(_IPDR_TIME_FORMAT)
            record['downlink_volume'] *= 5  # Heavy streaming/social media

        elif anomaly_type == 'exam_period_stress':
            # Extremely high usage indicating exam stress/last-minute studying
            record['session_duration'] = random.randint(10800, 21600)  # 3-6 hours
            record['uplink_volume'] *= 8     # Heavy uploading (assignments, projects)
            record['downlink_volume'] *= 12  # Heavy downloading (research papers, videos)

        elif anomaly_type == 'gaming_addiction':
            # Gaming during class hours with specific traffic patterns
            if random.choice([True, False]):  # 50% chance to set during class hours
                original_time = datetime.strptime(record['start_time'], _IPDR_TIME_FORMAT)
                class_hour = random.randint(9, 17)
                new_time = original_time.replace(hour=class_hour)
                record['start_time'] = new_time.strftime(_IPDR_TIME_FORMAT)

            record['session_duration'] = random.randint(7200, 14400)  # 2-4 hours continuous
            record['uplink_volume'] = record['downlink_volume'] // 2  # Gaming upload pattern
            record['dest_port'] = random.choice([27015, 7777, 25565, 3724])  # Gaming ports

        elif anomaly_type == 'content_downloading_spree':
            # Massive content downloads (movies, software, etc.)
            record['downlink_volume'] *= 20  # Very heavy downloads
            record['uplink_volume'] = int(record['downlink_volume'] * 0.05)  # Low upload ratio
            record['session_duration'] = max(record['session_duration'], 5400)  # At least 1.5 hours

        elif anomaly_type == 'suspicious_research_access':
            # Access to questionable or unauthorized academic resources
            suspicious_ips = [
                "185.220.{}.{}",     # Tor exit nodes (bypassing campus restrictions)
                "198.96.{}.{}",      # Proxy servers
                "45.32.{}.{}",       # VPN endpoints
                "104.248.{}.{}",     # Suspicious academic scrapers
            ]
            ip_template = random.choice(suspicious_ips)
            record['dest_ip'] = ip_template.format(
                random.randint(1, 254),
                random.randint(1, 254)
            )
            record['dest_port'] = random.choice([8080, 3128, 1080, 9050])  # Proxy/Tor ports

        record['is_anomaly'] = 1
        record['anomaly_type'] = anomaly_type

        # Only the destination changes for suspicious_research_access, so its
        # timing and volume fields are already consistent. Every other type
        # touched start time, duration or volumes and must be resynced once
        # all of its mutations are done.
        if anomaly_type != 'suspicious_research_access':
            _resync_derived_fields(record)

    return records

In [None]:
def generate_student_campus_network_ipdr_dataset(
    num_records=10000,
    start_date="2024-01-01",
    duration_days=30,
    seed=42
):
    """Generate a synthetic student campus network IPDR dataset.

    Each record describes one session: a student drawn from a pool of 800
    distinct student IDs, a service chosen according to its ``usage_weight``,
    randomly drawn duration and traffic volumes within that service's ranges,
    a start time from ``get_student_time_pattern`` and source/destination
    addresses from the IP helper functions. About 5% of the records are then
    rewritten as labelled anomalies by ``inject_campus_anomalies`` and a set
    of derived feature columns is appended.

    Args:
        num_records: Number of session records to generate.
        start_date: First day of the window, formatted ``YYYY-MM-DD``.
        duration_days: Length of the window in days.
        seed: Seed applied to both ``random`` and ``numpy.random``.

    Returns:
        ``pandas.DataFrame`` with one row per session. Besides the raw record
        fields it carries ``bytes_per_second``, ``uplink_ratio``, ``hour``,
        ``day_of_week``, ``is_weekend``, ``is_class_hours``,
        ``is_study_hours`` and ``bandwidth_utilization``.
    """
    random.seed(seed)
    np.random.seed(seed)
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")

    # Educational services used by Indian students
    educational_services = [
        {
            "service_name": "Google Classroom",
            "service_type": "educational_platforms",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (300, 3600),    # 5 min to 1 hour
            "uplink_range": (100000, 5000000),        # 100KB - 5MB (assignments, discussions)
            "downlink_range": (500000, 50000000),     # 500KB - 50MB (materials, videos)
            "usage_weight": 0.25  # 25% of campus traffic
        },
        {
            "service_name": "YouTube Educational",
            "service_type": "video_streaming",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (600, 5400),    # 10 min to 1.5 hours
            "uplink_range": (50000, 2000000),         # 50KB - 2MB (comments, likes)
            "downlink_range": (50000000, 500000000),  # 50MB - 500MB (video streaming)
            "usage_weight": 0.20  # 20% of campus traffic
        },
        {
            "service_name": "Research Databases",
            "service_type": "research_databases",
            "protocol": "TCP",
            "ports": [443, 80],
            "session_duration_range": (900, 7200),    # 15 min to 2 hours
            "uplink_range": (200000, 3000000),        # 200KB - 3MB (queries, forms)
            "downlink_range": (1000000, 100000000),   # 1MB - 100MB (papers, data)
            "usage_weight": 0.15  # 15% of campus traffic
        },
        {
            "service_name": "Online Libraries",
            "service_type": "online_libraries",
            "protocol": "TCP",
            "ports": [443, 80],
            "session_duration_range": (600, 10800),   # 10 min to 3 hours
            "uplink_range": (100000, 2000000),        # 100KB - 2MB
            "downlink_range": (2000000, 200000000),   # 2MB - 200MB (e-books, articles)
            "usage_weight": 0.12  # 12% of campus traffic
        },
        {
            "service_name": "Microsoft Teams",
            "service_type": "educational_platforms",
            "protocol": "TCP",
            "ports": [443, 3478],
            "session_duration_range": (1800, 7200),   # 30 min to 2 hours (virtual classes)
            "uplink_range": (5000000, 30000000),      # 5MB - 30MB (video, audio)
            "downlink_range": (10000000, 100000000),  # 10MB - 100MB (video streams)
            "usage_weight": 0.10  # 10% of campus traffic
        },
        {
            "service_name": "Social Study Groups",
            "service_type": "social_educational",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (300, 2400),    # 5 min to 40 min
            "uplink_range": (500000, 10000000),       # 500KB - 10MB (posts, images)
            "downlink_range": (1000000, 50000000),    # 1MB - 50MB (feeds, discussions)
            "usage_weight": 0.08  # 8% of campus traffic
        },
        {
            "service_name": "Campus Portal",
            "service_type": "campus_systems",
            "protocol": "TCP",
            "ports": [443, 80, 8080],
            "session_duration_range": (180, 1800),    # 3 min to 30 min
            "uplink_range": (50000, 5000000),         # 50KB - 5MB (forms, uploads)
            "downlink_range": (200000, 20000000),     # 200KB - 20MB (results, notices)
            "usage_weight": 0.10  # 10% of campus traffic
        }
    ]

    # Selection weights never change, so build them once outside the loop.
    weights = [s["usage_weight"] for s in educational_services]

    # Build a pool of students with UNIQUE IDs. The ID space is small
    # (5 years x 10 departments x 200 roll numbers = 10,000), so plain
    # repeated draws would produce duplicate IDs bound to different MACs.
    # Keep drawing until enough distinct IDs exist; the first MAC drawn for
    # an ID wins. Terminates because num_students is below the ID space size.
    num_students = 800
    students_by_id = {}
    while len(students_by_id) < num_students:
        student_id, mac_address = generate_student_identifiers()
        students_by_id.setdefault(student_id, {
            'student_id': student_id,
            'mac_address': mac_address
        })
    students = list(students_by_id.values())

    records = []
    for _ in range(num_records):
        # Weight service selection based on campus usage patterns
        service = random.choices(educational_services, weights=weights)[0]

        # Generate session parameters
        dest_port = random.choice(service["ports"])
        session_duration = random.randint(*service["session_duration_range"])
        uplink_volume = random.randint(*service["uplink_range"])
        downlink_volume = random.randint(*service["downlink_range"])

        # Generate timing with student usage patterns
        start_time = get_student_time_pattern(start_dt, duration_days)
        end_time = start_time + timedelta(seconds=session_duration)

        # Generate IPs and identifiers
        src_ip = generate_campus_broadband_src_ip()
        dest_ip = generate_realistic_educational_dest_ip(service["service_type"])
        student = random.choice(students)

        # Per-student bandwidth allocation, 1-4 Mbps
        allocated_bandwidth_mbps = random.uniform(1.0, 4.0)

        # Create campus IPDR record
        record = {
            # Mandatory IPDR fields for broadband campus networks
            "student_id": student['student_id'],       # Student account identifier
            "src_ip": src_ip,                          # Campus network IP
            "src_port": random.randint(1024, 65535),   # Source port
            "dest_ip": dest_ip,                        # Educational service IP
            "dest_port": dest_port,                    # Destination port
            "protocol": service["protocol"],           # Protocol (TCP/UDP)
            "start_time": start_time.strftime("%Y-%m-%d %H:%M:%S"),    # Session start
            "end_time": end_time.strftime("%Y-%m-%d %H:%M:%S"),        # Session end
            "session_duration": session_duration,      # Duration in seconds
            "uplink_volume": uplink_volume,            # Bytes uploaded
            "downlink_volume": downlink_volume,        # Bytes downloaded
            "total_volume": uplink_volume + downlink_volume,  # Total bytes
            "mac_address": student['mac_address'],     # Device MAC address
            "allocated_bandwidth_mbps": allocated_bandwidth_mbps,  # Campus bandwidth allocation
            "service_name": service["service_name"],   # For analysis (not real IPDR)

            # Labels for anomaly detection testing (not in real IPDR)
            "is_anomaly": 0,                          # Anomaly label
            "anomaly_type": "normal"                   # Anomaly type
        }

        records.append(record)

    # Inject campus network anomalies
    records = inject_campus_anomalies(records, anomaly_ratio=0.05)

    # Convert to DataFrame
    df = pd.DataFrame(records)

    # Parse the start timestamps once and reuse them for the time features.
    start_ts = pd.to_datetime(df['start_time'])

    # Add derived features for campus network analysis
    df['bytes_per_second'] = df['total_volume'] / df['session_duration']
    df['uplink_ratio'] = df['uplink_volume'] / df['total_volume']
    df['hour'] = start_ts.dt.hour
    df['day_of_week'] = start_ts.dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_class_hours'] = ((df['hour'] >= 9) & (df['hour'] <= 17) & (df['day_of_week'] < 5)).astype(int)
    df['is_study_hours'] = ((df['hour'] >= 19) & (df['hour'] <= 22)).astype(int)
    # bits per second divided by allocated bits per second -> utilization ratio
    df['bandwidth_utilization'] = (df['bytes_per_second'] * 8) / (df['allocated_bandwidth_mbps'] * 1000000)

    return df

# Script entry point: generate the dataset, print a summary report, and save
# the result as a CSV file in the current working directory.
if __name__ == "__main__":
    # Generate student campus network IPDR dataset with the default settings
    df_student_campus = generate_student_campus_network_ipdr_dataset(
        num_records=10000,
        start_date="2024-01-01",
        duration_days=30,
        seed=42
    )

    # Headline numbers: size, anomaly share, schema and covered date range
    print("=== STUDENT CAMPUS NETWORK IPDR DATASET ===")
    print(f"Total records: {len(df_student_campus)}")
    print(f"Anomalies: {df_student_campus['is_anomaly'].sum()} ({df_student_campus['is_anomaly'].mean()*100:.1f}%)")
    print(f"Columns: {list(df_student_campus.columns)}")
    # start_time holds "%Y-%m-%d %H:%M:%S" strings, so min/max compare as text
    print(f"Date range: {df_student_campus['start_time'].min()} to {df_student_campus['start_time'].max()}")

    # Record counts per service
    print("\nService distribution:")
    print(df_student_campus['service_name'].value_counts())

    # Record counts per anomaly label (including "normal")
    print("\nAnomaly breakdown:")
    print(df_student_campus['anomaly_type'].value_counts())

    # Number of sessions flagged by each 0/1 time-pattern column
    print("\nTime pattern analysis:")
    print("Class hours usage:", df_student_campus['is_class_hours'].sum())
    print("Study hours usage:", df_student_campus['is_study_hours'].sum())
    print("Weekend usage:", df_student_campus['is_weekend'].sum())

    print("\nCampus network statistics:")
    # NOTE: total_mb is added to the DataFrame here, before the CSV is written,
    # so the saved file also contains this column.
    df_student_campus['total_mb'] = df_student_campus['total_volume'] / (1024*1024)
    print(f"Average data per session: {df_student_campus['total_mb'].mean():.1f} MB")
    print(f"95th percentile usage: {df_student_campus['total_mb'].quantile(0.95):.1f} MB")
    print(f"Average bandwidth utilization: {df_student_campus['bandwidth_utilization'].mean()*100:.1f}%")

    # Show the first three records as one-line summaries
    print("\nSample student records:")
    sample = df_student_campus[['student_id', 'src_ip', 'dest_ip', 'service_name', 'total_mb']].head(3)
    for _, row in sample.iterrows():
        print(f"Student: {row['student_id']}, Service: {row['service_name']}, Data: {row['total_mb']:.1f}MB, Route: {row['src_ip']} -> {row['dest_ip']}")

    # Save the dataset (without the DataFrame index)
    df_student_campus.to_csv("student_campus_network_ipdr_dataset.csv", index=False)
    print(f"\nDataset saved as 'student_campus_network_ipdr_dataset.csv'")

=== STUDENT CAMPUS NETWORK IPDR DATASET ===
Total records: 10000
Anomalies: 500 (5.0%)
Columns: ['student_id', 'src_ip', 'src_port', 'dest_ip', 'dest_port', 'protocol', 'start_time', 'end_time', 'session_duration', 'uplink_volume', 'downlink_volume', 'total_volume', 'mac_address', 'allocated_bandwidth_mbps', 'service_name', 'is_anomaly', 'anomaly_type', 'bytes_per_second', 'uplink_ratio', 'hour', 'day_of_week', 'is_weekend', 'is_class_hours', 'is_study_hours', 'bandwidth_utilization']
Date range: 2024-01-01 00:12:49 to 2024-01-30 23:57:27

Service distribution:
service_name
Google Classroom       2597
YouTube Educational    2010
Research Databases     1544
Online Libraries       1174
Campus Portal           957
Microsoft Teams         949
Social Study Groups     769
Name: count, dtype: int64

Anomaly breakdown:
anomaly_type
normal                        9500
academic_procrastination       105
suspicious_research_access     105
gaming_addiction               102
exam_period_stress      