<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/Mobile_social_media_ipdr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pytz

In [2]:
# Set timezone for India
IST = pytz.timezone('Asia/Kolkata')
def get_social_media_time_pattern(start_dt, duration_days):
    """Generate realistic Indian social media usage time patterns
    Peak: 7pm-11pm (evening), Secondary: 11am-2pm (lunch), Early morning: 8am-10am
    Based on research: Indians spend average 194 minutes daily on social media"""

    prime_hours = list(range(19, 23))       # 7pm-11pm (40% of activity)
    lunch_hours = list(range(11, 15))       # 11am-3pm (25% of activity)
    morning_hours = list(range(8, 11))      # 8am-11am (20% of activity)
    other_hours = list(range(0, 8)) + list(range(15, 19)) + [23]  # Rest (15%)

    if random.random() < 0.40:      # 40% chance prime time (evening)
        hour = random.choice(prime_hours)
    elif random.random() < 0.65:   # 25% chance lunch time
        hour = random.choice(lunch_hours)
    elif random.random() < 0.85:   # 20% chance morning
        hour = random.choice(morning_hours)
    else:                          # 15% chance other times
        hour = random.choice(other_hours)

    # Weekend bias - more evening and late night usage
    day_offset = random.randint(0, duration_days - 1)
    base = start_dt + timedelta(days=day_offset)

    if base.weekday() >= 5:  # Weekend
        if random.random() < 0.3:  # 30% chance of late night on weekends
            hour = random.choice(list(range(23, 24)) + list(range(0, 3)))

    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    naive = base.replace(hour=hour, minute=minute, second=second)
    return IST.localize(naive)

In [3]:
def generate_realistic_social_dest_ip(platform_type):
    """Generate realistic destination IPs for social media platforms
    Based on real IP ranges from research"""

    # Real IP ranges for social media platforms serving India
    ip_ranges = {
        "meta_platforms": [  # Facebook, Instagram, WhatsApp
            "157.240.{}.{}",     # Meta CDN India (Chennai/Mumbai)
            "163.70.{}.{}",      # Meta CDN India
            "31.13.{}.{}",       # Meta Ireland CDN
            "173.252.{}.{}",     # Meta US CDN
            "69.171.{}.{}",      # Meta US
        ],
        "youtube_google": [   # YouTube, Google services
            "74.125.{}.{}",      # Google
            "142.250.{}.{}",     # Google
            "216.58.{}.{}",      # Google
            "172.217.{}.{}",     # Google
        ],
        "indian_alternatives": [  # Josh, Moj, Chingari, ShareChat
            "13.127.{}.{}",      # AWS India (hosting Indian apps)
            "13.234.{}.{}",      # AWS India
            "52.66.{}.{}",       # AWS India
            "3.6.{}.{}",         # AWS India
        ],
        "twitter": [
            "104.244.{}.{}",     # Twitter
            "192.133.{}.{}",     # Twitter
        ],
        "snapchat": [
            "151.101.{}.{}",     # Snapchat CDN
            "54.230.{}.{}",      # Snapchat AWS
        ]
    }

    if platform_type in ip_ranges:
        ip_template = random.choice(ip_ranges[platform_type])
        return ip_template.format(
            random.randint(1, 254),
            random.randint(1, 254)
        )
    else:
        # Generic CDN
        return f"52.{random.randint(1, 254)}.{random.randint(1, 254)}.{random.randint(1, 254)}"

In [4]:
def generate_indian_mobile_src_ip():
    """Generate realistic Indian mobile operator IP ranges
    Based on major Indian telcos: Jio, Airtel, Vi"""

    # Real Indian mobile operator IP ranges from research
    mobile_ranges = [
        # Reliance Jio (largest with 477.94M users)
        "49.{}.{}.{}",        # Jio primary range
        "117.{}.{}.{}",       # Jio range
        "106.{}.{}.{}",       # Jio range

        # Bharti Airtel (285.17M users)
        "122.{}.{}.{}",       # Airtel range
        "182.{}.{}.{}",       # Airtel range
        "125.{}.{}.{}",       # Airtel range

        # Vodafone Idea (126.36M users)
        "223.{}.{}.{}",       # Vi range
        "27.{}.{}.{}",        # Vi range
        "14.{}.{}.{}",        # Vi range

        # BSNL (37.73M users)
        "61.{}.{}.{}",        # BSNL range
        "202.{}.{}.{}",       # BSNL range
    ]

    ip_template = random.choice(mobile_ranges)
    return ip_template.format(
        random.randint(1, 254),
        random.randint(1, 254),
        random.randint(1, 254)
    )

In [5]:
def generate_mobile_identifiers():
    """Generate realistic mobile device identifiers for Indian IPDR"""
    # Indian mobile numbers start with 6,7,8,9
    msisdn = f"+91{random.choice([6,7,8,9])}{random.randint(100000000, 999999999)}"

    # IMEI: 15-digit identifier
    imei = f"{random.randint(100000000000000, 999999999999999)}"

    # IMSI: 15-digit with India MCC (404/405)
    mcc = random.choice([404, 405])  # India country codes
    mnc = random.choice(['01', '02', '03', '10', '11', '45', '46'])  # Network codes
    msin = f"{random.randint(1000000000, 9999999999)}"
    imsi = f"{mcc}{mnc}{msin}"

    return msisdn, imei, imsi

In [6]:
def generate_cell_tower_id():
    """Generate realistic cell tower IDs for Indian mobile networks"""
    # Format: <circle_code><area_code><tower_id>
    circles = ['DL', 'MH', 'KA', 'TN', 'AP', 'WB', 'GJ', 'UP', 'MP', 'RJ']
    circle = random.choice(circles)
    area = random.randint(1, 999)
    tower = random.randint(1, 9999)
    return f"{circle}{area:03d}{tower:04d}"

In [7]:
def inject_social_media_anomalies(records, anomaly_ratio=0.05):
    """Inject realistic social media anomalies for testing ML models"""
    num_anomalies = int(len(records) * anomaly_ratio)
    anomaly_indices = random.sample(range(len(records)), num_anomalies)

    for idx in anomaly_indices:
        anomaly_type = random.choice([
            'excessive_data_usage',     # Extreme video binging
            'unusual_time_pattern',     # 3am-6am heavy usage
            'rapid_session_switching',  # Multiple short bursts
            'suspicious_location',      # Cell tower jumping
            'bot_like_behavior'        # Mechanical usage patterns
        ])

        if anomaly_type == 'excessive_data_usage':
            # 10x normal data usage (binge-watching/downloading)
            records[idx]['uplink_volume'] *= 3
            records[idx]['downlink_volume'] *= 15  # Heavy video download
            records[idx]['session_duration'] = max(records[idx]['session_duration'], 7200)  # 2+ hours
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'excessive_data_usage'

        elif anomaly_type == 'unusual_time_pattern':
            # Very late night usage (3am-6am)
            odd_hour = random.randint(3, 6)
            original_time = datetime.strptime(records[idx]['start_time'], "%Y-%m-%d %H:%M:%S")
            new_time = original_time.replace(hour=odd_hour)
            records[idx]['start_time'] = new_time.strftime("%Y-%m-%d %H:%M:%S")
            records[idx]['end_time'] = (new_time + timedelta(seconds=records[idx]['session_duration'])).strftime("%Y-%m-%d %H:%M:%S")
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'unusual_time_pattern'

        elif anomaly_type == 'rapid_session_switching':
            # Very short, frequent sessions (bot-like)
            records[idx]['session_duration'] = random.randint(5, 30)  # 5-30 seconds
            records[idx]['downlink_volume'] = random.randint(10000, 50000)  # Very little data
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'rapid_session_switching'

        elif anomaly_type == 'suspicious_location':
            # Unusual cell tower patterns
            records[idx]['cell_id'] = f"XX{random.randint(999, 9999)}{random.randint(9999, 99999)}"  # Invalid format
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'suspicious_location'

        elif anomaly_type == 'bot_like_behavior':
            # Mechanical, precise timings
            records[idx]['session_duration'] = 300  # Exactly 5 minutes
            records[idx]['downlink_volume'] = 52428800  # Exactly 50MB
            records[idx]['uplink_volume'] = 1048576    # Exactly 1MB
            records[idx]['is_anomaly'] = 1
            records[idx]['anomaly_type'] = 'bot_like_behavior'

    return records

In [8]:
def generate_mobile_social_media_ipdr_dataset(
    num_records=10000,
    start_date="2024-01-01",
    duration_days=30,
    seed=42
):
    """
    Generate realistic mobile social media IPDR dataset based on Indian usage patterns
    """
    random.seed(seed)
    np.random.seed(seed)
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")

    # Social media platforms popular in India (post-TikTok ban)
    social_platforms = [
        {
            "platform_name": "WhatsApp",
            "platform_type": "meta_platforms",
            "protocol": "TCP",
            "ports": [443, 5222],  # HTTPS and chat protocol
            "session_duration_range": (30, 1800),  # 30 sec to 30 min
            "uplink_range": (10000, 5000000),      # 10KB - 5MB (messages, photos)
            "downlink_range": (50000, 50000000),   # 50KB - 50MB (media, status)
            "usage_weight": 0.35  # 35% of time (most used)
        },
        {
            "platform_name": "Instagram",
            "platform_type": "meta_platforms",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (120, 3600),  # 2 min to 1 hour
            "uplink_range": (100000, 10000000),     # 100KB - 10MB (stories, posts)
            "downlink_range": (1000000, 100000000), # 1MB - 100MB (reels, feed)
            "usage_weight": 0.25  # 25% of time
        },
        {
            "platform_name": "YouTube",
            "platform_type": "youtube_google",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (300, 7200),   # 5 min to 2 hours
            "uplink_range": (50000, 2000000),        # 50KB - 2MB (comments, likes)
            "downlink_range": (10000000, 200000000), # 10MB - 200MB (video streaming)
            "usage_weight": 0.20  # 20% of time
        },
        {
            "platform_name": "Facebook",
            "platform_type": "meta_platforms",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (180, 2400),   # 3 min to 40 min
            "uplink_range": (100000, 5000000),       # 100KB - 5MB
            "downlink_range": (500000, 80000000),    # 500KB - 80MB
            "usage_weight": 0.10  # 10% of time
        },
        {
            "platform_name": "Josh", # Indian TikTok alternative
            "platform_type": "indian_alternatives",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (60, 1800),    # 1 min to 30 min
            "uplink_range": (50000, 3000000),        # 50KB - 3MB
            "downlink_range": (5000000, 150000000),  # 5MB - 150MB (short videos)
            "usage_weight": 0.06  # 6% of time
        },
        {
            "platform_name": "Moj", # Indian TikTok alternative
            "platform_type": "indian_alternatives",
            "protocol": "TCP",
            "ports": [443],
            "session_duration_range": (60, 1200),    # 1 min to 20 min
            "uplink_range": (50000, 2000000),        # 50KB - 2MB
            "downlink_range": (3000000, 120000000),  # 3MB - 120MB
            "usage_weight": 0.04  # 4% of time
        }
    ]

    # Generate unique user identifiers (mobile subscribers)
    num_users = 500
    mobile_users = []
    for i in range(num_users):
        msisdn, imei, imsi = generate_mobile_identifiers()
        mobile_users.append({
            'user_id': f"mobile_user_{i:04d}",
            'msisdn': msisdn,
            'imei': imei,
            'imsi': imsi
        })

    records = []
    for i in range(num_records):
        # Weight platform selection based on usage patterns
        weights = [p["usage_weight"] for p in social_platforms]
        platform = random.choices(social_platforms, weights=weights)[0]

        # Generate session parameters
        dest_port = random.choice(platform["ports"])
        session_duration = random.randint(*platform["session_duration_range"])
        uplink_volume = random.randint(*platform["uplink_range"])
        downlink_volume = random.randint(*platform["downlink_range"])

        # Generate timing with realistic social media patterns
        start_time = get_social_media_time_pattern(start_dt, duration_days)
        end_time = start_time + timedelta(seconds=session_duration)

        # Generate IPs and identifiers
        src_ip = generate_indian_mobile_src_ip()
        dest_ip = generate_realistic_social_dest_ip(platform["platform_type"])
        user = random.choice(mobile_users)
        cell_id = generate_cell_tower_id()

        # Network type based on Indian mobile evolution
        network_type = random.choices(
            ['4G', '5G', '3G'],
            weights=[0.75, 0.20, 0.05]  # 4G dominant, 5G growing, 3G legacy
        )[0]

        # Create mobile IPDR record
        record = {
            # Mandatory mobile IPDR fields
            "msisdn": user['msisdn'],                  # Mobile number
            "imei": user['imei'],                      # Device identifier
            "imsi": user['imsi'],                      # SIM identifier
            "src_ip": src_ip,                          # Mobile user IP
            "src_port": random.randint(1024, 65535),   # Source port
            "dest_ip": dest_ip,                        # Social platform IP
            "dest_port": dest_port,                    # Destination port
            "protocol": platform["protocol"],          # Protocol
            "start_time": start_time.strftime("%Y-%m-%d %H:%M:%S"),    # Session start
            "end_time": end_time.strftime("%Y-%m-%d %H:%M:%S"),        # Session end
            "session_duration": session_duration,      # Duration in seconds
            "uplink_volume": uplink_volume,            # Bytes uploaded
            "downlink_volume": downlink_volume,        # Bytes downloaded
            "total_volume": uplink_volume + downlink_volume,  # Total bytes
            "cell_id": cell_id,                        # Cell tower ID
            "network_type": network_type,              # 3G/4G/5G
            "platform_name": platform["platform_name"], # For analysis (not real IPDR)

            # Labels for anomaly detection testing (not in real IPDR)
            "is_anomaly": 0,                          # Anomaly label
            "anomaly_type": "normal"                   # Anomaly type
        }

        records.append(record)

    # Inject realistic anomalies for model testing
    records = inject_social_media_anomalies(records, anomaly_ratio=0.05)

    # Convert to DataFrame
    df = pd.DataFrame(records)

    # Add derived features for analysis
    df['bytes_per_second'] = df['total_volume'] / df['session_duration']
    df['uplink_ratio'] = df['uplink_volume'] / df['total_volume']
    df['hour'] = pd.to_datetime(df['start_time']).dt.hour
    df['day_of_week'] = pd.to_datetime(df['start_time']).dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_prime_time'] = ((df['hour'] >= 19) & (df['hour'] <= 22)).astype(int)

    return df

# Generate the dataset
if __name__ == "__main__":
    # Generate mobile social media IPDR dataset
    df_mobile_social = generate_mobile_social_media_ipdr_dataset(
        num_records=10000,
        start_date="2024-01-01",
        duration_days=30,
        seed=42
    )

    print("=== MOBILE SOCIAL MEDIA IPDR DATASET ===")
    print(f"Total records: {len(df_mobile_social)}")
    print(f"Anomalies: {df_mobile_social['is_anomaly'].sum()} ({df_mobile_social['is_anomaly'].mean()*100:.1f}%)")
    print(f"Columns: {list(df_mobile_social.columns)}")
    print(f"Date range: {df_mobile_social['start_time'].min()} to {df_mobile_social['start_time'].max()}")

    print("\nPlatform distribution:")
    print(df_mobile_social['platform_name'].value_counts())

    print("\nAnomaly breakdown:")
    print(df_mobile_social['anomaly_type'].value_counts())

    print("\nTime pattern analysis:")
    print("Peak hours (19-22):", df_mobile_social['is_prime_time'].sum())
    print("Weekend usage:", df_mobile_social['is_weekend'].sum())

    print("\nNetwork type distribution:")
    print(df_mobile_social['network_type'].value_counts())

    print("\nData usage statistics (MB):")
    df_mobile_social['total_mb'] = df_mobile_social['total_volume'] / (1024*1024)
    print(f"Average per session: {df_mobile_social['total_mb'].mean():.1f} MB")
    print(f"95th percentile: {df_mobile_social['total_mb'].quantile(0.95):.1f} MB")

    print("\nSample mobile identifiers:")
    sample = df_mobile_social[['msisdn', 'imei', 'cell_id', 'src_ip', 'dest_ip']].head(3)
    for _, row in sample.iterrows():
        print(f"MSISDN: {row['msisdn']}, Cell: {row['cell_id']}, Route: {row['src_ip']} -> {row['dest_ip']}")

    # Save the dataset
    df_mobile_social.to_csv("mobile_social_media_ipdr_dataset.csv", index=False)
    print(f"\nDataset saved as 'mobile_social_media_ipdr_dataset.csv'")

=== MOBILE SOCIAL MEDIA IPDR DATASET ===
Total records: 10000
Anomalies: 500 (5.0%)
Columns: ['msisdn', 'imei', 'imsi', 'src_ip', 'src_port', 'dest_ip', 'dest_port', 'protocol', 'start_time', 'end_time', 'session_duration', 'uplink_volume', 'downlink_volume', 'total_volume', 'cell_id', 'network_type', 'platform_name', 'is_anomaly', 'anomaly_type', 'bytes_per_second', 'uplink_ratio', 'hour', 'day_of_week', 'is_weekend', 'is_prime_time']
Date range: 2024-01-01 01:32:00 to 2024-01-30 23:18:34

Platform distribution:
platform_name
WhatsApp     3607
Instagram    2426
YouTube      1981
Facebook      965
Josh          624
Moj           397
Name: count, dtype: int64

Anomaly breakdown:
anomaly_type
normal                     9500
suspicious_location         111
bot_like_behavior           105
excessive_data_usage        103
rapid_session_switching      94
unusual_time_pattern         87
Name: count, dtype: int64

Time pattern analysis:
Peak hours (19-22): 3701
Weekend usage: 2603

Network type