In [1]:
import pandas as pd
import numpy as np
import os
import ipaddress
from pathlib import Path


In [2]:
# Load original datasets
print("Loading original datasets...")
df_20250613 = pd.read_csv('data/ipinfo_privacy.20250613.csv')
df_20250923 = pd.read_csv('data/ipinfo_privacy.20250923.csv')

print(f"Original 2025-06-13 shape: {df_20250613.shape}")
print(f"Original 2025-09-23 shape: {df_20250923.shape}")


Loading original datasets...
Original 2025-06-13 shape: (20118676, 7)
Original 2025-09-23 shape: (20394284, 7)


In [3]:
def classify_ip_version(network_str):
    """Classify network as IPv4 or IPv6"""
    try:
        network = ipaddress.ip_network(network_str, strict=False)
        return 4 if network.version == 4 else 6
    except (ipaddress.AddressValueError, ValueError):
        if ':' in network_str:
            return 6  # Likely IPv6
        elif '.' in network_str:
            return 4  # Likely IPv4
        else:
            return None  # Unknown

def normalize_ipv6_to_64(network_str):
    """Normalize IPv6 addresses/networks to /64 prefixes"""
    try:
        network = ipaddress.ip_network(network_str, strict=False)
        if network.version == 6:
            # Get the /64 prefix (top 64 bits)
            prefix_64 = network.supernet(new_prefix=64)
            return str(prefix_64)
        else:
            return network_str  # Return as-is for IPv4
    except (ipaddress.AddressValueError, ValueError):
        return network_str  # Return as-is if parsing fails

def aggregate_ipv6_by_prefix(df):
    """Aggregate IPv6 data by /64 prefixes using logical OR for boolean columns"""
    # Group by normalized network and aggregate
    bool_cols = ['hosting', 'proxy', 'tor', 'relay', 'vpn']
    agg_dict = {col: 'max' for col in bool_cols}  # max is equivalent to logical OR for booleans
    agg_dict['service'] = 'first'  # Take first service value
    agg_dict['ip_version'] = 'first'  # Keep IP version
    
    aggregated = df.groupby('network_normalized').agg(agg_dict).reset_index()
    aggregated.rename(columns={'network_normalized': 'network'}, inplace=True)
    return aggregated


In [4]:
# Classify IP versions
print("Classifying IP versions...")
df_20250613['ip_version'] = df_20250613['network'].apply(classify_ip_version)
df_20250923['ip_version'] = df_20250923['network'].apply(classify_ip_version)

print(f"2025-06-13 IP Version Distribution:")
print(df_20250613['ip_version'].value_counts().sort_index())

print(f"\n2025-09-23 IP Version Distribution:")
print(df_20250923['ip_version'].value_counts().sort_index())


Classifying IP versions...
2025-06-13 IP Version Distribution:
ip_version
4    16450714
6     3667962
Name: count, dtype: int64

2025-09-23 IP Version Distribution:
ip_version
4    16899121
6     3495163
Name: count, dtype: int64


In [5]:
# Split by IP version
print("\nSplitting datasets by IP version...")

# IPv4 datasets - keep original networks
df_20250613_ipv4 = df_20250613[df_20250613['ip_version'] == 4].copy()
df_20250923_ipv4 = df_20250923[df_20250923['ip_version'] == 4].copy()

# IPv6 datasets - normalize to /64 prefixes
df_20250613_ipv6 = df_20250613[df_20250613['ip_version'] == 6].copy()
df_20250923_ipv6 = df_20250923[df_20250923['ip_version'] == 6].copy()

print(f"IPv4 datasets:")
print(f"- 2025-06-13: {len(df_20250613_ipv4):,} networks")
print(f"- 2025-09-23: {len(df_20250923_ipv4):,} networks")

print(f"\nIPv6 datasets (before /64 normalization):")
print(f"- 2025-06-13: {len(df_20250613_ipv6):,} networks") 
print(f"- 2025-09-23: {len(df_20250923_ipv6):,} networks")



Splitting datasets by IP version...
IPv4 datasets:
- 2025-06-13: 16,450,714 networks
- 2025-09-23: 16,899,121 networks

IPv6 datasets (before /64 normalization):
- 2025-06-13: 3,667,962 networks
- 2025-09-23: 3,495,163 networks


In [6]:
# Normalize IPv6 to /64 prefixes and aggregate
print("\nNormalizing IPv6 networks to /64 prefixes...")
df_20250613_ipv6['network_normalized'] = df_20250613_ipv6['network'].apply(normalize_ipv6_to_64)
df_20250923_ipv6['network_normalized'] = df_20250923_ipv6['network'].apply(normalize_ipv6_to_64)

print("Aggregating IPv6 data by /64 prefixes...")
df_20250613_ipv6_agg = aggregate_ipv6_by_prefix(df_20250613_ipv6)
df_20250923_ipv6_agg = aggregate_ipv6_by_prefix(df_20250923_ipv6)

print(f"IPv6 datasets (after /64 aggregation):")
print(f"- 2025-06-13: {len(df_20250613_ipv6_agg):,} /64 prefixes") 
print(f"- 2025-09-23: {len(df_20250923_ipv6_agg):,} /64 prefixes")



Normalizing IPv6 networks to /64 prefixes...
Aggregating IPv6 data by /64 prefixes...
IPv6 datasets (after /64 aggregation):
- 2025-06-13: 226,166 /64 prefixes
- 2025-09-23: 253,807 /64 prefixes


In [7]:
# Create output directory
output_dir = Path('split_data')
output_dir.mkdir(exist_ok=True)

print(f"Created output directory: {output_dir}")


Created output directory: split_data


In [8]:
# Export IPv4 datasets
print("Exporting IPv4 datasets...")

ipv4_20250613_path = output_dir / 'ipv4_20250613.csv'
ipv4_20250923_path = output_dir / 'ipv4_20250923.csv'

df_20250613_ipv4.to_csv(ipv4_20250613_path, index=False)
df_20250923_ipv4.to_csv(ipv4_20250923_path, index=False)

print(f"Exported: {ipv4_20250613_path} ({len(df_20250613_ipv4):,} rows)")
print(f"Exported: {ipv4_20250923_path} ({len(df_20250923_ipv4):,} rows)")


Exporting IPv4 datasets...
Exported: split_data/ipv4_20250613.csv (16,450,714 rows)
Exported: split_data/ipv4_20250923.csv (16,899,121 rows)


In [9]:
# Export IPv6 datasets (aggregated to /64 prefixes)
print("\nExporting IPv6 datasets (aggregated to /64 prefixes)...")

ipv6_20250613_path = output_dir / 'ipv6_64_20250613.csv'
ipv6_20250923_path = output_dir / 'ipv6_64_20250923.csv'

df_20250613_ipv6_agg.to_csv(ipv6_20250613_path, index=False)
df_20250923_ipv6_agg.to_csv(ipv6_20250923_path, index=False)

print(f"Exported: {ipv6_20250613_path} ({len(df_20250613_ipv6_agg):,} rows)")
print(f"Exported: {ipv6_20250923_path} ({len(df_20250923_ipv6_agg):,} rows)")



Exporting IPv6 datasets (aggregated to /64 prefixes)...
Exported: split_data/ipv6_64_20250613.csv (226,166 rows)
Exported: split_data/ipv6_64_20250923.csv (253,807 rows)


In [10]:
# Final summary
print("\n=== EXPORT COMPLETE ===")
print(f"All files exported to: {output_dir.absolute()}")

print("\nFiles created:")
for file_path in sorted(output_dir.glob('*')):
    if file_path.is_file():
        if file_path.suffix == '.csv':
            df_temp = pd.read_csv(file_path)
            print(f"  {file_path.name}: {len(df_temp):,} rows")
        else:
            print(f"  {file_path.name}: {file_path.stat().st_size} bytes")

print("\nReady for churn analysis!")



=== EXPORT COMPLETE ===
All files exported to: /home/py/Documents/aims-hackathon-fall25-vpnbench/split_data

Files created:


  df_temp = pd.read_csv(file_path)


  ipv4_20250613.csv: 16,450,714 rows


  df_temp = pd.read_csv(file_path)


  ipv4_20250923.csv: 16,899,121 rows
  ipv6_64_20250613.csv: 226,166 rows
  ipv6_64_20250923.csv: 253,807 rows

Ready for churn analysis!
