In [33]:
# ---------------------------------------------------------------------------
# Notebook-wide settings
# ---------------------------------------------------------------------------

# Directory that receives the files produced by this notebook.
RESULT_DIR = "synthetic_topologies"

# CSV file the devices are read from.
# Alternative input kept for reference:
#   DEVICES_DATASET_PATH = "eua-dataset/edge-servers/site-optus-melbCBD.csv"
DEVICES_DATASET_PATH = "eua-dataset/edge-servers/site.csv"

# Vendor names searched for (case-insensitively) in each device name;
# devices that mention none of them are discarded.
VENDORS_TO_CONSIDER = [
    "Telstra",
    "Optus",
    "Vodafone",
    "Telecom",
    "Macquarie",
]

In [34]:
import os

# Create the output directory (including missing parents). `exist_ok=True`
# makes the cell safe to re-run and removes the check-then-create race of a
# separate `os.path.exists` test; it also fails loudly if the path exists but
# is a regular file instead of silently continuing.
os.makedirs(RESULT_DIR, exist_ok=True)

In [35]:
import pandas as pd

def load_devices_dataframe(path: str) -> pd.DataFrame:
    """
    Load the device CSV at ``path`` and keep only the columns used downstream.

    The upper-case source headers are mapped to snake_case names, the site
    identifier becomes the index (labelled ``device_id``) and the frame is
    reduced to ``name``, ``latitude``, ``longitude`` and ``elevation``.
    """
    # Source header -> name used throughout the notebook
    column_aliases = {
        "SITE_ID": "device_id",
        "LATITUDE": "latitude",
        "LONGITUDE": "longitude",
        "NAME": "name",
        "STATE": "state",
        "LICENSING_AREA_ID": "licensing_area_id",
        "POSTCODE": "postcode",
        "SITE_PRECISION": "site_precision",
        "ELEVATION": "elevation",
        "HCIS_L2": "hcis_l2",
    }
    kept_columns = ["name", "latitude", "longitude", "elevation"]

    devices = (
        pd.read_csv(path)
        .rename(columns=column_aliases)
        .set_index("device_id", drop=False)
    )

    # Selecting by list discards every other column, including the
    # device_id column copy (the index itself is kept).
    return devices[kept_columns]
  
# Read the configured dataset, report how many rows it has and preview it
devices_df = load_devices_dataframe(path=DEVICES_DATASET_PATH)

print("Dataset size:", devices_df.shape[0])
devices_df.head()

Dataset size: 95562


Unnamed: 0_level_0,name,latitude,longitude,elevation
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000,Fort Hill Wharf DARWIN,-12.471947,130.845073,
10000,Cnr Castlereagh & Lethbri PENRITH,-33.756158,150.698182,
10000002,Optus 50m Lattice Tower 71 Eastward Road Utakarra,-28.77766,114.63426,
10000003,6 Knuckey Street Darwin,-12.464597,130.840708,
10000004,Cape Wickham Links Clubhouse KING ISLAND,-39.5964,143.9339,


In [36]:
import re

# One alternation of the (regex-escaped) vendor names, e.g. "Telstra|Optus|..."
pattern = "|".join(map(re.escape, VENDORS_TO_CONSIDER))

# Keep only the devices whose name mentions a vendor (case-insensitive;
# missing names count as "no match")
mask = devices_df["name"].str.contains(pattern, case=False, na=False)
devices_df = devices_df.loc[mask].copy()

# The first vendor found in each name, upper-cased, becomes the provider
matched_vendor = devices_df["name"].str.extract(f"({pattern})", flags=re.IGNORECASE)[0]
devices_df["provider"] = matched_vendor.str.upper()

# The name column is not kept once the provider has been extracted
devices_df = devices_df.drop(columns=["name"])

print("Total devices after filtering:", len(devices_df))
devices_df

Total devices after filtering: 18822


Unnamed: 0_level_0,latitude,longitude,elevation,provider
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000002,-28.777660,114.634260,,OPTUS
100001,-38.248652,144.605442,23.0,TELSTRA
10000114,-31.901910,152.533540,,OPTUS
100002,-37.728550,145.222007,116.0,OPTUS
10000215,-32.981570,121.644400,,TELSTRA
...,...,...,...,...
9954,-34.819950,147.902049,714.0,TELSTRA
9958,-34.971752,147.998115,709.0,TELSTRA
9967,-36.130494,144.750901,100.0,TELSTRA
9980,-33.754568,150.716963,48.0,OPTUS


In [37]:
# Write the filtered devices (index included) to devices.csv inside RESULT_DIR
devices_csv_path = os.path.join(RESULT_DIR, "devices.csv")
devices_df.to_csv(devices_csv_path)

In [39]:
import numpy as np
from typing import Dict, List, Tuple, Optional

def _default_resource_config() -> Dict:
    """Return a fresh copy of the built-in resource-assignment configuration."""
    return {
        'global': {
            'group_percentages': {1: 33.33, 2: 33.33, 3: 33.34},
            'group_ranges': {1: (0, 33), 2: (33, 66), 3: (66, 100)}
        },
        'attributes': {
            'available_RAM': {'min': 1, 'max': 128},
            'available_Storage': {'min': 10, 'max': 2000},
            'available_vCPU': {'min': 1, 'max': 64},
            'GPU_available': {'group_percentages': {1: 10, 2: 30, 3: 60}},
            'TPU_available': {'group_percentages': {1: 5, 2: 15, 3: 40}}
        }
    }


def _resolve_resource_config(config: Optional[Dict]) -> Dict:
    """Merge a user configuration with the defaults without touching the caller's dict."""
    import copy  # local import: only this helper needs it

    defaults = _default_resource_config()
    if config is None:
        return defaults

    # Work on a deep copy so filling in defaults never leaks into the caller's object
    resolved = copy.deepcopy(config)

    if 'global' not in resolved:
        resolved['global'] = defaults['global']
    else:
        global_cfg = resolved['global']
        if 'group_percentages' not in global_cfg:
            global_cfg['group_percentages'] = defaults['global']['group_percentages']
        else:
            # When the configured shares sum to less than 100, split the
            # remainder evenly between the groups (1-3) that were left out
            shares = global_cfg['group_percentages']
            remaining = 100 - sum(shares.values())
            unconfigured = [g for g in (1, 2, 3) if g not in shares]
            if remaining > 0 and unconfigured:
                for g in unconfigured:
                    shares[g] = remaining / len(unconfigured)
        global_cfg.setdefault('group_ranges', defaults['global']['group_ranges'])

    if 'attributes' not in resolved:
        resolved['attributes'] = defaults['attributes']
    else:
        for attr, attr_defaults in defaults['attributes'].items():
            resolved['attributes'].setdefault(attr, attr_defaults)

    return resolved


def _sample_local_distribution(
    rng,
    buckets: List[Tuple[float, float, float]],
    group_size: int,
    range_min: float,
    range_max: float,
) -> np.ndarray:
    """Draw ``group_size`` values split over sub-ranges of [range_min, range_max)."""
    width = range_max - range_min
    parts = []
    for pct_devices, pct_min, pct_max in buckets:
        count = int(group_size * pct_devices / 100)
        sub_min = range_min + width * pct_min / 100
        sub_max = range_min + width * pct_max / 100
        parts.append(rng.uniform(sub_min, sub_max, count))

    samples = np.concatenate(parts) if parts else np.empty(0)

    # Integer truncation of the bucket sizes can leave a few devices without a
    # value: draw those from the whole group range. Surplus values are dropped.
    shortfall = group_size - len(samples)
    if shortfall > 0:
        samples = np.concatenate([samples, rng.uniform(range_min, range_max, shortfall)])
    return samples[:group_size]


def assign_device_resources(
    df: pd.DataFrame,
    config: Optional[Dict] = None,
    seed: Optional[int] = None
) -> pd.DataFrame:
    """
    Assigns randomly generated resources to each device, tiered by capacity group.

    Every device is first placed in a global group; each attribute is then
    drawn from the slice of its [min, max] range that belongs to the device's
    own group, so ``global_group`` and the resource columns are consistent
    row by row.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with devices. It is not modified; a copy is returned.
    config : Dict, optional
        Configuration for resource assignment. It is not modified; missing
        items are filled from the defaults on an internal copy. Structure:
        {
            'global': {
                'group_percentages': {1: 33.0, 2: 33.0, 3: 34.0},  # Percentage of devices in each group
                'group_ranges': {1: (0, 33), 2: (33, 66), 3: (66, 100)}  # % range for each group
            },
            'attributes': {
                'available_RAM': {
                    'min': 1,
                    'max': 128,
                    'local_distribution': {
                        1: [(60, 0, 60), (40, 60, 100)],  # Group 1: (% devices, % min range, % max range)
                        2: [(50, 0, 50), (50, 50, 100)],
                        3: [(30, 0, 40), (70, 40, 100)]
                    }
                },
                'available_Storage': {
                    'min': 10,
                    'max': 2000
                },
                'available_vCPU': {
                    'min': 1,
                    'max': 64
                },
                'GPU_available': {
                    'group_percentages': {1: 10, 2: 30, 3: 60}  # % of true per group
                },
                'TPU_available': {
                    'group_percentages': {1: 5, 2: 15, 3: 40}
                }
            }
        }
    seed : int, optional
        When given, all random draws come from ``np.random.default_rng(seed)``
        so the result is reproducible. When omitted, NumPy's global random
        state is used (seedable with ``np.random.seed``).

    Returns:
    --------
    pd.DataFrame
        Copy of ``df`` with the columns ``global_group``, ``available_RAM``,
        ``available_Storage``, ``available_vCPU`` (integers, truncated from the
        sampled floats), ``GPU_available`` and ``TPU_available`` (booleans).

    Raises:
    -------
    KeyError
        If a group that received devices has no entry in ``group_ranges``.
    """
    resolved = _resolve_resource_config(config)

    # Both the Generator and the np.random module expose shuffle() and uniform()
    rng = np.random.default_rng(seed) if seed is not None else np.random

    df_result = df.copy()
    n_devices = len(df_result)

    group_percentages = resolved['global']['group_percentages']
    group_ranges = resolved['global']['group_ranges']

    # Build the group labels: one label per device, proportional to the shares
    labels = []
    for group, percentage in sorted(group_percentages.items()):
        labels.extend([group] * int(n_devices * percentage / 100))

    # Truncation (or shares below 100%) can leave devices unlabelled: they go
    # to group 3. Surplus labels (shares above 100%) are cut off.
    labels.extend([3] * (n_devices - len(labels)))
    labels = labels[:n_devices]

    # Shuffle so that group membership is independent of row order
    rng.shuffle(labels)
    df_result['global_group'] = labels

    # Positional view of the labels, used to write each group's values back
    # to exactly the rows that belong to that group
    group_of = np.asarray(labels)
    present_groups = sorted(set(labels))

    # Integer attributes
    for attr in ('available_RAM', 'available_Storage', 'available_vCPU'):
        attr_config = resolved['attributes'][attr]
        min_val = attr_config['min']
        max_val = attr_config['max']
        span = max_val - min_val
        local_distribution = attr_config.get('local_distribution', {})

        column = np.zeros(n_devices, dtype=int)

        for group in present_groups:
            positions = np.flatnonzero(group_of == group)
            group_size = len(positions)

            # Slice of the attribute range that belongs to this group
            low_pct, high_pct = group_ranges[group]
            range_min = min_val + span * low_pct / 100
            range_max = min_val + span * high_pct / 100

            if group in local_distribution:
                samples = _sample_local_distribution(
                    rng, local_distribution[group], group_size, range_min, range_max
                )
            else:
                samples = rng.uniform(range_min, range_max, group_size)

            # Truncate to integers and store on the rows of this group only
            column[positions] = np.asarray(samples, dtype=float).astype(int)

        df_result[attr] = column

    # Boolean attributes
    for attr in ('GPU_available', 'TPU_available'):
        true_shares = resolved['attributes'][attr].get('group_percentages', {})

        column = np.zeros(n_devices, dtype=bool)

        for group in present_groups:
            positions = np.flatnonzero(group_of == group)
            group_size = len(positions)

            # Groups without an explicit share default to 50% True
            true_pct = true_shares.get(group, 50)
            true_count = max(0, min(int(group_size * true_pct / 100), group_size))

            flags = np.zeros(group_size, dtype=bool)
            flags[:true_count] = True
            rng.shuffle(flags)

            column[positions] = flags

        df_result[attr] = column

    return df_result

In [None]:
# Example usage with custom configuration

# Share of devices per group and the slice of each attribute range they draw from
group_settings = {
    'group_percentages': {1: 40, 2: 35, 3: 25},  # More devices in group 1 (low capacity)
    'group_ranges': {1: (0, 33), 2: (33, 66), 3: (66, 100)}
}

# local_distribution entries are (% devices, % min range, % max range) per group
ram_settings = {
    'min': 2,
    'max': 128,
    'local_distribution': {
        1: [(70, 0, 50), (30, 50, 100)],  # Group 1: majority in low range
        2: [(50, 20, 70), (50, 70, 100)], # Group 2: medium distribution
        3: [(20, 0, 40), (80, 40, 100)]   # Group 3: majority in high range
    }
}

storage_settings = {
    'min': 20,
    'max': 2000,
    'local_distribution': {
        1: [(80, 0, 60), (20, 60, 100)],
        2: [(40, 0, 50), (60, 50, 100)],
        3: [(30, 0, 50), (70, 50, 100)]
    }
}

# No local distribution: values are spread uniformly inside each group's range
vcpu_settings = {'min': 1, 'max': 64}

# Percentage of devices per group that have the accelerator
gpu_settings = {'group_percentages': {1: 5, 2: 25, 3: 70}}  # More GPUs on powerful devices
tpu_settings = {'group_percentages': {1: 2, 2: 10, 3: 50}}  # TPUs mainly in group 3

custom_config = {
    'global': group_settings,
    'attributes': {
        'available_RAM': ram_settings,
        'available_Storage': storage_settings,
        'available_vCPU': vcpu_settings,
        'GPU_available': gpu_settings,
        'TPU_available': tpu_settings
    }
}

# Seed NumPy's global random state so the random assignment below produces
# the same result on every Restart & Run All
RESOURCE_SEED = 42
np.random.seed(RESOURCE_SEED)

# Apply the function
devices_df = assign_device_resources(devices_df, custom_config)

print("\nDevices per group:")
print(devices_df['global_group'].value_counts().sort_index())
print("\nResource statistics:")
print(devices_df[['available_RAM', 'available_Storage', 'available_vCPU']].describe())
print("\nAccelerator availability:")
for accelerator in ('GPU_available', 'TPU_available'):
    # Count of devices with the accelerator and its share of all devices
    enabled = devices_df[accelerator].sum()
    print(f"{accelerator}: {enabled} ({enabled / len(devices_df) * 100:.1f}%)")

devices_df.head(10)