In [13]:
# --- Notebook-wide settings ---

# Directory that the notebook creates (when missing) to hold its results.
RESULT_DIR = "synthetic_topologies"

# CSV file the devices are loaded from. Alternative dataset kept for reference:
# DEVICES_DATASET_PATH = "eua-dataset/edge-servers/site-optus-melbCBD.csv"
DEVICES_DATASET_PATH = "eua-dataset/edge-servers/site.csv"

# Vendor names looked up (case-insensitively) inside each device's name.
VENDORS_TO_CONSIDER = [
    "Telstra",
    "Optus",
    "Vodafone",
    "Telecom",
    "Macquarie",
]

In [2]:
import os

# Make sure the output directory exists before anything is written to it.
# exist_ok=True keeps the cell safe to re-run and removes the check-then-create
# race of testing os.path.exists() first; it still raises FileExistsError if
# RESULT_DIR exists but is not a directory, instead of silently continuing.
os.makedirs(RESULT_DIR, exist_ok=True)

In [23]:
import pandas as pd

def load_devices_dataframe(path: str) -> pd.DataFrame:
    """
    Load the devices CSV and return it with normalized column names.

    Only the source columns that end up in the result are read, so any other
    columns in the file are neither parsed nor required to be present.

    Parameters
    ----------
    path : str
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        One row per CSV record with the columns ``device_id``, ``name``,
        ``latitude``, ``longitude`` and ``elevation``, in that order.

    Raises
    ------
    ValueError
        Raised by ``pandas.read_csv`` when one of the five source columns
        (``SITE_ID``, ``NAME``, ``LATITUDE``, ``LONGITUDE``, ``ELEVATION``)
        is missing from the file.
    """
    # Source header -> output name. The insertion order of this mapping is
    # what fixes the column order of the returned frame.
    column_names = {
        "SITE_ID": "device_id",
        "NAME": "name",
        "LATITUDE": "latitude",
        "LONGITUDE": "longitude",
        "ELEVATION": "elevation",
    }

    df = pd.read_csv(path, usecols=list(column_names))

    # read_csv keeps the file's column order regardless of usecols, so select
    # the renamed columns explicitly to get a stable output order.
    return df.rename(columns=column_names)[list(column_names.values())]
  
# Load the dataset selected in the configuration cell.
devices_df = load_devices_dataframe(DEVICES_DATASET_PATH)

# Report the number of rows, then let the notebook render the first few.
print("Dataset size:", devices_df.shape[0])
devices_df.head()

Dataset size: 95562


Unnamed: 0,device_id,name,latitude,longitude,elevation
0,1000,Fort Hill Wharf DARWIN,-12.471947,130.845073,
1,10000,Cnr Castlereagh & Lethbri PENRITH,-33.756158,150.698182,
2,10000002,Optus 50m Lattice Tower 71 Eastward Road Utakarra,-28.77766,114.63426,
3,10000003,6 Knuckey Street Darwin,-12.464597,130.840708,
4,10000004,Cape Wickham Links Clubhouse KING ISLAND,-39.5964,143.9339,


In [24]:
import re

# One alternation regex covering every vendor; re.escape keeps any regex
# metacharacters in a vendor name literal.
pattern = "|".join(map(re.escape, VENDORS_TO_CONSIDER))

# True for rows whose name mentions a vendor (case-insensitive); rows with a
# missing name count as non-matching.
mask = devices_df["name"].str.contains(pattern, case=False, na=False)

# Keep the matching rows, derive the upper-cased provider from the first
# vendor matched in each name, then drop the name column.
devices_df = (
    devices_df.loc[mask]
    .assign(
        provider=lambda frame: frame["name"]
        .str.extract(f"({pattern})", flags=re.IGNORECASE)[0]
        .str.upper()
    )
    .drop(columns=["name"])
)

print("Total devices after filtering:", devices_df.shape[0])

Total devices after filtering: 18822
