## Selecting the EC2 instance types based on compute, cost and availability criteria

In [1]:
import os
import re
import requests
import boto3
import json
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple

In [2]:
import pandas as pd

# Raise pandas' display limits so the instance-type tables rendered below
# show up to 100 rows and 50 columns before being truncated.
for _option_name, _option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 50),
):
    pd.set_option(_option_name, _option_value)

In [4]:
# Pin the AWS region (both environment variable spellings) and the credentials
# profile for this session.
# AWS_PROFILE should be set to the AWS profile you want to use.
os.environ.update(
    {
        'AWS_DEFAULT_REGION': "us-west-2",
        'AWS_REGION': "us-west-2",
        'AWS_PROFILE': "sandbox",
    }
)

In [5]:
from aibs_informatics_aws_utils.core import get_region
from aibs_informatics_aws_utils.ec2 import (
    describe_instance_types_by_props,
    get_instance_type_on_demand_price,
    get_instance_type_spot_price,
)

In [7]:

def get_instance_type_spot_interruptions(
    os: Literal["Linux", "Windows"] = "Linux",
    region: Optional[str] = None
) -> Dict[str, Tuple[float, float]]:
    """Gets the spot interruption rate for every instance type listed by the Spot Advisor

    The data is downloaded from the public Spot Advisor JSON document:
    https://stackoverflow.com/a/61526188/4544508

    Args:
        os (Literal["Linux", "Windows"]): Operating system section of the advisor data
            to read. NOTE: this parameter shadows the ``os`` module inside this function.
        region (Optional[str]): AWS region to read. Defaults to ``get_region()``.

    Returns:
        Dict[str, Tuple[float, float]]: A dictionary of instance types and their corresponding
            spot interruption rates. The interruption rate is a tuple of (lower, upper) bounds

    Raises:
        requests.HTTPError: If the advisor endpoint answers with an error status.
        requests.Timeout: If the advisor endpoint does not answer in time.
        KeyError: If the region or operating system is missing from the advisor data.
    """
    url_interruptions = "https://spot-bid-advisor.s3.amazonaws.com/spot-advisor-data.json"
    # Without a timeout a stalled connection would block forever; without the status
    # check an error page would only surface later as a confusing JSON/KeyError.
    response = requests.get(url=url_interruptions, timeout=30)
    response.raise_for_status()
    spot_advisor = response.json()['spot_advisor']

    region = region or get_region()

    # The advisor encodes the interruption frequency as a small integer bucket
    bounds_by_rate: Dict[int, Tuple[float, float]] = {
        0: (0., 0.05),
        1: (0.05, 0.10),
        2: (0.10, 0.15),
        3: (0.15, 0.20),
    }
    # NOTE: Upper limit is not specified by data, rather just a high number to indicate
    #       that the instance type is not recommended for spot
    highest_bounds = (0.2, 0.65)

    interruption_rate_by_instance_type: Dict[str, Tuple[float, float]] = {}
    for it, it_data in spot_advisor[region][os].items():
        try:
            rate = it_data['r']
        except KeyError:
            print(f"warning: {it} not found in spot advisor data")
            continue
        interruption_rate_by_instance_type[it] = bounds_by_rate.get(rate, highest_bounds)

    return interruption_rate_by_instance_type


def instance_type_sort_key(instance_type: str) -> Tuple[str, int, int]:
    """Converts Instance Type into sort key (family, size rank, factor)

    Size Rank:
        0. nano
        1. micro
        2. small
        3. medium
        4. large
        5. metal

    Factor:
        - 0 when the size has no "x" multiplier (e.g. "large", "metal")
        - 1 for a bare "x" (e.g. "xlarge")
        - N for "Nx" sizes (e.g. "2xlarge") and for sized bare-metal
          variants (e.g. "metal-24xl")

    Examples:
        - c5.2xlarge -> ('c5', 4, 2)
        - m7i-flex.metal -> ('m7i-flex', 5, 0)
        - m7i.metal-24xl -> ('m7i', 5, 24)

    Args:
        instance_type (str): The instance type to split

    Returns:
        Tuple[str, int, int]: The instance type components (family, size rank, factor)

    Raises:
        ValueError: If the instance type does not look like "<family>.<size>"
    """
    # Split instance type into family, optional "Nx" multiplier, size and
    # optional "-Nxl" suffix (only meaningful for sized bare-metal variants)
    pattern = re.compile(
        r'([\w-]+)\.((\d*)x)?(nano|micro|small|medium|large|metal)(?:-(\d+)xl)?'
    )
    match = pattern.match(instance_type)

    if match is None:
        raise ValueError(f"Invalid instance type: {instance_type}. Cannot match regex {pattern}")

    family, factorstr, factornum, size, metal_factor = match.groups()

    # Define a dictionary to map sizes to numbers for sorting
    size_dict = {'nano': 0, 'micro': 1, 'small': 2, 'medium': 3, 'large': 4, 'metal': 5}
    size_rank = size_dict[size]

    if size == 'metal' and metal_factor:
        # Sized bare-metal variants would otherwise all share factor 0 and sort arbitrarily
        factor = int(metal_factor)
    elif factornum:
        # Size is a number followed by 'x' (e.g. "12xlarge"): use the number
        factor = int(factornum)
    elif factorstr:
        # A bare "x" (e.g. "xlarge") counts as a multiplier of one
        factor = 1
    else:
        factor = 0
    return (family, size_rank, factor)


def network_performance_sort_key(network_performance: str) -> float:
    """Converts network performance description into a numerical sort key

    Args:
        network_performance (str): The network performance description
            e.g. "Low", "Moderate", "High", "Up to 10 Gigabit", "25 Gigabit", etc.

    Returns:
        float: The upper limit network performance value in Gbps

    Raises:
        ValueError: If the description is neither a known qualitative label
            nor contains a "<number> Gigabit" value
    """
    # If it matches a pattern like "10 Gigabit", "12.5 Gigabit", etc.
    # The decimal point must be escaped: an unescaped "." matches any character,
    # which lets unrelated text leak into the captured number.
    pattern = re.compile(r"(\d+(?:\.\d*)?)\s*Gigabit")
    # These are approximate values for the network performance
    conversion_dict = {
        "Low": 0.05,
        "Moderate": 0.3,
        "High": 1.0,
    }
    if network_performance in conversion_dict:
        return conversion_dict[network_performance]
    elif match := pattern.search(network_performance):
        return float(match.group(1))
    else:
        raise ValueError(f"Invalid network performance: {network_performance}")


### Get Instance Type Info List from AWS

In [8]:
# Selection criteria passed to the project helper that lists EC2 instance types.
# NOTE(review): the units of `memory_limits` are not visible here (presumably MiB) - confirm.
_instance_type_filters = dict(
    architectures=["x86_64"],
    vcpu_limits=(1, 256),
    memory_limits=(1, 1024 * 1024),
    gpu_limits=(0, 0),
    on_demand_support=True,
    spot_support=True,
    regions=['us-west-2'],
)
it_info_list = describe_instance_types_by_props(**_instance_type_filters)
# Show how many instance types matched
len(it_info_list)

360

### Add Pricing and Spot Interruption Info

In [9]:
# Attach on-demand and spot prices to every instance type that supports both
# usage classes; entries that lack either one are left without a 'Pricing' key.
_required_usage_classes = {"spot", "on-demand"}
for it_info in it_info_list:
    if not _required_usage_classes.issubset(it_info['SupportedUsageClasses']):
        continue
    _instance_type_name = it_info['InstanceType']
    _on_demand_price = get_instance_type_on_demand_price("us-west-2", _instance_type_name)
    _spot_price = get_instance_type_spot_price("us-west-2", _instance_type_name)
    it_info['Pricing'] = {"OnDemand": _on_demand_price, "Spot": _spot_price}
    

In [10]:
it_spot_interruptions = get_instance_type_spot_interruptions(region="us-west-2")

# Instance types missing from the spot advisor data have an UNKNOWN interruption rate.
# Use NaN rather than (0., 0.): a zero default would label them as "never interrupted"
# and let them pass any `SpotInterruptionRateUpper <= threshold` filter, whereas
# comparisons against NaN are False, so unknown types are excluded from spot selections.
UNKNOWN_INTERRUPTION_RATE = (float("nan"), float("nan"))
for it_info in it_info_list:
    it_info['SpotInterruptionRate'] = it_spot_interruptions.get(
        it_info['InstanceType'], UNKNOWN_INTERRUPTION_RATE
    )

### Convert to Dataframe and expand columns for pricing / compute ratios

In [11]:
# One row per instance type, indexed by the instance type name
it_info_df = pd.DataFrame(
    it_info_list,
    index=[info['InstanceType'] for info in it_info_list],
)

# Expand the nested dictionary columns into flat "<column>.<key>" columns
for column in ["VCpuInfo", "Pricing", "MemoryInfo", "NetworkInfo"]:
    # Only the first row is inspected to decide whether the column holds dictionaries
    if not isinstance(it_info_df[column].iloc[0], dict):
        continue
    flattened = pd.json_normalize(it_info_df[column]).add_prefix(f'{column}.')
    # json_normalize returns a fresh positional index; restore the instance type index
    flattened.index = it_info_df.index
    it_info_df = it_info_df.drop(columns=column).join(flattened)


In [12]:
# Split "<family>.<size>" instance type names into their two components
it_info_df["InstanceFamily"] = it_info_df["InstanceType"].map(lambda name: name.split(".")[0])
it_info_df["InstanceSize"] = it_info_df["InstanceType"].map(lambda name: name.split(".")[1])

# Memory in GiB and network performance as a number
it_info_df["MemoryInfo.SizeInGiB"] = it_info_df["MemoryInfo.SizeInMiB"].div(1024)
it_info_df["NetworkInfo.NetworkPerformanceGbps"] = (
    it_info_df["NetworkInfo.NetworkPerformance"].map(network_performance_sort_key)
)

# Spot price expressed as a fraction of the on-demand price
it_info_df["PricingSpotOnDemandRatio"] = it_info_df["Pricing.Spot"].div(it_info_df["Pricing.OnDemand"])

# On-demand price per vCPU and per GiB of memory
it_info_df["PricingOnDemandPerVcpu"] = it_info_df["Pricing.OnDemand"].div(it_info_df["VCpuInfo.DefaultVCpus"])
it_info_df["PricingOnDemandPerMemory"] = it_info_df["Pricing.OnDemand"].div(it_info_df["MemoryInfo.SizeInGiB"])

# Unpack the (lower, upper) interruption rate bounds into separate columns
it_info_df["SpotInterruptionRateLower"] = it_info_df["SpotInterruptionRate"].map(lambda bounds: bounds[0])
it_info_df["SpotInterruptionRateUpper"] = it_info_df["SpotInterruptionRate"].map(lambda bounds: bounds[1])

# Rough apples-to-apples price: scale the per-vCPU price to 16 vCPUs and the
# per-GiB price to 64 GiB, then average the two. The result approximates the
# price of a 16 core / 64 GB machine regardless of the actual vCPU:memory ratio.
COMPUTE_COEFF = 16.
MEM_COEFF = 64.
_vcpu_component = it_info_df["PricingOnDemandPerVcpu"] * COMPUTE_COEFF
_memory_component = it_info_df["PricingOnDemandPerMemory"] * MEM_COEFF
it_info_df["PricingOnDemandPerCompute"] = (_vcpu_component + _memory_component) / 2
it_info_df["PricingSpotPerCompute"] = (
    it_info_df["PricingOnDemandPerCompute"] * it_info_df["PricingSpotOnDemandRatio"]
)

# Columns shown (in this order) when inspecting the table
priority_columns = (
    ["InstanceType", "InstanceFamily", "InstanceSize"]
    + ["VCpuInfo.DefaultVCpus", "MemoryInfo.SizeInGiB", "NetworkInfo.NetworkPerformanceGbps"]
    + ["PricingOnDemandPerCompute", "PricingSpotPerCompute"]
    + ["PricingOnDemandPerVcpu", "PricingOnDemandPerMemory", "PricingSpotOnDemandRatio"]
    + ["SpotInterruptionRateLower", "SpotInterruptionRateUpper"]
    + ["Pricing.OnDemand", "Pricing.Spot"]
)

In [13]:
# Count how many instance types report each raw network performance description
it_info_df["NetworkInfo.NetworkPerformance"].value_counts()


NetworkInfo.NetworkPerformance
Up to 12.5 Gigabit    72
Up to 10 Gigabit      54
50 Gigabit            39
25 Gigabit            37
Up to 25 Gigabit      20
37.5 Gigabit          19
18.75 Gigabit         16
12.5 Gigabit          16
Up to 5 Gigabit       14
12 Gigabit            13
10 Gigabit            11
20 Gigabit            10
100 Gigabit           10
75 Gigabit             6
200 Gigabit            4
Up to 15 Gigabit       4
150 Gigabit            2
Up to 50 Gigabit       2
High                   2
Up to 30 Gigabit       2
Up to 40 Gigabit       2
56.25 Gigabit          1
Up to 12 Gigabit       1
40 Gigabit             1
28.12 Gigabit          1
Moderate               1
Name: count, dtype: int64

In [14]:
# The 20 instance types with the highest on-demand price per compute unit
it_info_df[priority_columns].sort_values("PricingOnDemandPerCompute", ascending=False).head(20)

Unnamed: 0,InstanceType,InstanceFamily,InstanceSize,VCpuInfo.DefaultVCpus,MemoryInfo.SizeInGiB,NetworkInfo.NetworkPerformanceGbps,PricingOnDemandPerCompute,PricingSpotPerCompute,PricingOnDemandPerVcpu,PricingOnDemandPerMemory,PricingSpotOnDemandRatio,SpotInterruptionRateLower,SpotInterruptionRateUpper,Pricing.OnDemand,Pricing.Spot
inf2.xlarge,inf2.xlarge,inf2,xlarge,4,16.0,15.0,3.0328,0.3364,0.18955,0.047387,0.110921,0.2,0.65,0.7582,0.0841
d3en.xlarge,d3en.xlarge,d3en,xlarge,4,16.0,25.0,2.104,0.322,0.1315,0.032875,0.153042,0.0,0.05,0.526,0.0805
d2.xlarge,d2.xlarge,d2,xlarge,4,30.5,0.3,2.103934,0.528118,0.1725,0.022623,0.251014,0.2,0.65,0.69,0.1732
d2.4xlarge,d2.4xlarge,d2,4xlarge,16,122.0,1.0,2.103934,0.306595,0.1725,0.022623,0.145725,0.2,0.65,2.76,0.4022
d2.2xlarge,d2.2xlarge,d2,2xlarge,8,61.0,1.0,2.103934,0.272749,0.1725,0.022623,0.129638,0.2,0.65,1.38,0.1789
d3en.4xlarge,d3en.4xlarge,d3en,4xlarge,16,64.0,25.0,2.103,0.742,0.131438,0.032859,0.352829,0.0,0.05,2.103,0.742
d3en.12xlarge,d3en.12xlarge,d3en,12xlarge,48,192.0,75.0,2.10288,0.223367,0.13143,0.032857,0.106219,0.0,0.05,6.30864,0.6701
d3en.8xlarge,d3en.8xlarge,d3en,8xlarge,32,128.0,50.0,2.10288,0.24285,0.13143,0.032857,0.115484,0.0,0.05,4.20576,0.4857
d3en.6xlarge,d3en.6xlarge,d3en,6xlarge,24,96.0,40.0,2.102667,0.2262,0.131417,0.032854,0.107578,0.0,0.05,3.154,0.3393
d3en.2xlarge,d3en.2xlarge,d3en,2xlarge,8,32.0,25.0,2.102,0.2314,0.131375,0.032844,0.110086,0.0,0.05,1.051,0.1157


In [15]:
# The 20 instance types with the lowest on-demand price per compute unit
# (the sort is descending, so the cheapest entries are at the tail)
it_info_df[priority_columns].sort_values("PricingOnDemandPerCompute", ascending=False).tail(20)

Unnamed: 0,InstanceType,InstanceFamily,InstanceSize,VCpuInfo.DefaultVCpus,MemoryInfo.SizeInGiB,NetworkInfo.NetworkPerformanceGbps,PricingOnDemandPerCompute,PricingSpotPerCompute,PricingOnDemandPerVcpu,PricingOnDemandPerMemory,PricingSpotOnDemandRatio,SpotInterruptionRateLower,SpotInterruptionRateUpper,Pricing.OnDemand,Pricing.Spot
r5a.8xlarge,r5a.8xlarge,r5a,8xlarge,32,256.0,10.0,0.678,0.338813,0.0565,0.007063,0.499723,0.1,0.15,1.808,0.9035
r5a.2xlarge,r5a.2xlarge,r5a,2xlarge,8,64.0,10.0,0.678,0.29925,0.0565,0.007063,0.441372,0.05,0.1,0.452,0.1995
r5a.16xlarge,r5a.16xlarge,r5a,16xlarge,64,512.0,12.0,0.678,0.268388,0.0565,0.007063,0.395852,0.2,0.65,3.616,1.4314
r5a.4xlarge,r5a.4xlarge,r5a,4xlarge,16,128.0,10.0,0.678,0.32415,0.0565,0.007063,0.478097,0.1,0.15,0.904,0.4322
r5a.12xlarge,r5a.12xlarge,r5a,12xlarge,48,384.0,10.0,0.678,0.35,0.0565,0.007063,0.516224,0.15,0.2,2.712,1.4
r5a.large,r5a.large,r5a,large,2,16.0,10.0,0.678,0.3426,0.0565,0.007063,0.50531,0.0,0.05,0.113,0.0571
t3.xlarge,t3.xlarge,t3,xlarge,4,16.0,5.0,0.6656,0.2572,0.0416,0.0104,0.386418,0.0,0.05,0.1664,0.0643
t3.large,t3.large,t3,large,2,8.0,5.0,0.6656,0.2512,0.0416,0.0104,0.377404,0.0,0.05,0.0832,0.0314
t3.2xlarge,t3.2xlarge,t3,2xlarge,8,32.0,5.0,0.6656,0.3098,0.0416,0.0104,0.465445,0.05,0.1,0.3328,0.1549
t3a.2xlarge,t3a.2xlarge,t3a,2xlarge,8,32.0,5.0,0.6016,0.1884,0.0376,0.0094,0.313165,0.0,0.05,0.3008,0.0942


### Filtering for Instance Types that meet criteria 

In [16]:
# Burstable "T" families (t2, t3, t3a, ...) are a "t" followed directly by the generation
# digit. Matching on that, rather than on any family starting with "t", avoids discarding
# unrelated families whose name merely begins with "t" (e.g. trn1).
is_burstable = it_info_df["InstanceFamily"].str.match(r"t\d")

available = it_info_df[
    # Filter out unusable instance types which are considered greater than 1 TB of memory or 256 vcpus
    (it_info_df["MemoryInfo.SizeInGiB"] <= 1024) & (it_info_df["VCpuInfo.DefaultVCpus"] <= 256)
    # Filter out T* instances - they are burstable and not suitable for our use case
    & ~is_burstable
    # Filter out instances with less than 1 Gbps network performance
    & (it_info_df["NetworkInfo.NetworkPerformanceGbps"] >= 1.0)
]

# Price cap shared by the transfer and lambda selections:
# less than $1.50 per 16 cores / 64 GB machine (roughly)
under_relaxed_price_cap = available["PricingOnDemandPerCompute"] < 1.50

target_on_demand = available[
    # Limit price to less than $1 per 16 cores / 64 GB machine (roughly)
    (available["PricingOnDemandPerCompute"] < 1.0)
]

target_spot = target_on_demand[
    # Filter out spot instances with interruption rates greater than 15%
    (target_on_demand["SpotInterruptionRateUpper"] <= 0.15)
]

target_transfer = available[
    # Requires at most 8 GB of memory and 4 vcpus (ensures more instances are used for faster network transfer)
    (available["MemoryInfo.SizeInGiB"] <= 8)
    & (available["VCpuInfo.DefaultVCpus"] <= 4)
    & under_relaxed_price_cap
    # Filter out instances with less than 10 Gbps network performance
    & (available["NetworkInfo.NetworkPerformanceGbps"] >= 10.0)
]

target_lambda_small = available[
    # Filter to instance types with at most 4 GB of memory
    (available["MemoryInfo.SizeInGiB"] <= 4)
    & under_relaxed_price_cap
]

target_lambda_medium = available[
    # Filter to instance types with more than 4 GB and at most 8 GB of memory
    (available["MemoryInfo.SizeInGiB"] > 4) & (available["MemoryInfo.SizeInGiB"] <= 8)
    & under_relaxed_price_cap
]

target_lambda_large = available[
    # Filter to instance types with more than 8 GB and at most 16 GB of memory
    (available["MemoryInfo.SizeInGiB"] > 8) & (available["MemoryInfo.SizeInGiB"] <= 16)
    & under_relaxed_price_cap
]

In [17]:
# Price per compute unit, keyed by instance type name
price_on_demand_per_compute = it_info_df["PricingOnDemandPerCompute"].to_dict()

# One column per target list; `data` holds the member instance types of each list
data_cols = ['On-Demand', 'Spot', 'Transfer', 'Lambda (S)', 'Lambda (M)', 'Lambda (L)']
data = [
    frame.index.tolist()
    for frame in (
        target_on_demand,
        target_spot,
        target_transfer,
        target_lambda_small,
        target_lambda_medium,
        target_lambda_large,
    )
]
# Create a format string that pads each column dynamically
format_string = " ".join(["{:<10}"] * len(data_cols))

# Header row
header_cells = format_string.format(*data_cols)
print("{:<20} {:} {:<10}".format('Instance Type', header_cells, 'Price per 16/64GiB'))

# One row per instance type: "x" where the type belongs to a target list, "-" otherwise
for instance_type in sorted(price_on_demand_per_compute, key=instance_type_sort_key):
    membership_marks = ["x" if instance_type in members else "-" for members in data]
    row_cells = format_string.format(*membership_marks)
    row_price = price_on_demand_per_compute.get(instance_type, 0.)
    print("{:<20} {:} {:.3f}".format(instance_type, row_cells, row_price))

Instance Type        On-Demand  Spot       Transfer   Lambda (S) Lambda (M) Lambda (L) Price per 16/64GiB
c5.large             -          -          x          x          -          -          1.020
c5.xlarge            -          -          x          -          x          -          1.020
c5.2xlarge           -          -          -          -          -          x          1.020
c5.4xlarge           -          -          -          -          -          -          1.020
c5.9xlarge           -          -          -          -          -          -          1.020
c5.12xlarge          -          -          -          -          -          -          1.020
c5.18xlarge          -          -          -          -          -          -          1.020
c5.24xlarge          -          -          -          -          -          -          1.020
c5.metal             -          -          -          -          -          -          1.020
c5a.large            x          x          x          x  

### Generate New Lists for Instance Types that meet criteria

You can use the lists printed below to replace the corresponding variables in the `gcs_infra.stacks.gwo.batch.instance_types` module.

In [18]:
def print_types(var_name: str, instance_types: List[str]):
    """Print a ``<var_name>: List[str] = [...]`` assignment for the given instance types

    The instance types are ordered with ``instance_type_sort_key`` and rendered
    as a JSON array indented by four spaces.

    Args:
        var_name (str): Name of the variable to emit
        instance_types (List[str]): Instance types to list
    """
    ordered_types = sorted(instance_types, key=instance_type_sort_key)
    rendered_list = json.dumps(ordered_types, indent=4)
    print(f"{var_name}: List[str] = {rendered_list}")


In [19]:
# Emit one variable assignment per target list, each followed by blank lines
for _variable_name, _target_frame in (
    ("ON_DEMAND_INSTANCE_TYPES", target_on_demand),
    ("SPOT_INSTANCE_TYPES", target_spot),
    ("TRANSFER_INSTANCE_TYPES", target_transfer),
    ("LAMBDA_SMALL_INSTANCE_TYPES", target_lambda_small),
    ("LAMBDA_MEDIUM_INSTANCE_TYPES", target_lambda_medium),
    ("LAMBDA_LARGE_INSTANCE_TYPES", target_lambda_large),
):
    print_types(_variable_name, _target_frame["InstanceType"].tolist())
    print("\n")


ON_DEMAND_INSTANCE_TYPES: List[str] = [
    "c5a.large",
    "c5a.xlarge",
    "c5a.2xlarge",
    "c5a.4xlarge",
    "c5a.8xlarge",
    "c5a.12xlarge",
    "c5a.16xlarge",
    "c5a.24xlarge",
    "c6a.large",
    "c6a.xlarge",
    "c6a.2xlarge",
    "c6a.4xlarge",
    "c6a.8xlarge",
    "c6a.12xlarge",
    "c6a.16xlarge",
    "c6a.24xlarge",
    "c6a.32xlarge",
    "c6a.48xlarge",
    "c6a.metal",
    "i3.large",
    "i3.xlarge",
    "i3.2xlarge",
    "i3.4xlarge",
    "i3.8xlarge",
    "i3.16xlarge",
    "inf2.8xlarge",
    "m5.large",
    "m5.xlarge",
    "m5.2xlarge",
    "m5.4xlarge",
    "m5.8xlarge",
    "m5.12xlarge",
    "m5.16xlarge",
    "m5.24xlarge",
    "m5.metal",
    "m5a.large",
    "m5a.xlarge",
    "m5a.2xlarge",
    "m5a.4xlarge",
    "m5a.8xlarge",
    "m5a.12xlarge",
    "m5a.16xlarge",
    "m5a.24xlarge",
    "m5ad.large",
    "m5ad.xlarge",
    "m5ad.2xlarge",
    "m5ad.4xlarge",
    "m5ad.8xlarge",
    "m5ad.12xlarge",
    "m5ad.16xlarge",
    "m5ad.24xlarge",
 