In [1]:
import numpy as np
from orchestrator import Orchestrator
import asyncio

  from .autonotebook import tqdm as notebook_tqdm
2024-03-04 12:03:37,008	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-03-04 12:03:39,853	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 172.29.14.61:6379...
2024-03-04 12:03:39,866	INFO worker.py:1715 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


In [2]:
# Create the Orchestrator instance that the following cells operate on.
orchestrator = Orchestrator()

In [14]:
# Display the current value of all_host_num_devices.
# NOTE(review): the execution counts show this cell (In [14]) was run after the
# assignment cell below (In [13]), so the displayed array reflects that manual
# override; a fresh top-to-bottom run may show a different value here.
orchestrator.all_host_num_devices

array([4, 4])

In [13]:
# Manually overwrite all_host_num_devices with two entries of 4
# (presumably two hosts with 4 devices each -- TODO confirm against Orchestrator).
orchestrator.all_host_num_devices = np.array([4, 4])

In [4]:
# Register two jobs. Only the second call's return value (the id under which
# that job appears in orchestrator.jobs) is echoed, because it is the cell's
# last expression; the first call's return value is discarded.
orchestrator.register_job()
orchestrator.register_job()

'b0e4bb84-9b8e-4c5f-9cdb-f904958338ed'

In [5]:
# Inspect the registered jobs (the output shows a mapping of job id -> PolluxJob).
orchestrator.jobs

{'d5284783-3609-4ab8-afad-4bd8650520e3': <pollux_job.PolluxJob at 0x7ffe0ead2b20>,
 'b0e4bb84-9b8e-4c5f-9cdb-f904958338ed': <pollux_job.PolluxJob at 0x7ffe0ead2f10>}

In [6]:
# request PGs for the jobs

for job_id, job in orchestrator.jobs.items():
    await orchestrator.initial_request_placement_group(job_id, f"{job_id}_pg")

INFO:orchestrator:Waiting for placement group to start.
INFO:orchestrator:Placement group has started.
INFO:orchestrator:PG table: {'placement_group_id': 'f51dec1f6afe78cad9320037da0036000000', 'name': 'd5284783-3609-4ab8-afad-4bd8650520e3_pg', 'bundles': {0: {'CPU': 1.0, 'GPU': 1.0}}, 'bundles_to_node_id': {0: '30c8a654de19ced690206052390556bdbd769f402d44feda6e48e860'}, 'strategy': 'SPREAD', 'state': 'CREATED', 'stats': {'end_to_end_creation_latency_ms': 2.063, 'scheduling_latency_ms': 1.88, 'scheduling_attempt': 1, 'highest_retry_delay_ms': 0.0, 'scheduling_state': 'FINISHED'}}
INFO:orchestrator:Waiting for placement group to start.
INFO:orchestrator:Placement group has started.
INFO:orchestrator:PG table: {'placement_group_id': '6206b80b4566fa3773ab0999a4ca36000000', 'name': 'b0e4bb84-9b8e-4c5f-9cdb-f904958338ed_pg', 'bundles': {0: {'CPU': 1.0, 'GPU': 1.0}}, 'bundles_to_node_id': {0: '30c8a654de19ced690206052390556bdbd769f402d44feda6e48e860'}, 'strategy': 'SPREAD', 'state': 'CREATED

In [23]:
from itertools import product

In [131]:
def generate_power_of_two_combinations(max_gpus):
    """
    Generate all GPU allocation sizes that are powers of two up to max_gpus.

    Args:
        max_gpus: Largest allocation size allowed (inclusive).

    Returns:
        Ascending list of ints ``[1, 2, 4, ...]`` whose last element is the
        largest power of two not exceeding ``max_gpus``. The list is empty
        when ``max_gpus`` is below 1.
    """
    # Nothing fits when fewer than one GPU is available; this guard also
    # avoids taking the logarithm of zero or of a negative number.
    if max_gpus < 1:
        return []
    # The bit length of the integer part equals the number of powers of two
    # that are <= max_gpus, computed exactly without floating-point logs.
    return [1 << exponent for exponent in range(int(max_gpus).bit_length())]
    

def generate_allocations_for_job(num_nodes, gpus_per_node):
    """
    List the candidate allocations for a single job.

    Each allocation is a ``(gpus, nodes)`` tuple: the number of GPUs used on
    each node and the number of nodes used.

    Args:
        num_nodes: Number of nodes in the cluster.
        gpus_per_node: Number of GPUs on each node.

    Returns:
        A list with the single-node allocations first (power-of-two GPU
        counts), followed by the multi-node allocations (2..num_nodes nodes).
    """
    # Single-node options: every power-of-two GPU count that fits on one node.
    candidates = [
        (gpu_count, 1)
        for gpu_count in generate_power_of_two_combinations(gpus_per_node)
    ]

    # A one-node cluster has no multi-node options.
    if not num_nodes > 1:
        return candidates

    # Multi-node options always take every GPU on each node they span.
    candidates.extend(
        (gpus_per_node, node_count) for node_count in range(2, num_nodes + 1)
    )
    return candidates

In [132]:
def is_allocation_valid(combination, num_nodes, gpus_per_node):
    """
    Check whether all allocations in ``combination`` can be placed at once.

    Args:
        combination: Iterable of ``(gpus, nodes)`` tuples, one per job, where
            a job needs ``gpus`` GPUs on each of ``nodes`` distinct nodes.
        num_nodes: Number of nodes in the cluster.
        gpus_per_node: GPU capacity of every node (the cluster is treated as
            uniform).

    Returns:
        True if the greedy placement below fits every job without any node
        exceeding its capacity, False otherwise.
    """
    # Remaining free GPUs on each node.
    free_gpus = [gpus_per_node] * num_nodes

    # Handle the widest (most nodes) and then the largest requests first, so
    # the verdict does not depend on the order in which jobs are listed.
    ordered = sorted(combination, key=lambda alloc: (alloc[1], alloc[0]), reverse=True)

    for gpus, nodes in ordered:
        # Nodes that can still host this job's per-node share.
        candidates = [node for node in range(num_nodes) if free_gpus[node] >= gpus]
        # The job needs `nodes` distinct nodes with enough room; checking each
        # node individually (rather than the cluster-wide total) prevents
        # over-committing a node and handles nodes > num_nodes.
        if len(candidates) < nodes:
            return False
        # Prefer the fullest candidates so emptier nodes stay available for
        # the remaining jobs.
        candidates.sort(key=lambda node: free_gpus[node])
        for node in candidates[:nodes]:
            free_gpus[node] -= gpus

    return True

def list_possible_allocations(cluster_config, jobs):
    """
    Enumerate every valid assignment of one allocation per job.

    Args:
        cluster_config: Sequence with one GPU count per node. Only its length
            and its first entry are used, i.e. all nodes are assumed to have
            the same number of GPUs as the first one.
        jobs: Mapping whose keys are the job ids; the values are not used.

    Returns:
        A list of dicts, each mapping every job id to a ``(gpus, nodes)``
        tuple, for each combination accepted by ``is_allocation_valid``.
    """
    node_count = len(cluster_config)
    # Uniform-cluster assumption: the first node's GPU count stands for all.
    per_node_gpus = cluster_config[0]

    job_ids = list(jobs.keys())
    # One list of candidate allocations per job (same candidates for each).
    per_job_options = [
        list(generate_allocations_for_job(node_count, per_node_gpus))
        for _ in job_ids
    ]

    # Keep only the cross-product entries that fit on the cluster, keyed by job id.
    return [
        dict(zip(job_ids, candidate))
        for candidate in product(*per_job_options)
        if is_allocation_valid(candidate, node_count, per_node_gpus)
    ]

In [110]:
# cluster_config = np.array([4, 4])  # Example cluster configuration
# jobs = {'d5284783-3609-4ab8-afad-4bd8650520e3': '<job_object>',
#         'b0e4bb84-9b8e-4c5f-9cdb-f904958338ed': '<job_object>'}  # Example job dictionary

# cluster_config = np.array([4])  # 1 node, 4 GPUs
# jobs = {'1': '<job_object>',
#         '2': '<job_object>'}  # 2 jobs

# cluster_config = np.array([4])  # 1 node, 4 GPUs
# jobs = {'1': '<job_object>',
#         '2': '<job_object>',
#         '3': '<job_object>'}  # 3 jobs

# cluster_config = np.array([4, 2])  # 2 nodes, first with 4 GPUs, second with 2 GPUs
# jobs = {'1': '<job_object>',
#         '2': '<job_object>'}  # 2 jobs

# cluster_config = np.array([4, 4])  # 2 nodes, 4 GPUs each
# jobs = {'1': '<job_object>'}  # 1 job

# cluster_config = np.array([4, 4])  # 2 nodes, 4 GPUs each
# jobs = {'1': '<job_object>',
#         '2': '<job_object>',
#         '3': '<job_object>',
#         '4': '<job_object>',
#         '5': '<job_object>'}  # 5 jobs

# cluster_config = np.array([2, 2, 2, 2])  # 4 nodes, 2 GPUs each
# jobs = {'1': '<job_object>',
#         '2': '<job_object>'}  # 2 jobs

# cluster_config = np.array([4, 4, 4])  # 3 nodes, 4 GPUs each
# jobs = {'1': '<job_object>'}  # 1 job

Tests with a single 4-GPU node

In [128]:
# Scenario: a single node with 4 GPUs shared by `num_jobs` jobs.
# Change num_jobs to switch scenario (2, 3 and 4 jobs were the cases explored)
# instead of assigning cluster_config/jobs several times in one cell, where
# only the last assignment ever takes effect.
num_jobs = 2
cluster_config = np.array([4])
jobs = {str(job_number): '<job_object>' for job_number in range(1, num_jobs + 1)}

In [129]:
# Enumerate every valid per-job allocation combination for the current
# cluster_config and jobs.
possible_allocations = list_possible_allocations(cluster_config, jobs)

In [130]:
# Show the enumerated configurations; each maps job id -> (gpus, nodes).
possible_allocations

[{'1': (1, 1), '2': (1, 1)},
 {'1': (1, 1), '2': (2, 1)},
 {'1': (2, 1), '2': (1, 1)},
 {'1': (2, 1), '2': (2, 1)}]

In [45]:
# Sanity check: power-of-two allocation sizes for up to 4 GPUs.
print(generate_power_of_two_combinations(4))

[1, 2, 4]


In [46]:
# Sanity check: candidate (gpus, nodes) allocations for one job on
# 2 nodes with 4 GPUs each.
print(generate_allocations_for_job(2, 4))

[(1, 1), (2, 1), (4, 1), (4, 2)]
