In [41]:
import sys
sys.path.append('./gym_dagsched/data_generation/tpch/')

import torch
from torch_geometric.data import Batch
from torch_geometric.utils.convert import from_networkx

from gym_dagsched.envs.dagsched_env import DagSchedEnv
from gym_dagsched.policies.decima_agent import ActorNetwork
from gym_dagsched.utils.metrics import avg_job_duration, makespan
from gym_dagsched.data_generation.random_datagen import RandomDataGen
from gym_dagsched.data_generation.tpch_datagen import TPCHDataGen
from visualization import make_gantt


# Synthetic workload generator; it is used by the training loop to sample
# job arrival timelines and worker sets.
datagen = RandomDataGen(
    max_ops=20,
    # max_tasks=200,
    max_tasks=4,
    mean_task_duration=2000.,
    n_worker_types=1)

# Alternative generator, currently disabled.
# datagen = TPCHDataGen()

# Number of workers requested from `datagen.workers(...)`; also passed as the
# third positional argument of the actor network below.
n_workers = 5

# Single shared environment instance; `find_op` and `run_episode` read it as
# a global.
sim = DagSchedEnv()

# Policy network queried in `run_episode` and optimised in the training loop.
# NOTE(review): the meaning of the positional arguments 5 and 8 is not visible
# in this notebook -- confirm against ActorNetwork and consider naming them.
policy = ActorNetwork(5, 8, n_workers)


In [36]:
def find_op(op_idx):
    """Map a global operation index to the operation it refers to.

    Operations are numbered consecutively across ``sim.jobs``: the ops of the
    first job come first, then the ops of the second job, and so on.

    Args:
        op_idx: zero-based index into the concatenation of every job's
            ``ops`` sequence.

    Returns:
        The element of some ``job.ops`` found at that global position.

    Raises:
        IndexError: if ``op_idx`` is negative, or is not smaller than the
            total number of operations currently held by ``sim.jobs``
            (including the case where ``sim.jobs`` is empty).
    """
    # A negative index would otherwise pass the range test of the first job
    # and silently wrap around through Python's negative indexing.
    if op_idx < 0:
        raise IndexError(f'operation index {op_idx} is negative')

    offset = 0  # global index of the first op of the current job
    for job in sim.jobs:
        n_ops = len(job.ops)
        if op_idx < offset + n_ops:
            return job.ops[op_idx - offset]
        offset += n_ops

    # Falling out of the loop means the index is past the last operation;
    # fail explicitly instead of hitting an unbound local variable.
    raise IndexError(
        f'operation index {op_idx} out of range for {offset} operations')

def sample_action(ops, prlvl):
    """Sample an (operation, parallelism) action from the policy outputs.

    Args:
        ops: probabilities over global operation indices; passed as ``probs``
            to a categorical distribution.
        prlvl: object indexable by job id; ``prlvl[job_id]`` is passed as
            ``probs`` to a second categorical distribution.

    Returns:
        A tuple ``(next_op_idx, next_op, n_workers)`` holding the sampled
        global operation index, the operation object resolved by ``find_op``,
        and the value sampled from the selected job's parallelism
        distribution.
    """
    op_dist = torch.distributions.Categorical(probs=ops)
    chosen_idx = op_dist.sample().item()
    chosen_op = find_op(chosen_idx)

    # The parallelism distribution is conditioned on the job that owns the
    # sampled operation.
    # NOTE(review): the sampled value is a zero-based category index --
    # confirm whether the environment expects it as-is or as index + 1.
    prlvl_dist = torch.distributions.Categorical(probs=prlvl[chosen_op.job_id])
    return chosen_idx, chosen_op, prlvl_dist.sample().item()


def run_episode(ep_length, initial_timeline, workers):
    """Roll out one episode in ``sim`` under the current ``policy``.

    The environment is reset with the given timeline and workers, then
    stepped until either ``ep_length`` actions have been taken or the
    environment reports ``done``.

    Args:
        ep_length: maximum number of actions to take.
        initial_timeline: forwarded to ``sim.reset``.
        workers: forwarded to ``sim.reset``.

    Returns:
        A tuple ``(actions, obsns, rewards)`` of equally long lists. Entry
        ``k`` holds the ``(next_op_idx, n_workers)`` pair that was sampled,
        the observation it was sampled from, and the reward returned by the
        ``sim.step`` call that executed that action. The lists are shorter
        than ``ep_length`` when the episode finishes early.
    """
    sim.reset(initial_timeline, workers)

    actions = []
    obsns = []
    rewards = []

    done = False
    obs = None

    while len(actions) < ep_length and not done:
        if obs is None:
            # Nothing to act on yet (always true right after reset): advance
            # the simulation with a no-op. Its reward is not caused by any
            # recorded action, so it is not stored.
            # NOTE(review): if the environment can return a None observation
            # mid-episode, confirm whether this reward should be credited to
            # the preceding action instead of being dropped.
            obs, _, done = sim.step(None, 0)
            continue

        dag_batch, op_msk, prlvl_msk = obs
        ops, prlvl = policy(dag_batch, op_msk, prlvl_msk)
        next_op_idx, next_op, n_workers = sample_action(ops, prlvl)

        actions.append((next_op_idx, n_workers))
        obsns.append(obs)

        # Record the reward only after stepping, so that rewards[k] is the
        # outcome of actions[k] rather than of the previous step.
        obs, reward, done = sim.step(next_op, n_workers)
        rewards.append(reward)

    return actions, obsns, rewards



In [42]:
import numpy as np

# Number of job arrival sequences to train on; one iteration of the
# training loop per sequence.
N_SEQUENCES = 1

# Number of episodes rolled out on each fixed sequence; this is also the
# number of rows in the reward history built by `run_episodes`.
N_EP_PER_SEQ = 1

# Mean of the geometric distribution from which each sequence's episode
# length (number of environment steps) is drawn. The training loop adds 3
# to it after every sequence, as a form of curriculum learning.
mean_ep_length = 20

# Adam optimizer for training the actor network
optim = torch.optim.Adam(policy.parameters(), lr=.005)



def compute_action_log_probs(ops, prlvl, next_op_idx, n_workers):
    """Evaluate the log-probability of a previously taken action.

    Args:
        ops: probabilities over global operation indices.
        prlvl: object indexable by job id; ``prlvl[job_id]`` holds the
            probabilities of that job's parallelism choices.
        next_op_idx: global index of the operation that was selected.
        n_workers: parallelism value that was selected.

    Returns:
        A pair ``(next_op_idx_lgp, n_workers_lgp)`` of tensors: the
        log-probability of the operation under ``ops`` and of ``n_workers``
        under the distribution of the job that owns that operation.
    """
    op_dist = torch.distributions.Categorical(probs=ops)
    op_lgp = op_dist.log_prob(torch.tensor(next_op_idx))

    # The owning job is resolved through `find_op`, which reads `sim.jobs`.
    owner_job_id = find_op(next_op_idx).job_id
    prlvl_dist = torch.distributions.Categorical(probs=prlvl[owner_job_id])
    prlvl_lgp = prlvl_dist.log_prob(torch.tensor(n_workers))

    return op_lgp, prlvl_lgp



def run_episodes(initial_timeline, workers, ep_length):
    """Roll out ``N_EP_PER_SEQ`` episodes on one fixed job arrival sequence.

    Args:
        initial_timeline: forwarded to ``run_episode``.
        workers: forwarded to ``run_episode``.
        ep_length: maximum number of actions per episode.

    Returns:
        A tuple ``(actions_histories, obsns_histories,
        total_rewards_histories)``. The first two are lists with one entry
        per episode, each holding that episode's actions / observations;
        they contain fewer than ``ep_length`` items when the episode ended
        early. The third is an array of shape ``(N_EP_PER_SEQ, ep_length)``
        whose entry ``[i, k]`` is the sum of episode ``i``'s rewards from
        step ``k`` onwards, and ``0`` for steps the episode never reached.
    """
    actions_histories = []
    obsns_histories = []
    total_rewards_histories = np.zeros((N_EP_PER_SEQ, ep_length))

    for i in range(N_EP_PER_SEQ):
        actions, obsns, rewards = run_episode(ep_length, initial_timeline, workers)
        rewards = np.array(rewards, dtype=float)

        # Reward-to-go: reversed cumulative sum, reversed back.
        total_rewards = np.cumsum(rewards[::-1])[::-1]

        actions_histories.append(actions)
        obsns_histories.append(obsns)

        # Write only the steps that actually happened. Assigning the whole
        # row would raise on a length mismatch, or silently broadcast a
        # length-1 array across every step; the untouched tail stays 0.
        total_rewards_histories[i, :len(total_rewards)] = total_rewards

    return actions_histories, obsns_histories, total_rewards_histories




# Policy-gradient training loop: for every sampled job arrival sequence, roll
# out episodes with the current policy, then take one optimizer step per
# recorded (episode, step) pair using the reward-to-go minus a per-step
# baseline as the weight of the action's log-probability.
for _ in range(N_SEQUENCES):
    # Episode length for this sequence, drawn from a geometric distribution
    # with success probability 1/mean_ep_length.
    ep_length = np.random.geometric(1/mean_ep_length)

    # sample a job arrival sequence and worker types
    initial_timeline = datagen.initial_timeline(
        n_job_arrivals=20, n_init_jobs=0, mjit=1000.)
    workers = datagen.workers(n_workers=n_workers)

    # run multiple episodes on this fixed sequence
    actions_histories, obsns_histories, total_rewards_histories = \
        run_episodes(initial_timeline, workers, ep_length)

    for k in range(ep_length):
        # Baseline for step k: mean reward-to-go over all episodes.
        baseline = total_rewards_histories[:,k].mean()
        zip_histories = zip(actions_histories, obsns_histories, total_rewards_histories)
        for actions, obsns, total_rewards in zip_histories:
            if k >= len(actions):
                # This episode finished before step k; nothing to learn from.
                continue

            action, obs, total_reward = actions[k], obsns[k], total_rewards[k]
            dag_batch, op_msk, prlvl_msk = obs

            # Use a dedicated name for the sampled parallelism value. This is
            # top-level code, so unpacking into `n_workers` would overwrite
            # the global worker count passed to `datagen.workers(...)` on the
            # next sequence (and on any re-run of this cell).
            next_op_idx, prlvl_choice = action

            # Re-evaluate the policy on the stored observation so the
            # log-probabilities carry gradients w.r.t. current parameters.
            ops, prlvl = policy(dag_batch, op_msk, prlvl_msk)

            # NOTE(review): compute_action_log_probs resolves next_op_idx via
            # find_op, which reads the live `sim.jobs` after the rollouts have
            # finished -- confirm that the index-to-job mapping is still the
            # one that was in effect when the action was sampled.
            next_op_idx_lgp, n_workers_lgp = \
                compute_action_log_probs(ops, prlvl, next_op_idx, prlvl_choice)

            loss = -(next_op_idx_lgp + n_workers_lgp) * (total_reward - baseline)

            optim.zero_grad()
            loss.backward()
            optim.step()

    # Curriculum: lengthen the expected episode for the next sequence.
    mean_ep_length += 3

In [43]:
# Display the reward history array produced for the last training sequence.
total_rewards_histories

array([[-0.16883803, -0.16883803, -0.16883803, -0.16883803, -0.02042514,
        -0.00030304]])