In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import environment
import job_distribution
import slow_down_cdf
import RL_brain
import parameters
from gurobipy import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
non-resource variables are not supported in the long term


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def discount(x, gamma):
    """
    Compute the discounted cumulative sum of a sequence.

    Given vector x, computes a vector y such that
    y[i] = x[i] + gamma * x[i+1] + gamma^2 x[i+2] + ...

    Parameters
    ----------
    x : array-like
        Sequence of values (at least one-dimensional). Lists and tuples are
        converted with ``np.asarray``.
    gamma : float
        Discount factor applied once per step into the future.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x)``. An empty input yields an empty array.

    Raises
    ------
    AssertionError
        If ``x`` is a zero-dimensional (scalar) value.
    """
    x = np.asarray(x)
    # Validate before touching the data so bad input fails fast with a clear error.
    assert x.ndim >= 1

    out = np.zeros(len(x))
    if len(x) == 0:
        # Nothing to accumulate; indexing out[-1] below would raise IndexError.
        return out

    # Backward recursion: y[i] = x[i] + gamma * y[i+1], seeded with the last element.
    out[-1] = x[-1]
    for i in reversed(range(len(x) - 1)):
        out[i] = x[i] + gamma * out[i + 1]
    # More efficient version:
    # scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
    return out

In [3]:
def get_traj(agent, env, episode_max_length):
    """
    Roll out a single episode of the agent acting in the environment.

    The environment is reset, then the agent is queried for an action at each
    step until the environment reports it is done or ``episode_max_length``
    steps have been taken.

    Returns a dictionary with keys:
      'reward' - array of per-step rewards,
      'ob'     - array of the observations the agent acted on,
      'action' - array of the chosen actions,
      'info'   - the info object returned by the last ``env.step`` call
                 (an empty list if no step was taken).
    """
    env.reset()

    observations, actions, rewards = [], [], []
    last_info = []

    current_ob = env.observe()

    for _step in range(episode_max_length):
        chosen = agent.choose_action(current_ob)

        # Record the observation the decision was based on, together with the decision.
        observations.append(current_ob)
        actions.append(chosen)

        next_ob, reward, finished, last_info = env.step(chosen, repeat=True)
        rewards.append(reward)

        if finished:
            break

        current_ob = next_ob

    return dict(
        reward=np.array(rewards),
        ob=np.array(observations),
        action=np.array(actions),
        info=last_info,
    )

In [4]:
def concatenate_all_ob(trajs, pa):
    """
    Stack the observations of every trajectory into one 2-D float64 array.

    The result has one row per recorded time step (summed over all
    trajectories, counted by the length of each trajectory's 'reward' entry)
    and ``pa.network_input_height * pa.network_input_width`` columns.
    """
    episode_lengths = [len(traj['reward']) for traj in trajs]
    n_features = pa.network_input_height * pa.network_input_width

    stacked = np.zeros((sum(episode_lengths), n_features), dtype=np.float64)

    row = 0
    for traj, n_steps in zip(trajs, episode_lengths):
        for step in range(n_steps):
            # Row-wise assignment lets numpy broadcast each stored observation into the row.
            stacked[row, :] = traj['ob'][step]
            row += 1

    return stacked

In [5]:
def process_all_info(trajs):
    """
    Gather per-job statistics from the 'info' object of every trajectory.

    For each trajectory, the entries of ``traj['info'].record`` are looked up
    by position 0 .. len(record)-1 and their ``enter_time``, ``finish_time``
    and ``len`` attributes are collected. The per-trajectory arrays are then
    concatenated.

    Returns a tuple ``(enter_time, finish_time, job_len)`` of 1-D arrays.
    """
    fields = ('enter_time', 'finish_time', 'len')
    collected = {field: [] for field in fields}

    for traj in trajs:
        records = traj['info'].record
        # Index by position rather than iterating: record may be a mapping keyed by job index.
        positions = range(len(records))
        for field in fields:
            collected[field].append(
                np.array([getattr(records[pos], field) for pos in positions]))

    enter_time = np.concatenate(collected['enter_time'])
    finish_time = np.concatenate(collected['finish_time'])
    job_len = np.concatenate(collected['len'])

    return enter_time, finish_time, job_len

In [6]:
def plot_lr_curve(output_file_prefix, max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                  ref_discount_rews, ref_slow_down,ref_lr_gurobi=None):
    """
    Plot the learning curves next to the reference baselines and save them as a PDF.

    Left panel: mean and max policy-gradient reward per iteration, plus one
    horizontal line per key of ``ref_discount_rews`` at the average of its values.
    Right panel: the ``slow_down_lr_curve`` series, plus one horizontal line per
    key at the average of the concatenated ``ref_slow_down`` values.

    The figure is written to ``output_file_prefix + "_lr_curve.pdf"``.
    ``ref_lr_gurobi`` is accepted but not plotted at present.
    """
    palette = ['blue', 'green', 'orange', 'red','yellow','gray']

    fig = plt.figure(figsize=(12, 5))

    # ---- left panel: discounted total reward ----
    reward_ax = fig.add_subplot(121)
    reward_ax.set_prop_cycle(color=palette)
    reward_ax.plot(mean_rew_lr_curve, linewidth=2, label='PG mean')
    reward_ax.plot(max_rew_lr_curve, linewidth=2, label='PG max')
    for name in ref_discount_rews:
        level = np.average(ref_discount_rews[name])
        reward_ax.plot(np.tile(level, len(mean_rew_lr_curve)), linewidth=2, label=name)
    reward_ax.legend(loc=4)
    reward_ax.set_xlabel("Iteration", fontsize=20)
    reward_ax.set_ylabel("Discounted Total Reward", fontsize=20)

    # ---- right panel: completion time ----
    time_ax = fig.add_subplot(122)
    time_ax.set_prop_cycle(color=palette)
    time_ax.plot(slow_down_lr_curve, linewidth=2, label='PG mean')
    # TODO: add the Gurobi result once it is computed, e.g.
    # time_ax.plot(ref_lr_gurobi, linewidth=2, label='Gurobi')
    for name in ref_discount_rews:
        level = np.average(np.concatenate(ref_slow_down[name]))
        time_ax.plot(np.tile(level, len(slow_down_lr_curve)), linewidth=2, label=name)
    time_ax.legend(loc=1)
    time_ax.set_xlabel("Iteration", fontsize=20)
    time_ax.set_ylabel("Completion time", fontsize=20)

    plt.savefig(output_file_prefix + "_lr_curve" + ".pdf")


In [7]:
def get_traj_worker(rl, env, pa):
    """
    Collect a batch of trajectories and derive the quantities needed for a
    policy-gradient update.

    Parameters
    ----------
    rl : agent passed through to ``get_traj`` (must provide ``choose_action``).
    env : environment passed through to ``get_traj``.
    pa : parameter object; reads ``num_seq_per_batch``, ``episode_max_length``,
        ``discount`` and (via ``concatenate_all_ob``) the network input size.

    Returns
    -------
    tuple
        ``(all_eprews, all_eplens, completion_time, all_ob, all_action, all_adv)``:
        discounted return of each episode from its first step, episode lengths,
        ``finish_time - enter_time`` for records with non-negative finish time,
        stacked observations, concatenated actions and concatenated advantages.
    """
    trajs = [get_traj(rl, env, pa.episode_max_length)
             for _ in range(pa.num_seq_per_batch)]

    all_ob = concatenate_all_ob(trajs, pa)

    # Compute discounted sums of rewards
    rets = [discount(traj["reward"], pa.discount) for traj in trajs]
    maxlen = max(len(ret) for ret in rets)
    # Zero-pad shorter episodes so the returns can be averaged per time step.
    padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]

    # Compute time-dependent baseline
    baseline = np.mean(padded_rets, axis=0)

    # Compute advantage function
    advs = [ret - baseline[:len(ret)] for ret in rets]
    all_action = np.concatenate([traj["action"] for traj in trajs])
    all_adv = np.concatenate(advs)

    # Episode total rewards: the discounted return at step 0. Reuse the vectors
    # already computed above instead of running discount() a second time.
    all_eprews = np.array([ret[0] for ret in rets])
    all_eplens = np.array([len(traj["reward"]) for traj in trajs])  # episode lengths

    # All Job Stat (job lengths are returned by the helper but not needed here)
    enter_time, finish_time, _job_len = process_all_info(trajs)
    # Keep only records with a non-negative finish time
    # (presumably a negative value marks an unfinished job - confirm against the environment).
    finished_idx = (finish_time >= 0)
    completion_time = finish_time[finished_idx] - enter_time[finished_idx]

    return all_eprews, all_eplens, completion_time, all_ob, all_action, all_adv

In [9]:
def launch(pa, pg_resume=None, render=False, repre='image', end='no_new_job'):
    """
    Train a policy-gradient agent on generated workloads and periodically
    save, evaluate and plot its progress.

    Steps performed:
      1. Generate workload sequences with ``job_distribution.generate_sequence_work``
         (seed 42) and build ``pa.num_ex`` environments, one per sequence index.
      2. Create an ``RL_brain.PolicyGradient`` learner and, if ``pg_resume`` is
         given, load its saved data.
      3. Obtain reference results from ``slow_down_cdf.launch``.
      4. For ``iteration`` in ``range(1, pa.num_epochs)``: visit the environments
         in shuffled order, collect a batch with ``get_traj_worker``, call
         ``rl.learn`` on it, print statistics and extend the learning curves.
         Every ``pa.output_freq`` iterations the learner is saved, evaluated
         with ``pa.unseen`` temporarily set to True, and the curves are plotted.

    Parameters
    ----------
    pa : parameter object (see the attributes read in the body).
    pg_resume : path passed to ``rl.load_data`` when not None.
    render : accepted for interface compatibility; not referenced in the body.
    repre, end : forwarded to ``environment.Env`` and ``slow_down_cdf.launch``.

    Returns
    -------
    None
    """

    # ----------------------------
    print("Preparing for workers...")
    # ----------------------------

    pg_learners = []
    envs = []

    nw_len_seqs, nw_size_seqs = job_distribution.generate_sequence_work(pa, seed=42)# generate the workload sequences, covering num_ex tasks

    for ex in range(pa.num_ex):# for each task

        print("-prepare for env-", ex)

        env = environment.Env(pa, nw_len_seqs=nw_len_seqs, nw_size_seqs=nw_size_seqs,
                              render=False, repre=repre, end=end)# initialize an environment
        env.seq_no = ex
        envs.append(env)

    print("-prepare for worker-")

    rl = RL_brain.PolicyGradient(n_actions=pa.network_output_dim,
                                 network_input_height=pa.network_input_height,
                                 network_input_width=pa.network_input_width,
                                 n_features=pa.network_input_height*pa.network_input_width,
                                 learning_rate=0.002)


    if pg_resume is not None:
        rl.load_data(pg_resume)


    # --------------------------------------
    print("Preparing for reference data...")
    # --------------------------------------

    # NOTE(review): render=True is hard-coded here while the `render` argument of
    # this function is ignored - confirm this is intended.
    ref_discount_rews, ref_slow_down = slow_down_cdf.launch(pa, pg_resume=None,render=True,
                                                            plot=False, repre=repre, end=end)
    mean_rew_lr_curve = []
    max_rew_lr_curve = []
    slow_down_lr_curve = []

    # --------------------------------------
    print("Start training...")
    # --------------------------------------

    timer_start = time.time()
    
    # Stays empty: the code that would fill it is commented out further down.
    ref_lr_gurobi=[]

    # NOTE(review): starting at 1 means pa.num_epochs - 1 iterations are run - confirm intended.
    for iteration in range(1, pa.num_epochs):# run each training iteration
    #for iteration in range(1, 2):

        ex_indices = list(range(pa.num_ex))
        np.random.shuffle(ex_indices)# shuffle the order of all tasks for this iteration

        # NOTE(review): the next four lists are assigned but never read in this function.
        all_eprews = []# rewards of all iterations
        eprews = []# total rewards of each iteration
        eplens = []# total time to finish all tasks in each iteration
        all_slowdown = []# total slowdown of all iterations

        # Per-iteration accumulators, one entry per environment visited.
        eprewlist = []
        eplenlist =[]
        slowdownlist =[]
        losslist = []
        

        ex_counter = 0
        for ex in range(pa.num_ex):

            ex_idx = ex_indices[ex]

            eprew, eplen, completion_time, all_ob, all_action, all_adv = get_traj_worker(rl, envs[ex_idx], pa)
            eprewlist.append(eprew)
            eplenlist.append(eplen)
            slowdownlist.append(completion_time)
            
            
            # One learner update per environment, using that environment's batch.
            loss = rl.learn(all_ob, all_action, all_adv)
            losslist.append(loss)

            ex_counter += 1

            # At the end of each group of pa.batch_size environments (or the last
            # environment) only blank lines are printed and the counter is reset.
            if ex_counter >= pa.batch_size or ex == pa.num_ex - 1:

                print("\n\n")

                ex_counter = 0

               

        timer_end = time.time()

        print("-----------------")
        print("Iteration: \t %i" % iteration)
        print("NumTrajs: \t %i" % len(eprewlist))
        print("NumTimesteps: \t %i" % np.sum(eplenlist))
        print("Loss:     \t %s" % np.mean(losslist))
        print("MaxRew: \t %s" % np.average([np.max(rew) for rew in eprewlist]))
        print("MeanRew: \t %s +- %s" % (np.mean(eprewlist), np.std(eprewlist)))
        print("MeanSlowdown: \t %s" % np.mean([np.mean(sd) for sd in slowdownlist]))
        print("MeanLen: \t %s +- %s" % (np.mean(eplenlist), np.std(eplenlist)))
        print("Elapsed time\t %s" % (timer_end - timer_start), "seconds")
        print("-----------------")

        # Restart the timer so the next report covers a single iteration.
        timer_start = time.time()

        max_rew_lr_curve.append(np.average([np.max(rew) for rew in eprewlist]))
        mean_rew_lr_curve.append(np.mean(eprewlist))
        slow_down_lr_curve.append(np.mean([np.mean(sd) for sd in slowdownlist]))
        
        #slowdown_gurobi=ref_gurobi(pa)
        #ref_lr_gurobi.append(slowdown_gurobi)
        
        
        if iteration % pa.output_freq == 0:

            rl.save_data(pa.output_filename + '_' + str(iteration))

            # pa.unseen is switched on only for the duration of this evaluation call.
            # NOTE(review): if the call raises, pa.unseen is left True.
            pa.unseen = True
            slow_down_cdf.launch(pa, pa.output_filename + '_' + str(iteration) + '.ckpt',render=False, plot=True, repre=repre, end=end)
            pa.unseen = False
            #print(slow_down_lr_curve)
            #mean_lr_gurobi=np.mean(ref_lr_gurobi)
            #for i in range(len(ref_lr_gurobi)):
                #ref_lr_gurobi[i]=mean_lr_gurobi

            plot_lr_curve(pa.output_filename,
                          max_rew_lr_curve, mean_rew_lr_curve, slow_down_lr_curve,
                          ref_discount_rews, ref_slow_down,ref_lr_gurobi)

def ref_gurobi(pa):
    """
    Sample a random workload and solve a time-indexed Gurobi scheduling model
    for it, returning the optimal objective value.

    A job slot is filled with probability ``pa.new_job_rate`` using
    ``job_distribution.Dist(...).bi_model_dist()``; unfilled slots keep length 0
    and zero resource demand. All arrival times are 0. The model chooses one
    start time per job, marks the time steps the job occupies, limits the
    per-resource usage at every time step to ``pa.res_slot`` and minimises the
    average of the per-job ``slow_down`` variables (end time minus arrival time,
    or the constant 1 for zero-length jobs).

    Parameters
    ----------
    pa : parameter object; reads ``simu_len``, ``num_res``, ``new_job_rate``,
        ``max_job_size``, ``max_job_len`` and ``res_slot``.

    Returns
    -------
    The value of ``m.ObjVal`` after ``m.optimize()``.
    """
    nw_len_lst = np.zeros(pa.simu_len, dtype=int)
    nw_res_lst = np.zeros((pa.simu_len, pa.num_res), dtype=int)
    for i in range(pa.simu_len):
        # A new job arrives in slot i with probability pa.new_job_rate;
        # otherwise the slot keeps length 0.
        if np.random.rand() < pa.new_job_rate:
            dist = job_distribution.Dist(pa.num_res, pa.max_job_size, pa.max_job_len)
            nw_len_lst[i], nw_res_lst[i, :] = dist.bi_model_dist()

    # Planning horizon: long enough to run every job back to back.
    T_period = pa.max_job_len * pa.simu_len
    T = list(range(T_period))

    # NOTE(review): every job is given arrival time 0 - confirm this matches
    # the arrival pattern of the simulated environment.
    arrval_time_lst = np.zeros(pa.simu_len, dtype=int)

    # Jobs are identified by the strings "1" .. "simu_len".
    jobs = [str(i + 1) for i in range(pa.simu_len)]
    dauer_dict = dict(zip(jobs, nw_len_lst))        # job -> duration
    arrval_dict = dict(zip(jobs, arrval_time_lst))  # job -> arrival time
    res_dict = dict(zip(jobs, nw_res_lst))          # job -> per-resource demand

    m = Model()
    m.Params.OutputFlag = 0
    x = m.addVars(jobs, T, name='start_time_bool', vtype=GRB.BINARY)
    omiga = m.addVars(jobs, lb=0, ub=T_period, name='end_time', vtype=GRB.INTEGER)
    alpha = m.addVars(jobs, lb=0, ub=T_period, name='start_time', vtype=GRB.INTEGER)
    y = m.addVars(jobs, T, name='dauer_time_bool', vtype=GRB.BINARY)
    slowdown = m.addVars(jobs, lb=0, name='slow_down', vtype=GRB.INTEGER)

    for job in jobs:
        if dauer_dict[job] == 0:
            m.addConstr(slowdown[job] == 1)
        else:
            m.addConstr(slowdown[job] == (omiga[job] - arrval_dict[job]))

    # start = end - duration
    m.addConstrs((alpha[job] == omiga[job] - dauer_dict[job] for job in jobs))

    # A job cannot start before it arrives. Added once: the original wrapped this
    # in a loop over simu_len and inserted the identical constraints repeatedly.
    m.addConstrs(alpha[job] >= arrval_dict[job] for job in jobs)

    # Hoisted loop invariant; default=0 keeps an empty job list from raising here.
    longest_job = max(dauer_dict.values(), default=0)
    # NOTE(review): start times later than T_period - longest_job get no
    # indicator link between x and y below - verify this is acceptable.
    linked_starts = T[:T_period - longest_job]

    for job in jobs:
        # Exactly one start time, exactly `duration` occupied steps,
        # and alpha equals the selected start index.
        m.addConstr(quicksum(x[job, t] for t in T) == 1)
        m.addConstr(quicksum(y[job, t] for t in T) == dauer_dict[job])
        m.addConstr(quicksum(x[job, t] * t for t in T) == alpha[job])
        for start in linked_starts:
            # Starting at `start` forces the job to occupy the following
            # `duration` consecutive time steps.
            m.addGenConstrIndicator(
                x[job, start], True,
                quicksum(y[job, start + offset] for offset in range(dauer_dict[job])),
                GRB.EQUAL, dauer_dict[job])

    # Resource capacity at every time step and for every resource type.
    for t in T:
        for j in range(pa.num_res):
            m.addConstr(quicksum(y[job, t] * res_dict[job][j] for job in jobs) <= pa.res_slot)

    m.setObjective(1/pa.simu_len*quicksum(slowdown[job] for job in jobs), GRB.MINIMIZE)

    m.optimize()

    return m.ObjVal
        


    
    
        
    
    
    
    
def main():
    """
    Configure a small experiment and start training.

    Creates a ``parameters.Parameters`` object, overrides a handful of its
    settings, recomputes the dependent parameters and calls ``launch`` without
    a resume checkpoint, with rendering off, the 'image' representation and
    the 'all_done' termination mode.
    """
    pa = parameters.Parameters()

    pa.simu_len = 40  # 1000
    pa.num_ex = 10  #50 # 100
    pa.num_nw = pa.simu_len

    remaining_overrides = (
        ('num_seq_per_batch', 10),    # 20
        ('output_freq', 10),          # 50
        ('batch_size', 10),
        ('new_job_rate', 1),
        ('episode_max_length', 2000),
    )
    for attr_name, attr_value in remaining_overrides:
        setattr(pa, attr_name, attr_value)

    # Derived settings must be refreshed after the overrides above.
    pa.compute_dependent_parameters()

    # To resume from a checkpoint, pass its path (e.g. 'data/tmp_450.pkl') instead of None.
    launch(pa, None, False, repre='image', end='all_done')


if __name__ == '__main__':
    main()


Preparing for workers...
-prepare for env- 0
-prepare for env- 1
-prepare for env- 2
-prepare for env- 3
-prepare for env- 4
-prepare for env- 5
-prepare for env- 6
-prepare for env- 7
-prepare for env- 8
-prepare for env- 9
-prepare for worker-
Preparing for reference data...
Load on # 0 resource dimension is 2.8845
Load on # 1 resource dimension is 3.44875



---------- Tetris -----------
total discount reward : 	 -2632.0
---------- SJF -----------
total discount reward : 	 -1144.0
---------- packer -----------
total discount reward : 	 -2534.0
---------- Random -----------
total discount reward : 	 -2108.0



---------- Tetris -----------
total discount reward : 	 -2508.0
---------- SJF -----------
total discount reward : 	 -1574.0
---------- packer -----------
total discount reward : 	 -2339.0
---------- Random -----------
total discount reward : 	 -2447.0



---------- Tetris -----------
total discount reward : 	 -1763.0
---------- SJF -----------
total discount reward : 	 -1130.0

KeyboardInterrupt: 