In [1]:
import sys
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal
import numpy as np
import pdb
import copy
import copy
import torch

def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    T.manual_seed(seed)
    T.cuda.manual_seed(seed)
    T.backends.cudnn.deterministic = True

def softmax(x):
    return np.exp(x)/np.sum(np.exp(x), keepdims=True)

device = T.device("cuda" if T.cuda.is_available() else "cpu")

In [2]:
def soft_update(target, source, tau):
	for target_param, param in zip(target.parameters(), source.parameters()):
		target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

def hard_update(target, source):
	for target_param, param in zip(target.parameters(), source.parameters()):
		target_param.data.copy_(param.data)

In [3]:
class ReplayBuffer():
    def __init__(self,input_shape, n_actions,max_size=int(1e6)):
        self.memory_size = max_size
        self.memory_counter = 0
        self.state = np.zeros((self.memory_size, input_shape))
        self.state_ = np.zeros((self.memory_size, input_shape))
        self.action = np.zeros((self.memory_size, n_actions))
        self.reward = np.zeros(self.memory_size)
        self.done = np.zeros(self.memory_size)

    def add(self, state, action, reward, state_, done):
        index = self.memory_counter % self.memory_size
        self.state[index] = state
        self.state_[index] = state_
        self.action[index] = action
        self.reward[index] = reward
        self.done[index] = done
        self.memory_counter += 1

    def sample(self, batch_size):
        max_memory = min(self.memory_counter, self.memory_size)
        batch = np.random.choice(max_memory, batch_size)
        state = self.state[batch]
        action= self.action[batch]
        reward = self.reward[batch]
        state_ = self.state_[batch]
        done = self.done[batch]
        return state, action, reward, state_, done

In [4]:
class Actor(nn.Module):
	def __init__(self, state_dim, action_dim, max_action):
		super(Actor, self).__init__()
		self.l1 = nn.Linear(state_dim, 256)
		self.l2 = nn.Linear(256, 256)
		self.l3 = nn.Linear(256, action_dim)
		self.max_action = max_action
	def forward(self, state):
		a = F.relu(self.l1(state))
		a = F.relu(self.l2(a))
		return self.max_action * torch.sigmoid(self.l3(a))

class Critic(nn.Module):
	def __init__(self, state_dim, action_dim):
		super(Critic, self).__init__()
		# Q1 architecture
		self.l1 = nn.Linear(state_dim + action_dim, 256)
		self.l2 = nn.Linear(256, 256)
		self.l3 = nn.Linear(256, 1)
		# Q2 architecture
		self.l4 = nn.Linear(state_dim + action_dim, 256)
		self.l5 = nn.Linear(256, 256)
		self.l6 = nn.Linear(256, 1)
	def forward(self, state, action):
		sa = torch.cat([state, action], 1)
		q1 = F.relu(self.l1(sa))
		q1 = F.relu(self.l2(q1))
		q1 = self.l3(q1)
		q2 = F.relu(self.l4(sa))
		q2 = F.relu(self.l5(q2))
		q2 = self.l6(q2)
		return q1, q2
	def Q1(self, state, action):
		sa = torch.cat([state, action], 1)
		q1 = F.relu(self.l1(sa))
		q1 = F.relu(self.l2(q1))
		q1 = self.l3(q1)
		return q1

In [5]:
class TD3(object):
	def __init__(
		self,
		state_dim,
		action_dim,
		max_action,
		discount=0.99,
		tau=0.005,
		policy_noise=0.2,
		noise_clip=0.5,
		policy_freq=2
	):

		self.actor = Actor(state_dim, action_dim, max_action).to(device)
		self.actor_target = copy.deepcopy(self.actor)
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)
		self.critic = Critic(state_dim, action_dim).to(device)
		self.critic_target = copy.deepcopy(self.critic)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
		self.max_action = max_action
		self.discount = discount
		self.tau = tau
		self.policy_noise = policy_noise
		self.noise_clip = noise_clip
		self.policy_freq = policy_freq
		self.total_it = 0

	def select_action(self, state):
		state = torch.FloatTensor(state.reshape(1, -1)).to(device)
		return self.actor(state).cpu().data.numpy().flatten()

	def train(self, replay_buffer, batch_size=100):
		if replay_buffer.memory_size<batch_size:
			return
		self.total_it += 1
		state, action, reward,next_state, not_done = replay_buffer.sample(batch_size)
		reward = T.tensor(reward, dtype=T.float)
		not_done = T.tensor(not_done,dtype=T.float)
		next_state = T.tensor(next_state, dtype=T.float)
		state = T.tensor(state, dtype=T.float)
		action = T.tensor(action, dtype=T.float)
		with torch.no_grad():
			noise = (
				torch.randn_like(action) * self.policy_noise
			).clamp(-self.noise_clip, self.noise_clip)

			next_action = (
				self.actor_target(next_state) + noise
			).clamp(-self.max_action, self.max_action)
			target_Q1, target_Q2 = self.critic_target(next_state, next_action)
			target_Q = torch.squeeze(torch.min(target_Q1, target_Q2))
			target_Q = reward + not_done * self.discount * target_Q
		current_Q1, current_Q2 = self.critic(state, action)
		critic_loss = F.mse_loss(torch.squeeze(current_Q1), target_Q) + F.mse_loss(torch.squeeze(current_Q2), target_Q)
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()
		if self.total_it % self.policy_freq == 0:
			actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
			self.actor_optimizer.zero_grad()
			actor_loss.backward()
			self.actor_optimizer.step()
			for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

	def save(self, filename):
		torch.save(self.critic.state_dict(), filename + "_critic")
		torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")
		torch.save(self.actor.state_dict(), filename + "_actor")
		torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")

	def load(self, filename):
		self.critic.load_state_dict(torch.load(filename + "_critic"))
		self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer"))
		self.critic_target = copy.deepcopy(self.critic)
		self.actor.load_state_dict(torch.load(filename + "_actor"))
		self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
		self.actor_target = copy.deepcopy(self.actor)

In [6]:
task_dependencies={0:[2,4],
    2:[1,2]}
task_dependencies

{0: [2, 4], 2: [1, 2]}

In [7]:
resource_requirements=[(softmax(np.random.randint(0,5,size=[6])) * 10 * 50) for i in range(5)]
resource_requirements

[array([119.4708202 , 119.4708202 , 119.4708202 , 119.4708202 ,
          5.94810189,  16.16861729]),
 array([ 19.86715367,  19.86715367, 146.799513  , 146.799513  ,
        146.799513  ,  19.86715367]),
 array([ 14.32206915,  14.32206915, 287.66644869, 105.82657239,
         38.93142031,  38.93142031]),
 array([140.72607591,  51.77023016, 140.72607591,   7.00633876,
         19.04520334, 140.72607591]),
 array([  3.77980841, 206.37054669, 206.37054669,   3.77980841,
         75.91948139,   3.77980841])]

In [8]:
class Resource_Manager:
    def __init__(self,action_dims=6,resource_amount=10,max_steps=2000,):
        self.resource_amount=np.expand_dims(np.array(resource_amount),axis=0)
        self.action_dims = (1,action_dims)
        self.space_dims=(1,self.action_dims[1]+1)
        self.max_steps=max_steps
        
    def reset(self):
        self.steps_so_far=0
        self.resource_requirements=softmax(np.random.randint(0,4,size=[self.action_dims[1]])) * self.resource_amount * 50
        return self.get_state()
    
    def get_state(self):
        self.resource_amount=np.expand_dims(np.random.randint(1,20),axis=0)
        return np.concatenate([self.resource_amount,self.resource_requirements])
    
    def step(self, action):
        action=softmax(action) * self.resource_amount
        self.resource_requirements-= action    
        self.resource_requirements=np.clip(self.resource_requirements,0,max(max(self.resource_requirements),0))
        
        self.steps_so_far+=1
        if np.count_nonzero(self.resource_requirements)<1:
            return self.get_state(),100,True,False
        return self.get_state(),-1,False,self.steps_so_far>=self.max_steps

In [9]:
#set_seed(97)
def get_trained_policy():
    env =Resource_Manager()

    state_dim = env.space_dims[1]
    action_dim = env.action_dims[1]
    max_action = float(1)

    kwargs = {"state_dim": state_dim,"action_dim": action_dim,"max_action": max_action,"discount": 0.99,"tau": 0.005,}
    kwargs["policy_noise"] = 0.2 * max_action
    kwargs["noise_clip"] = 0.5 * max_action
    kwargs["policy_freq"] = 2
    policy = TD3(**kwargs)
    
    replay_buffer = ReplayBuffer(state_dim, action_dim)

    time_step=0
    start_timesteps=10000
    rewards=[]

    for episode in range(2000):
        episode_reward=0
        state = env.reset()
        done = False
        truncuated = False
        while (not done) and (not truncuated):
            action = (policy.select_action(state)+ np.random.normal(0, max_action * 0.1, size=action_dim)).clip(0, max_action)
            next_state, reward, done,truncuated = env.step(action)
            replay_buffer.add(state, action, reward,next_state, int(not(done)))
            if time_step > start_timesteps:
                policy.train(replay_buffer)
            state = next_state
            time_step+=1
            episode_reward+=reward
        rewards.append(episode_reward)
        if (episode%50)==0:
            print(f"episode:{episode} reward:{episode_reward} last 50 ep AVG:{sum(rewards[-100:])/100}")
    return copy.deepcopy(policy.actor),copy.deepcopy(TD3(**kwargs).actor)

In [10]:
allocator,random_allocator=get_trained_policy()
allocator,random_allocator

episode:0 reward:-131 last 50 ep AVG:-1.31
episode:50 reward:79 last 50 ep AVG:-17.3
episode:100 reward:39 last 50 ep AVG:-35.98
episode:150 reward:-123 last 50 ep AVG:-44.49
episode:200 reward:81 last 50 ep AVG:-40.1
episode:250 reward:5 last 50 ep AVG:-37.29
episode:300 reward:-94 last 50 ep AVG:-42.13
episode:350 reward:88 last 50 ep AVG:-44.8
episode:400 reward:-88 last 50 ep AVG:-37.85
episode:450 reward:-34 last 50 ep AVG:-21.47
episode:500 reward:-127 last 50 ep AVG:-12.84
episode:550 reward:66 last 50 ep AVG:-3.31
episode:600 reward:-72 last 50 ep AVG:-0.94
episode:650 reward:26 last 50 ep AVG:-4.56
episode:700 reward:6 last 50 ep AVG:2.82
episode:750 reward:28 last 50 ep AVG:9.63
episode:800 reward:31 last 50 ep AVG:7.93
episode:850 reward:6 last 50 ep AVG:6.8
episode:900 reward:16 last 50 ep AVG:14.1
episode:950 reward:-2 last 50 ep AVG:16.68
episode:1000 reward:51 last 50 ep AVG:18.38
episode:1050 reward:-8 last 50 ep AVG:16.76
episode:1100 reward:-18 last 50 ep AVG:14.97
ep

(Actor(
   (l1): Linear(in_features=7, out_features=256, bias=True)
   (l2): Linear(in_features=256, out_features=256, bias=True)
   (l3): Linear(in_features=256, out_features=6, bias=True)
 ),
 Actor(
   (l1): Linear(in_features=7, out_features=256, bias=True)
   (l2): Linear(in_features=256, out_features=256, bias=True)
   (l3): Linear(in_features=256, out_features=6, bias=True)
 ))

In [11]:
class Resouce_Allocator:
    def __init__(self,allocator,num_tasks,num_resources,task_dependencies=None,resource_requirements=None):
        self.task_dependencies=task_dependencies
        self.num_tasks=num_tasks
        self.num_resources=num_resources
        self.allocator=allocator
        self.resources=resource_requirements
        self.steps=0
        self.done=np.array([[False for j in range(num_tasks)] for i in range(num_resources)])
        self.terminal_state=False
    def alloc(self,available_amount,resourcces):
        state = torch.FloatTensor(np.concatenate([available_amount,resourcces]).reshape(1, -1)).to(device)
        return softmax(self.allocator(state).cpu().data.numpy().flatten()) * available_amount
        
    def solve(self):
        while not self.terminal_state: 
            available_resource=[np.expand_dims(np.array(random.randint(1,5)),axis=0) for i in range(self.num_resources)]
            self.steps+=1
            for index,done in zip(range(self.num_resources),self.done):
                if done.all(where=False):
                    resource=copy.deepcopy(self.resources[index])
                    for t_key in self.task_dependencies.keys():
                        for r_key in self.task_dependencies[t_key].keys():
                            if r_key == index and (not self.done[self.task_dependencies[t_key][r_key]][t_key]):                                
                                resource[r_key] = 0.0
                    change=self.alloc(available_resource[index],resource)
                    change[resource==0.0] = 0
                    self.resources[index]-= change
                    self.resources[index] = np.clip(self.resources[index],0,max(max(self.resources[index]),0))
            self.check_to_do_list()
        return self.steps
    def check_to_do_list(self):
        for recources,done in zip(self.resources,self.done):
            indexes=np.where(recources==0)
            for index in indexes:
                done[index]=True 
        if sum(sum(i) for i in self.resources)<=0:
            self.terminal_state=True

In [12]:

num_resources=5        
num_tasks=6

resource_requirements=[(softmax(np.random.randint(0,5,size=[num_tasks])) * 10 * 200) for i in range(num_resources)]
print(resource_requirements,"\n\n\n") 

order1={0:{2:1}}
order2={0:{1:0,3:2,4:3},1:{1:0,3:2,4:3},2:{1:0,3:2,4:3},3:{2:1,3:2,4:3},4:{2:1,3:2,4:3},5:{2:1,3:2,4:3}}

r1=Resouce_Allocator(allocator=allocator,num_tasks=num_tasks,num_resources=num_resources,task_dependencies=order1,resource_requirements= copy.deepcopy(resource_requirements))
print(r1.solve())


r2=Resouce_Allocator(allocator=allocator,num_tasks=num_tasks,num_resources=num_resources,task_dependencies=order2,resource_requirements= copy.deepcopy(resource_requirements))
print(r2.solve())


r3=Resouce_Allocator(allocator=random_allocator,num_tasks=num_tasks,num_resources=num_resources,task_dependencies=order1,resource_requirements= copy.deepcopy(resource_requirements))
print(r3.solve())


r4=Resouce_Allocator(allocator=random_allocator,num_tasks=num_tasks,num_resources=num_resources,task_dependencies=order2,resource_requirements= copy.deepcopy(resource_requirements))
print(r4.solve())


[array([692.18288001, 692.18288001,  12.67777168,  93.67676612,
       254.63985109, 254.63985109]), array([211.94155762, 576.11688477, 211.94155762, 211.94155762,
       211.94155762, 576.11688477]), array([ 93.67676612, 254.63985109, 692.18288001, 692.18288001,
        12.67777168, 254.63985109]), array([1014.88413735,  373.3550093 ,  137.34963218,   50.52810593,
         50.52810593,  373.3550093 ]), array([ 385.67336034,  141.88130028,  385.67336034, 1048.36888713,
         19.20154596,   19.20154596])] 



1334
1530
2840
4313
