In [9]:
import gym
import numpy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.distributions import Categorical

from collections import namedtuple


In [10]:
class SharedAdam(T.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = T.tensor(0.0).share_memory_()
                state['exp_avg'] = T.zeros_like(p.data).share_memory_()
                state['exp_avg_sq'] = T.zeros_like(p.data).share_memory_()


NameError: name 'T' is not defined

In [None]:
class A3C_NET(nn.Module):
    def __init__(self, input_dims, output_dims, fc1, fc2):
        super(A3C_NET, self).__init__()
        self.fc1 = nn.Linear(*input_dims, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.fc3 = nn.Linear(fc2, output_dims)

    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))
        return x


In [14]:
class ACTOR_CRITIC_AGENT:
    def __init__(self, input_dims, n_actions, gamma=0.99):
        super(ACTOR_CRITIC_AGENT, self).__init__()
        self.gamma = gamma

        self.actor = A3C_NET(input_dims, n_actions, 64, 128)
        self.critic = A3C_NET(input_dims, 1, 64, 128)

        self.rewards = []
        self.states = []
        self.actions = []

    def push(self, state, action, reward):
        self.rewards.append(reward)
        self.actions.append(action)
        self.states.append(state)

    def reset_mem(self):
        self.rewards.clear()
        self.states.clear()
        self.actions.clear()

    def choose_action(self, state):
        state = torch.tensor([state], dtype=torch.float)

        actions = self.actor(state)
        probs = F.softmax(actions, dim=1)

        dist = Categorical(probs)
        action = dist.sample().numpy()[0]

        return action

    def calc_returns(self, done):
        states = torch.tensor(self.states, dtype=torch.float)
        values = self.critic(states)

        R = values[-1] * (1 - int(done))
        returns = []

        for reward in reversed(self.rewards):
            R = R * self.gamma + reward
            returns.append(R)
        returns.reverse()

        return torch.tensor(returns, dtype=torch.float)


In [18]:
class A3C_AGENT(mp.Process):
    def __init__(self, global_actor_critic, optimizer, input_dims, n_actions,
                 gamma, lr, name, global_ep_idx, env_id):
        super(A3C_AGENT, self).__init__()
        self.local_actor_critic = ACTOR_CRITIC_AGENT(input_dims, n_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = 'w%02i' % name
        self.episode_idx = global_ep_idx
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def run(self):
        t_step = 1
        while self.episode_idx.value < N_GAMES:
            
            done = False
            observation = list(self.env.reset()[0])
            score = 0
            self.local_actor_critic.clear_memory()

            while not done:
                action = self.local_actor_critic.choose_action(observation)
                observation_, reward, truncated, terminated, info = self.env.step(action)
                score += reward

                done = terminated or truncated

                self.local_actor_critic.remember(observation, action, reward)

                if t_step % T_MAX == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()

                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(
                        self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()

                t_step += 1
                observation_ = list(observation_)
                observation = observation_

            with self.episode_idx.get_lock():
                self.episode_idx.value += 1
                print(self.name, ' episode ', self.episode_idx.value, ' reward %.1f ' % score)


In [17]:
lr = 1e-4
env_id = 'CartPole-v1'
n_actions = 2
input_dims = [4]
N_GAMES = 3000
T_MAX = 5
global_actor_critic = ACTOR_CRITIC_AGENT(input_dims, n_actions)
global_actor_critic.share_memory()
optim = SharedAdam(global_actor_critic.parameters(), lr=lr,
                   betas=(0.92, 0.999))
global_ep = mp.Value('i', 0)

workers = [A3C_AGENT(global_actor_critic,
                 optim,
                 input_dims,
                 n_actions,
                 gamma=0.99,
                 lr=lr,
                 name=i,
                 global_ep_idx=global_ep,
                 env_id=env_id) for i in range(mp.cpu_count())]
[w.start() for w in workers]
[w.join() for w in workers]

TypeError: empty(): argument 'size' must be tuple of ints, but found element of type list at pos 2