In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import sleep
from tqdm import tqdm_notebook

In [2]:
class Net(nn.Module):
    """Actor-critic network: a shared MLP trunk feeding a policy head and a value head.

    Args:
        obs_dim: Size of one observation vector (default 4).
        n_actions: Number of discrete actions, i.e. width of the policy head (default 2).
        hidden_dims: Widths of the trunk's hidden layers, in order (default (64, 128)).

    With the default arguments the architecture, the module names (``fe``,
    ``policy``, ``value``) and the parameter creation order match the original
    hard-coded network.
    """

    def __init__(self, obs_dim=4, n_actions=2, hidden_dims=(64, 128)):
        super().__init__()
        layers = []
        in_features = obs_dim
        for width in hidden_dims:
            # In-place ReLU, as in the original trunk.
            layers.extend([nn.Linear(in_features, width), nn.ReLU(True)])
            in_features = width
        # Shared feature extractor; an empty Sequential passes its input through.
        self.fe = nn.Sequential(*layers)
        self.policy = nn.Linear(in_features, n_actions)
        self.value = nn.Linear(in_features, 1)

    def forward(self, observation):
        """Return ``(policy_logits, state_value)`` for a tensor of observations.

        The last dimension of ``observation`` must equal ``obs_dim``. The
        outputs keep the leading dimensions and end in ``n_actions`` and ``1``
        respectively.
        """
        f = self.fe(observation)
        return self.policy(f), self.value(f)

# Single shared actor-critic model and the Adam optimizer that updates all of its parameters.
net = Net()
optimizer = torch.optim.Adam(params=net.parameters(), lr=1e-3)

In [3]:
def act(observation):
    """Sample one action per observation from the current policy of ``net``.

    Args:
        observation: A batch of observations (sequence or array, one row per
            environment). A single observation must be wrapped in a list,
            because the result is indexed as a 2-D ``(batch, 1)`` tensor.

    Returns:
        A NumPy array holding one sampled action index per batch row.
    """
    # Action selection never needs gradients.
    with torch.no_grad():
        logit, _ = net(torch.tensor(observation, dtype=torch.float32))
    p = F.softmax(logit, dim=-1)
    # Draw one sample per row from the categorical distribution defined by p.
    action = torch.multinomial(p, 1)
    return action[:, 0].cpu().numpy()

In [4]:
def make_env():
    """Create and return a new 'CartPole-v0' environment via ``gym.make``."""
    return gym.make('CartPole-v0')

class EnvPool:
    """A fixed-size collection of environments that are reset and stepped together.

    Each environment is expected to expose ``reset() -> observation`` and
    ``step(action) -> (observation, reward, done, info)``.
    """

    def __init__(self, n_envs):
        """Create ``n_envs`` environments using ``make_env``."""
        self.environments = [make_env() for _ in range(n_envs)]

    def reset(self):
        """Reset every environment and return the list of initial observations."""
        return [env.reset() for env in self.environments]

    def step(self, actions):
        """Apply one action to each environment.

        Args:
            actions: Iterable with exactly one action per environment, in pool order.

        Returns:
            ``(observations, rewards, dones)`` as three lists in pool order.
            An environment that reported ``done`` is reset immediately, and its
            observation slot holds the fresh post-reset observation. Its reward
            and done flag still describe the step that ended the episode.

        Raises:
            ValueError: If the number of actions differs from the number of
                environments. Without this check ``zip`` would silently skip
                the surplus environments or actions.
        """
        actions = list(actions)
        if len(actions) != len(self.environments):
            raise ValueError(
                "expected %d actions, got %d" % (len(self.environments), len(actions))
            )
        if not self.environments:
            # Nothing to step; unpacking zip(*[]) would otherwise fail.
            return [], [], []

        results = [env.step(a) for env, a in zip(self.environments, actions)]
        # Transpose per-env tuples into per-field lists; the info dicts are dropped.
        observation, reward, done, _ = map(list, zip(*results))

        for i, env in enumerate(self.environments):
            if done[i]:
                observation[i] = env.reset()

        return observation, reward, done

In [5]:
# Discount factor applied to the bootstrapped next-state value during training.
gamma = 0.99
# Pool of 20 environments that the training loop steps together.
env_pool = EnvPool(n_envs=20)

In [6]:
# One-step advantage actor-critic update, run on all pooled environments at once.
#
# Q(s, a) = V(s) + A(s, a)
# A(s, a) = Q(s, a) - V(s) = r(s, a) + gamma * V(s') - V(s)
# V(s) = E_a [r(s, a) + gamma * V(s')]

observations = env_pool.reset()
for step in tqdm_notebook(range(5000)):
    # Sample actions from the current policy (act() runs without gradients).
    actions = act(observations)

    # Forward pass tracked by autograd: differentiable log-probabilities and state values.
    logits, value = net(torch.tensor(observations, dtype=torch.float32))
    # log_softmax rather than log(softmax(.)): the latter yields -inf, and NaN in the
    # entropy term (0 * -inf), as soon as a probability underflows to zero.
    log_p = F.log_softmax(logits, dim=-1)
    p = log_p.exp()
    # Batch size comes from the logits, so it cannot drift from the pool size.
    batch_index = torch.arange(logits.shape[0])
    log_p_for_actions = log_p[batch_index, torch.as_tensor(actions, dtype=torch.long)]

    observations, rewards, done = env_pool.step(actions)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    # 0.0 where the episode ended, so no value is bootstrapped across an episode boundary.
    not_done = 1.0 - torch.tensor(done, dtype=torch.float32)

    # The bootstrap target is treated as a constant: no gradient flows through V(s').
    with torch.no_grad():
        _, next_value = net(torch.tensor(observations, dtype=torch.float32))

    advantage = rewards + gamma * not_done * next_value[:, 0] - value[:, 0]
    # The advantage is detached so the policy term only updates the policy via log_p.
    policy_loss = (log_p_for_actions * advantage.detach()).mean()
    # Squared one-step TD error trains the value head.
    value_loss = (advantage ** 2).mean()
    entropy = -(p * log_p).sum(-1).mean()

    optimizer.zero_grad()
    # Minimize the value error while maximizing the policy objective and the entropy bonus.
    loss = value_loss - policy_loss - 0.01 * entropy
    loss.backward()
    optimizer.step()

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [7]:
# Separate environment used only for evaluation and rendering.
# `.env` takes the inner environment of the object gym.make returns
# (presumably this drops the episode step-limit wrapper; confirm against the installed gym version).
environment = gym.make('CartPole-v0').env

def play_episode(render=False, max_steps=None):
    """Run one episode in ``environment`` with actions sampled by ``act``.

    Args:
        render: If true, render the environment after every step and sleep
            10 ms so the animation is watchable.
        max_steps: Optional cap on the number of steps. ``None`` (the default)
            keeps the original behavior of running until the environment
            reports ``done``, which has no bound of its own.

    Returns:
        The list of per-step rewards collected during the episode.
    """
    observation = environment.reset()
    rewards = []
    done = False
    while not done and (max_steps is None or len(rewards) < max_steps):
        # act() expects a batch, so the single observation is wrapped in a list.
        action = act([observation])
        observation, reward, done, _ = environment.step(action[0])
        if render:
            environment.render()
            sleep(0.01)

        rewards.append(reward)
    return rewards

In [8]:
# Play one rendered episode with the trained policy and report its total reward.
r = play_episode(render=True)
print(sum(r))

215.0
