In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import sleep
from tqdm import tqdm_notebook

In [2]:
# Discount factor applied to future rewards when computing returns.
gamma = 0.99

# CartPole-v0, reached through the `.env` attribute of what gym.make returns.
# NOTE(review): presumably this strips gym's outer TimeLimit wrapper so that
# episodes are not cut off by the wrapper's step cap — confirm against the
# installed gym version.
environment = gym.make('CartPole-v0').env

In [3]:
# Policy network: maps a 4-dimensional observation to 2 unnormalised action
# scores (logits). Turning them into (log-)probabilities happens in act().
net = nn.Sequential(
    nn.Linear(in_features=4, out_features=64),
    nn.ReLU(inplace=True),
    nn.Linear(in_features=64, out_features=128),
    nn.ReLU(inplace=True),
    nn.Linear(in_features=128, out_features=2),
)
# Adam over every parameter of the policy network.
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.001)

In [4]:
def act(observation):
    """Sample one action from the policy ``net`` for a single observation.

    Args:
        observation: A single environment state; anything ``torch.tensor``
            can convert to a float32 tensor that ``net`` accepts.

    Returns:
        A tuple ``(action, log_p_for_action, entropy)`` where ``action`` is
        the sampled action index as a Python int, ``log_p_for_action`` is the
        log-probability of that action (a 0-dim tensor still attached to the
        autograd graph), and ``entropy`` is the entropy of the full action
        distribution (also a 0-dim tensor attached to the graph).
    """
    logits = net(torch.tensor(observation, dtype=torch.float32))
    # log_softmax is evaluated in a numerically stable way. Computing
    # log(softmax(x)) instead produces -inf as soon as one probability
    # underflows to 0, and the entropy term then becomes 0 * -inf = NaN,
    # which would poison the loss and every parameter after the next update.
    log_p = F.log_softmax(logits, dim=-1)
    p = log_p.exp()
    action = torch.multinomial(p, 1).item()
    log_p_for_action = log_p[action]
    entropy = -(p * log_p).sum()
    return action, log_p_for_action, entropy

In [5]:
def play_episode(render=False, max_steps=None):
    """Roll out one episode in the global ``environment`` using ``act``.

    Args:
        render: When true, draw the environment after every step and pause
            0.1 s so the animation can be followed.
        max_steps: Optional upper bound on the number of environment steps.
            ``None`` (the default) runs until the environment reports
            ``done``. Because this loop has no other exit condition, a policy
            that never fails would otherwise keep the episode (and the
            autograd graph accumulated in the returned tensors) growing
            without limit; pass a positive integer to cap it.

    Returns:
        Three parallel lists with one entry per step taken: the
        log-probability of each chosen action, the policy entropy at each
        step, and the reward received for each step.

    Raises:
        ValueError: If ``max_steps`` is given but is smaller than 1.
    """
    if max_steps is not None and max_steps < 1:
        raise ValueError("max_steps must be a positive integer or None")

    observation = environment.reset()
    log_p_for_actions, entropy_for_actions, rewards = [], [], []
    done = False
    # One reward is appended per step, so len(rewards) is the step count.
    while not done and (max_steps is None or len(rewards) < max_steps):
        action, log_p_for_action, entropy_for_action = act(observation)
        observation, reward, done, _ = environment.step(action)
        if render:
            environment.render()
            sleep(0.1)

        log_p_for_actions.append(log_p_for_action)
        entropy_for_actions.append(entropy_for_action)
        rewards.append(reward)
    return log_p_for_actions, entropy_for_actions, rewards

In [6]:
def calculate_discounted_reward(reward):
    """Compute the discounted return-to-go for every step of an episode.

    Entry ``t`` of the result equals
    ``reward[t] + gamma * reward[t + 1] + gamma**2 * reward[t + 2] + ...``
    using the module-level discount factor ``gamma``.

    Args:
        reward: Sequence of per-step rewards in chronological order. May be
            empty.

    Returns:
        A list with the same length as ``reward`` holding the discounted
        return from each step onward; an empty list for empty input.
    """
    returns_to_go = []
    running_return = 0.0
    # Walk backwards so each step reuses the return already accumulated for
    # the step after it.
    for r in reversed(reward):
        running_return = r + gamma * running_return
        returns_to_go.append(running_return)
    returns_to_go.reverse()
    return returns_to_go

In [7]:
def training_step():
    """Play one episode and apply a single policy-gradient update from it.

    The loss is the negated mean of ``log_prob * discounted_return`` over the
    episode's steps, minus ``0.01`` times the mean policy entropy.

    Returns:
        A tuple ``(loss, first_return)``: the scalar loss as a Python float
        and the discounted return measured from the episode's first step.
    """
    optimizer.zero_grad()

    log_probs, entropies, episode_rewards = play_episode()
    returns_to_go = calculate_discounted_reward(episode_rewards)

    weighted_log_probs = torch.stack(log_probs) * torch.tensor(returns_to_go)
    policy_objective = weighted_log_probs.mean()
    mean_entropy = torch.stack(entropies).mean()

    # Minimising this loss maximises the policy objective and, with a small
    # weight, the entropy of the action distribution.
    loss = -policy_objective - 0.01 * mean_entropy
    loss.backward()
    optimizer.step()

    return loss.item(), returns_to_go[0]

In [8]:
# Training loop: every training_step() call plays one full episode and then
# performs one optimizer update, so range(1000) is the number of training
# episodes (not the number of environment steps).
# NOTE(review): tqdm_notebook is reportedly deprecated in newer tqdm releases
# in favour of tqdm.notebook.tqdm / tqdm.auto.tqdm — confirm against the
# installed tqdm version before switching.
for _ in tqdm_notebook(range(1000)):
    training_step()

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [9]:
# Play one more episode with the current policy (no parameter update; actions
# are still sampled stochastically by act) and display its total
# undiscounted reward as the cell output.
_, _, r = play_episode()
sum(r)

122.0