<a href="https://colab.research.google.com/github/Benteaux/sketchbook/blob/main/REINFORCE_vpg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

read later:
1. https://arxiv.org/abs/1506.02438
2. https://openai.com/research/vpt
3. https://openai.com/research/openai-gym-beta

In [None]:
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import numpy as np
import gym
from gym.spaces import Discrete, Box

In [None]:
# feedforward network
def mlp(sizes, activation = nn.Tanh, output_activation = nn.Identity):
  """Build a feedforward network as an nn.Sequential.

  Args:
    sizes: layer widths, input first and output last; consecutive pairs
      (sizes[j], sizes[j + 1]) become one nn.Linear layer each.
    activation: module class instantiated after every Linear layer except the last.
    output_activation: module class instantiated after the last Linear layer.

  Returns:
    nn.Sequential alternating Linear layers and activation modules.
  """
  layers = []
  for j in range(len(sizes) - 1):
    act = activation if j < len(sizes) - 2 else output_activation
    # The activation must be its own list element: nn.Linear's third positional
    # parameter is `bias`, so passing act() there silently drops the nonlinearity.
    layers += [nn.Linear(sizes[j], sizes[j + 1]), act()]
  return nn.Sequential(*layers)

1. Make the environment
2. Define the policy for the environment
3. Compute actions via the policy
4. Compute a loss from the log-probabilities of those actions, weighted by the episode returns

In [None]:
def train(env_name = 'CartPole-v0', hidden_sizes = [32], lr = 1e-2,
          epochs = 50, batch_size = 5000, render = False):
  """Train a categorical policy on a gym environment with a simple policy gradient.

  Args:
    env_name: environment id passed to gym.make.
    hidden_sizes: widths of the hidden layers of the logits network.
    lr: learning rate for the Adam optimizer.
    epochs: number of policy updates; one batch of episodes is collected per update.
    batch_size: step budget per epoch; collection stops at the end of the first
      episode after which more than batch_size steps have been gathered.
    render: if True, call env.render() during the first episode of every epoch.

  Returns:
    None. Loss, mean return and mean episode length are printed once per epoch.

  Raises:
    AssertionError: if the observation space is not a Box or the action space
      is not Discrete.
  """

  # make environment
  env = gym.make(env_name)
  assert isinstance(env.observation_space, Box), \
    "Error: This example only works for envs with continuous state spaces."
  assert isinstance(env.action_space, Discrete), \
    "Error: this example only works for environments with discrete action spaces"

  obs_dim = env.observation_space.shape[0]
  act_dim = env.action_space.n

  # list(...) so that a tuple of hidden sizes is accepted as well
  logits_net = mlp(sizes = [obs_dim] + list(hidden_sizes) + [act_dim])

  # get the policy - action distribution for observations of our states
  def get_policy(obs):
    logits = logits_net(obs)
    return Categorical(logits = logits)

  # assumes a single observation; samples one action from the policy as a Python int
  def get_action(obs):
    return get_policy(obs).sample().item()

  # mean over the batch of -(log-prob of the taken action * its weight)
  def compute_loss(obs, act, weights):
    logp = get_policy(obs).log_prob(act)
    return -(logp * weights).mean()

  optimizer = Adam(logits_net.parameters(), lr = lr)

  # reset the env and return only the observation; newer gym versions
  # return an (obs, info) tuple, older ones return the observation alone
  def reset_env():
    out = env.reset()
    return out[0] if isinstance(out, tuple) else out

  # step the env and return (obs, rew, done); newer gym versions return
  # (obs, rew, terminated, truncated, info), older ones (obs, rew, done, info)
  def step_env(act):
    out = env.step(act)
    if len(out) == 5:
      next_obs, rew, terminated, truncated, _ = out
      return next_obs, rew, bool(terminated or truncated)
    next_obs, rew, done, _ = out
    return next_obs, rew, done

  # for training the policy
  def train_one_epoch():
    batch_obs = []      # observations
    batch_acts = []     # actions
    batch_weights = []  # weights for our trajectory returns
    batch_rets = []     # for trajectory returns
    batch_lens = []     # for trajectory lengths

    obs = reset_env() # first obs from starting dist
    done = False      # are we done or not
    ep_rews = []      # rewards acquired throughout episode

    # flips to True once the first episode of this epoch ends, so that
    # only that first episode is rendered
    finished_rendering_epoch = False

    while True:

      # render
      if (not finished_rendering_epoch) and render:
        env.render()

      # save current observation
      batch_obs.append(obs.copy())

      # act
      act = get_action(torch.as_tensor(obs, dtype = torch.float32))
      obs, rew, done = step_env(act)

      # save action & reward
      batch_acts.append(act)
      ep_rews.append(rew)

      if done:

        ep_ret, ep_len = sum(ep_rews), len(ep_rews)
        batch_rets.append(ep_ret)
        batch_lens.append(ep_len)

        # weight for each logprob(a|s) is the return for the trajectory
        batch_weights += [ep_ret] * ep_len

        # reset values
        obs, done, ep_rews = reset_env(), False, []

        # don't render again this epoch, i.e. only render one trajectory per epoch
        finished_rendering_epoch = True

        # break if we have all our episodes for this epoch
        if len(batch_obs) > batch_size:
          break

    optimizer.zero_grad()
    # Stack the observations into one ndarray first: building a tensor from a
    # list of ndarrays is the slow path PyTorch warns about.
    # Actions are category indices, so they are stored as integers.
    batch_loss = compute_loss(obs = torch.as_tensor(np.array(batch_obs), dtype = torch.float32),
                              act = torch.as_tensor(batch_acts, dtype = torch.int64),
                              weights = torch.as_tensor(batch_weights, dtype = torch.float32))
    batch_loss.backward()
    optimizer.step()
    # return a plain float so the autograd graph is not kept alive by the caller
    return batch_loss.item(), batch_rets, batch_lens

  try:
    for i in range(epochs):
      batch_loss, batch_rets, batch_lens = train_one_epoch()
      print(f'epoch: {i:3d} \t loss: {batch_loss:.3f} \t return: {np.mean(batch_rets):.3f} \t ep_len: {np.mean(batch_lens):.3f}')
  finally:
    # release the environment (and any render window) even if training is interrupted
    env.close()

In [None]:
# Run training on CartPole-v1 with rendering switched on.
train(
  env_name = 'CartPole-v1',
  hidden_sizes = [32],
  lr = 1e-2,
  epochs = 50,
  batch_size = 5000,
  render = True,
)

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):
  batch_loss = compute_loss(obs = torch.as_tensor(batch_obs, dtype = torch.float32),


epoch:   0 	 loss: 18.276 	 return: 20.727 	 ep_len: 20.727
epoch:   1 	 loss: 20.534 	 return: 22.786 	 ep_len: 22.786
epoch:   2 	 loss: 23.125 	 return: 26.120 	 ep_len: 26.120
epoch:   3 	 loss: 24.620 	 return: 27.522 	 ep_len: 27.522
epoch:   4 	 loss: 28.486 	 return: 32.348 	 ep_len: 32.348
epoch:   5 	 loss: 31.794 	 return: 37.154 	 ep_len: 37.154
epoch:   6 	 loss: 33.761 	 return: 40.508 	 ep_len: 40.508
epoch:   7 	 loss: 35.423 	 return: 42.227 	 ep_len: 42.227
epoch:   8 	 loss: 39.483 	 return: 46.907 	 ep_len: 46.907
epoch:   9 	 loss: 32.011 	 return: 42.441 	 ep_len: 42.441
epoch:  10 	 loss: 40.096 	 return: 50.717 	 ep_len: 50.717
epoch:  11 	 loss: 37.258 	 return: 48.553 	 ep_len: 48.553
epoch:  12 	 loss: 51.118 	 return: 66.461 	 ep_len: 66.461
epoch:  13 	 loss: 47.593 	 return: 62.825 	 ep_len: 62.825
epoch:  14 	 loss: 54.957 	 return: 70.465 	 ep_len: 70.465
epoch:  15 	 loss: 55.876 	 return: 79.672 	 ep_len: 79.672
epoch:  16 	 loss: 53.468 	 return: 72.6