<a href="https://colab.research.google.com/github/ChGol/notebooks-workplace/blob/main/notebooks/rl/Part_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Setup
Run the cells below and hide them afterwards using the arrow on the left.

In [1]:
!pip install gym[Box2D] pyvirtualdisplay pyglet > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [2]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 8.0)
import math
import glob
import io
import base64
from IPython.display import HTML

from typing import List, Tuple

import torch
from torch import nn
import torch.nn.functional as F
from collections import deque

from IPython import display as ipythondisplay
from IPython.display import display, update_display, clear_output
from time import sleep

from pyvirtualdisplay import Display
# Start a non-visible 1300x900 virtual display using the xvfb backend.
# NOTE(review): presumably needed so that env.render() works on a machine
# without a physical screen (e.g. Colab) - confirm.
xdisplay = Display(visible=0, size=(1300, 900), backend="xvfb")
xdisplay.start()


"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

class DoneWrapper(gym.Wrapper):
  """Environment wrapper whose `step` always reports `done` as False."""

  def step(self, action):
    # forward the action; the termination flag of the wrapped env is dropped
    obs, rew, _, extra = self.env.step(action)
    return obs, rew, False, extra
      

def show_video():
  """Embed a recorded episode in the notebook output.

  Looks for files matching ``video/*.mp4``. The alphabetically first match is
  base64-encoded and displayed as an inline, looping HTML ``<video>`` element.
  When nothing matches, a short message is printed instead.
  """
  # sort so that the chosen file does not depend on directory listing order
  mp4list = sorted(glob.glob('video/*.mp4'))
  if not mp4list:
    print("Could not find video")
    return

  # read-only access is enough; the context manager releases the file handle
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    
    
def wrap_env(env, done=True):
  """Wrap `env` in a `Monitor` that records into ``./video``.

  When `done` is falsy the environment is first wrapped in `DoneWrapper`,
  so the monitored environment never reports the end of an episode.
  """
  monitored = env if done else DoneWrapper(env)
  return Monitor(monitored, './video', force=True, mode='evaluation')


def print_ansi(screen, display_id='42', wait=0.5):
    """Clear the cell output, print the text held by `screen`, then sleep `wait` seconds."""
    clear_output(wait=True)
    text = screen.getvalue()
    # NOTE(review): print() returns None, so None is the object handed to
    # update_display - the text itself reaches the output through print()
    printed = print(text)
    update_display(printed, display_id=display_id)
    sleep(wait)


def plot(img):
  """Draw `img` with `imshow` on a new 8x6 figure and remove the axis ticks."""
  figure = plt.figure(figsize=(8,6))
  axes = figure.add_subplot(111)
  axes.imshow(img)
  # an empty tick list hides the ticks on both axes
  for set_ticks in (axes.set_xticks, axes.set_yticks):
    set_ticks([])

In [3]:
def gather_trajectories(env: gym.Env, policy, num_trajs: int = 10):
    """Roll out `policy` in `env` for `num_trajs` episodes.

    Returns a list with one entry per episode. Each entry is a list of
    `(obs, action, reward)` tuples, one per step, where `obs` is the
    observation the action was sampled for.
    """
    trajectories = []

    for _ in range(num_trajs):
        episode = []
        state = env.reset()
        finished = False
        while not finished:
            # ask the policy for an action and apply it to the environment
            chosen = policy.sample(state)
            following_state, reward, finished, _ = env.step(chosen)

            # store the observation that led to the action, not the resulting one
            episode.append((state, chosen, reward))
            state = following_state
        trajectories.append(episode)

    return trajectories

def calculate_return(rewards: List[float], gamma: float = 1.0) -> Tuple[float, List[float]]:
    """Calculate the episode return and the per-step returns.

    Args:
        rewards: rewards of a single episode, in the order they were received.
        gamma: discount factor applied to future rewards in the step returns.
            The default of 1.0 gives plain (undiscounted) reward-to-go.

    Returns:
        A pair `(episode_return, step_returns)`: `episode_return` is the
        undiscounted sum of all rewards and `step_returns[t]` equals
        `rewards[t] + gamma * step_returns[t + 1]`. For an empty `rewards`
        the result is `(0.0, [])`.
    """
    rewards = np.asarray(rewards)
    episode_return = np.sum(rewards)

    # walk backwards so that every step return reuses the one that follows it
    step_returns = []
    running_return = 0.0
    for reward in rewards[::-1]:
        running_return = reward + gamma * running_return
        step_returns.append(running_return)
    step_returns.reverse()

    return episode_return, step_returns

def process_trajectories(history: List):
    """Process gathered trajectories into tensors and calculate returns.

    Args:
        history: list of trajectories; every trajectory is a list of
            `(obs, action, reward)` tuples.

    Returns:
        `(obs_array, action_array, return_array, episode_returns)`, all
        float32 tensors. The first three are the observations, actions and
        step returns of all trajectories concatenated in order;
        `episode_returns` holds one value per trajectory.
    """
    obs_list = []
    action_list = []
    return_list = []
    episode_returns = []

    for traj in history:
        # split the list of (obs, action, reward) tuples into three sequences
        traj_obs, traj_actions, traj_rewards = zip(*traj)

        episode_return, step_returns = calculate_return(traj_rewards)

        episode_returns.append(episode_return)
        obs_list.extend(traj_obs)
        action_list.extend(traj_actions)
        return_list.extend(step_returns)

    # build a single numpy array first: creating a tensor straight from a
    # list of numpy arrays is a slow path that PyTorch warns about
    obs_array = torch.tensor(np.array(obs_list), dtype=torch.float32)
    action_array = torch.tensor(np.array(action_list), dtype=torch.float32)
    return_array = torch.tensor(np.array(return_list), dtype=torch.float32)
    episode_returns = torch.tensor(np.array(episode_returns), dtype=torch.float32)

    return obs_array, action_array, return_array, episode_returns

def visualize(env, policy):
    """Play one episode with `policy` on a recording-wrapped `env` and show the video."""
    recorded_env = wrap_env(env)
    state = recorded_env.reset()

    while True:
        state, _, finished, _ = recorded_env.step(policy.sample(state))
        recorded_env.render()
        if finished:
            break

    recorded_env.close()
    show_video()


class NetworkPolicy(nn.Module):
    """Stochastic policy over discrete actions backed by a small network.

    The network maps an observation to one logit per action through a single
    hidden layer of `h_dim` units with a Tanh activation.
    """

    def __init__(self, obs_dim: int, action_dim: int, h_dim: int = 16):
        super().__init__()

        self.model = nn.Sequential(nn.Linear(obs_dim, h_dim),
                                   nn.Tanh(),
                                   nn.Linear(h_dim, action_dim))

    def _logits(self, obs):
        """Run the network on `obs`, casting non-tensor input to a float32 tensor first."""
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float32)
        return self.model(obs)

    def probs(self, obs):
        """Return the action probabilities (softmax over the last dimension)."""
        return F.softmax(self._logits(obs), -1)

    def log_probs(self, obs):
        """Return the action log-probabilities (log-softmax over the last dimension)."""
        return F.log_softmax(self._logits(obs), -1)

    def sample(self, obs):
        """Sample one action for a single observation and return it as a Python int."""
        # sampling is not differentiated, so skip building the autograd graph
        with torch.no_grad():
            probs = self.probs(obs)
        return torch.multinomial(probs, 1).item()


def policy_gradient_step(policy: NetworkPolicy,
                         optimizer: torch.optim.Optimizer, 
                         obs: torch.Tensor, 
                         actions: torch.Tensor, 
                         step_returns: torch.Tensor,
                         num_trajs: int):
    """Perform one policy-gradient update on a batch of steps.

    The minimized quantity is the negated sum, over all steps, of the
    log-probability of the performed action times its return, divided by
    `num_trajs`.
    """
    # log-probabilities of every action for every observation
    all_log_probs = policy.log_probs(obs)

    # keep only the entry of the action that was actually performed
    action_index = actions.view(-1, 1).long()
    chosen_log_probs = all_log_probs.gather(1, action_index).squeeze()

    # negated so that the optimizer's minimization increases the weighted log-probabilities
    weighted_sum = (chosen_log_probs * step_returns).sum()
    loss = -weighted_sum / num_trajs

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Part 3. Value Function

(If you didn't finish the last code part or want to play around with the code more, now is a good chance as this part is significantly smaller than the last one)


## Exercise: Average Baseline
Change the function `train_policy_gradient` so that it normalizes the `step_returns` by subtracting their mean. Compare the version with normalization to the one without it. Do you see an improvement?


In [4]:
def train_policy_gradient(env: gym.Env, 
                          policy: torch.nn.Module, 
                          num_iterations: int = 100, 
                          trajs_per_gather: int = 10,
                          normalize=True):
    """Train `policy` on `env` with the policy gradient method.

    Runs `num_iterations + 1` iterations. Each one gathers
    `trajs_per_gather` trajectories with the current policy and performs a
    single policy-gradient update on them. When `normalize` is set, the mean
    step return is subtracted from the step returns before the update.
    Progress is printed every 10th iteration.
    """
    # Adam updates the weights of the policy network
    adam = torch.optim.Adam(policy.parameters(), lr=5e-3)

    for idx in range(num_iterations + 1):
        # interact with the environment using the current policy
        rollouts = gather_trajectories(env, policy, num_trajs=trajs_per_gather)

        # turn the rollouts into tensors of observations, actions and returns
        states, taken_actions, returns, ep_returns = process_trajectories(rollouts)

        # average baseline: subtract the mean return
        if normalize:
            returns = returns - returns.mean()

        policy_gradient_step(policy=policy,
                             optimizer=adam,
                             obs=states,
                             actions=taken_actions,
                             step_returns=returns,
                             num_trajs=trajs_per_gather)

        # report progress only on every 10th iteration
        if idx % 10 != 0:
            continue
        print(f"Traning iteration {idx}, mean episode returns: {ep_returns.mean():.3f}")

In [5]:
# environment: cart pole by default; use the commented line for the moon lander
# env = gym.make("LunarLander-v2")
env = gym.make("CartPole-v1")

# the network's input and output sizes are read from the environment's spaces
obs_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# a freshly initialized policy
network_policy = NetworkPolicy(obs_dim, action_dim)

# training with the average baseline switched on
train_policy_gradient(env,
                      network_policy,
                      num_iterations=100,
                      trajs_per_gather=20,
                      normalize=True)



Traning iteration 0, mean episode returns: 22.300
Traning iteration 10, mean episode returns: 33.150
Traning iteration 20, mean episode returns: 42.250
Traning iteration 30, mean episode returns: 50.250
Traning iteration 40, mean episode returns: 63.150
Traning iteration 50, mean episode returns: 98.300
Traning iteration 60, mean episode returns: 158.900
Traning iteration 70, mean episode returns: 294.900
Traning iteration 80, mean episode returns: 396.650
Traning iteration 90, mean episode returns: 500.000
Traning iteration 100, mean episode returns: 500.000


## Exercise: Building a Simple Network.

Here we will implement Policy Gradient training with a baseline function. We'll use a separate *Value Network* to estimate values $V_{\pi_\theta}$, implement its training procedure and use it in the Policy Gradient algorithm.

Your task is to create a simple fully connected neural network using pytorch's [`nn.Sequential`](https://pytorch.org/docs/stable/nn.html#torch.nn.Sequential) interface with three [`nn.Linear`](https://pytorch.org/docs/stable/nn.html#torch.nn.Linear) layers and [`nn.Tanh`](https://pytorch.org/docs/stable/nn.html#torch.nn.Tanh) activation functions. In other words, the network should have 2 hidden layers, both of size `h_dim`, and should be able to process environment observations and return a single scalar.



In [6]:
def get_value_network(env: gym.Env, h_dim: int = 32):
    """Build the value network for `env`.

    The input size is the first dimension of the environment's observation
    space. Two hidden layers of `h_dim` units with Tanh activations follow,
    and the final layer outputs a single scalar.
    """
    in_features = env.observation_space.shape[0]

    # layers in forward order: input -> hidden -> hidden -> scalar output
    layers = [
        nn.Linear(in_features, h_dim),
        nn.Tanh(),
        nn.Linear(h_dim, h_dim),
        nn.Tanh(),
        nn.Linear(h_dim, 1),
    ]
    return nn.Sequential(*layers)

## Exercise: Value Network training step

Now we need a way to train our Value Network. As we know, its task is to predict the return $R_t$ for a given state $s_t$. We have already gathered those returns for our policy gradient, so let's feed them into our Value Network.

As a reminder, we will use the Mean Squared Error as the loss function to train the Value Network:

$$ \frac{1}{M}\sum_{j=1}^{N}\sum_{t=1}^T\big(R_t(\tau_j)-V_\psi(s_t(\tau_j))\big)^2 $$
where $M$ is the total number of samples in all trajectories.

The function `value_net_step` receives a pair of tensors containing $s_i$ and $R_i$, so all you need to do here is to calculate and minimize:

$$ \frac{1}{M}\sum_{i=1}^{M}\big(R_i -V_\psi(s_i)\big)^2 $$


In [7]:
def value_net_step(obs: torch.Tensor, 
                   step_returns: torch.Tensor,
                   model: torch.nn.Module, 
                   optim: torch.optim.Optimizer):
    """Train the value network on a single batch of states and returns.

    Minimizes the mean squared error between `model(obs)` and
    `step_returns` with one step of `optim`.

    Raises:
        ValueError: if the number of predicted values differs from the
            number of returns.
    """
    # flatten both sides: subtracting an (N,) tensor from an (N, 1) tensor
    # would broadcast to (N, N) and silently train on the wrong loss
    values = model(obs).reshape(-1)
    targets = step_returns.reshape(-1)
    if values.shape != targets.shape:
        raise ValueError(
            f"got {values.shape[0]} predicted values for {targets.shape[0]} returns")

    # mean squared error
    loss = ((values - targets) ** 2).mean()

    # one optimizer step on the value network
    optim.zero_grad()
    loss.backward()
    optim.step()

Next we can reuse our `policy_gradient_step` function from previous part, all we need to do is to modify the `step_returns` passed. 

As a reminder we want to replace the returns $R_t$ with advantages  $A_t = R_t - V_t$ where $V_t$ is the value predicted by our new network.


In [8]:
def train_pg_baseline(env: gym.Env, 
                      policy: torch.nn.Module, 
                      value_network: torch.nn.Module, 
                      num_iterations: int = 100, 
                      value_net_epochs: int = 20,
                      trajs_per_gather: int = 10):
    """Train `policy` with policy gradient using `value_network` as the baseline.

    Runs `num_iterations + 1` iterations. Each one gathers
    `trajs_per_gather` trajectories, computes advantages as step returns
    minus the values predicted by `value_network`, trains the value network
    for `value_net_epochs` steps on the same batch and then performs one
    policy-gradient update with the advantages. Progress is printed every
    10th iteration.
    """
    # one Adam optimizer per network
    value_optimizer = torch.optim.Adam(value_network.parameters(), lr=5e-1)
    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)

    for idx in range(num_iterations + 1):
        # fresh rollouts of the current policy, converted to tensors
        rollouts = gather_trajectories(env, policy, num_trajs=trajs_per_gather)
        obs, actions, step_returns, ep_returns = process_trajectories(rollouts)

        # the baseline is evaluated before the value network sees this batch
        # and without gradient tracking, so it stays out of the policy's graph
        with torch.no_grad():
            baseline = value_network(obs).squeeze()
        advantage = step_returns - baseline

        # fit the value network to the raw step returns
        for _ in range(value_net_epochs):
            value_net_step(obs=obs,
                           step_returns=step_returns,
                           model=value_network,
                           optim=value_optimizer)

        # policy update weighted by the advantages instead of the returns
        policy_gradient_step(policy=policy,
                             optimizer=policy_optimizer,
                             obs=obs,
                             actions=actions,
                             step_returns=advantage,
                             num_trajs=trajs_per_gather)

        # report progress only on every 10th iteration
        if idx % 10 != 0:
            continue
        print(f"Traning iteration {idx}, mean episode returns: {ep_returns.mean():.3f}")

All that's left is to run our Policy Gradient with a baseline function.

In [9]:
# environment: cart pole by default; use the commented line for the moon lander
# env = gym.make("LunarLander-v2")
env = gym.make("CartPole-v1")

# the network's input and output sizes are read from the environment's spaces
obs_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# freshly initialized policy and value networks
network_policy = NetworkPolicy(obs_dim, action_dim)
value_network = get_value_network(env)

# training with the value network as the baseline
train_pg_baseline(env,
                  policy=network_policy,
                  value_network=value_network,
                  num_iterations=100,
                  trajs_per_gather=20)


Traning iteration 0, mean episode returns: 26.350
Traning iteration 10, mean episode returns: 40.900
Traning iteration 20, mean episode returns: 41.650
Traning iteration 30, mean episode returns: 57.000
Traning iteration 40, mean episode returns: 54.800
Traning iteration 50, mean episode returns: 85.250
Traning iteration 60, mean episode returns: 171.250
Traning iteration 70, mean episode returns: 352.100
Traning iteration 80, mean episode returns: 454.050
Traning iteration 90, mean episode returns: 494.050
Traning iteration 100, mean episode returns: 500.000


## Bonus Exercise 1
Try to play with `value_net_epochs` to find a balance between computation cost and sample efficiency. How many value network epochs are needed for it to perform better than the simple average baseline?

## Bonus Exercise 2
It might be a good idea for value network and policy to share some parameters, since they both need to extract important features from the data.

Build a neural network such that:
1. The first two layers process the state $s$ to get the representation $f(s)$
2. The representation $f(s)$ is then passed to the policy network which, based on that, returns the vector of probabilities $\pi_\theta$.
3. The representation $f(s)$ is also passed to the value network which returns the value $V_\psi(s)$.

Try to train the whole network end-to-end, so that the first two layers are updated both when minimizing the value network loss and policy loss. Do you see an improvement?