[Hands on RL Policy Gradient](https://github.com/PacktPublishing/Hands-on-Reinforcement-Learning-with-PyTorch/blob/master/Section%204/4.3%20Policy%20Gradients%20REINFORCE.ipynb)

[Policy Gradient Math](https://towardsdatascience.com/policy-gradients-in-reinforcement-learning-explained-ecec7df94245)

A widely used variation of REINFORCE is to subtract a baseline value from the return to reduce the variance of gradient estimation while keeping the bias unchanged (Remember we always want to do this when possible). For example, a common baseline is to subtract state-value from action-value, and if applied, we would use advantage:

$$
A(s,a) = Q(s,a) - V(s)
$$

in the gradient ascent update. This [post](https://danieltakeshi.github.io/2017/03/28/going-deeper-into-reinforcement-learning-fundamentals-of-policy-gradients/) nicely explained why a baseline works for reducing the variance, in addition to a set of fundamentals of policy gradient.

In [1]:
#!pip install swig
#!pip install gymnasium[box2d]

## Policy Gradient with Baseline

![Reinforce_bl](reinforce_bl2.png) 

In [58]:
import torch
from torch import cuda, device, distributions
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from torch.distributions import Categorical
import math

import gymnasium as gym
import os, random
from pathlib import Path
from collections import deque

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

In [59]:
env_id = "LunarLander-v2"
env = gym.make(env_id)#,render_mode="human")

s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  8
Sample observation [-0.5759283   0.238531   -4.376683   -4.756889   -1.4022433  -3.6932883
  0.8724708   0.33947608]


In [60]:
device = device("cuda:0" if cuda.is_available() else "cpu")

In [103]:
def calc_disc_return(r_t , gamma = 0.998):

    G_t = deque(maxlen = len(r_t))
    G_t.append(r_t[-1])

    for i in reversed(r_t[:-1]):
        disc = i + (gamma*G_t[0])
        G_t.appendleft(disc)

    return np.array(G_t)

class ActorCriticNet(nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(ActorCriticNet, self).__init__()
        self.pi_1 = nn.Linear(state_size, hidden_size)
        self.pi_2 = nn.Linear(hidden_size, hidden_size)
        self.policy = nn.Linear(hidden_size, action_size)

        self.v_1 = nn.Linear(state_size, hidden_size)
        self.v_2 = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, 1)
    
    def forward(self, obs):
        x1 = torch.clamp(obs,-1.1,1.1)
        x1 = F.relu(self.pi_1(x1))
        x1 = F.relu(self.pi_2(x1))

        x2 = torch.clamp(obs,-1.1,1.1)
        x2 = F.relu(self.v_1(x2))
        x2 = F.relu(self.v_2(x2))
        
        return F.softmax(self.policy(x1),dim = 1), self.value(x2)
        
      
class Agent():
    def __init__(self, state_size, action_size, hidden_size, gamma = 0.99, learning_rate = 0.001):
        
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size
        self.gamma = gamma
        
        self.ac_net = ActorCriticNet(state_size, action_size, hidden_size).to(device)
        self.optimizer = optim.Adam(self.ac_net.parameters(), lr = learning_rate)

    '''
    def forward(self, obs):

        p_vals = self.policy_net(obs)
        v = self.value_net(state_t) 
        
        return p_vals, v
    '''
    

In [104]:
hidden_layer = 64
gamma = 0.995
policy_lr = 0.001
value_lr = 0.001
episodes = 100_000
avg_win_size = 50
epi_results = deque(maxlen=avg_win_size)

agent = Agent(s_size, a_size, hidden_layer)

In [105]:
for epi in range(10):

    if not train:
        print("set train flag to True for Training")
        break
        
    s = env.reset()[0]
    term , trunc = False, False
    rewards, states , actions = [], [], []
    win = 0

    while not any([term, trunc]):

        states.append(s)
        obs = torch.FloatTensor(np.expand_dims(s,0)).to(device)

        with torch.no_grad():
            p_vals,_ = agent.ac_net(obs)
            p_vals = torch.squeeze(p_vals)

        p_vals = p_vals.detach().cpu().numpy()
        a = np.random.choice(a_size, p=p_vals)

        s_, r, done ,trunc, _  = env.step(a)
        actions.append(a)
        rewards.append(r)
        s=np.copy(s_)



In [106]:
state_t = torch.FloatTensor(states).to(device)
action_t = torch.LongTensor(actions).to(device).view(-1,1)
return_t = torch.FloatTensor(calc_disc_return(rewards, gamma)).to(device).view(-1,1)

In [108]:
pi, values = agent.ac_net.forward(state_t)

In [110]:
critic_loss = (return_t-values)**2

In [112]:
_, v = agent.ac_net.forward(state_t)

R = v[-1]*(1-int(True))

batch_return = []
for reward in rewards[::-1]:
    R = reward + gamma*R
    batch_return.append(R)
batch_return.reverse()
batch_return = torch.tensor(batch_return, dtype=torch.float)

In [113]:
batch_return

tensor([-12166.2705, -12226.3809, -12286.4609, -12346.2871, -12406.2910,
        -12467.5439, -12529.0615, -12591.8486, -12655.3340, -12719.1377,
        -12783.6641, -12848.7881, -12911.5684, -12975.7285, -13040.2451,
        -13103.9502, -13168.8018, -13232.5176, -13298.8740, -13363.3457,
        -13430.3027, -13497.4570, -13564.1152, -13630.8779, -13697.0986,
        -13763.4834, -13831.0264, -13897.8623, -13967.1279, -14035.8818,
        -14103.9512, -14174.2539, -14242.4209, -14312.5391, -14382.9941,
        -14452.5020, -14524.3555, -14595.7988, -14666.2480, -14736.9248,
        -14809.7852, -14881.1387, -14952.6963, -15024.6777, -15096.8232,
        -15169.0215, -15241.3594, -15314.5078, -15389.6172, -15464.3994,
        -15540.7988, -15615.2656, -15690.2021, -15765.2021, -15842.9072,
        -15918.9082, -15994.6279, -16071.6152, -16150.8018, -16230.7627,
        -16310.1182, -16389.8711, -16470.0234, -16548.0352, -16629.8535,
        -16710.2949, -16792.0547, -16874.2109, -169

In [91]:
return_t = torch.FloatTensor(calc_disc_return(rewards, gamma)).to(device)

In [93]:
pi, values = agent.forward(state_t)
values = values.squeeze()
critic_loss = (return_t-values)**2

tensor([ 6325.3652,  6025.6157,  5811.7891,  5676.4771,  5584.6191,  5360.4746,
         5203.5566,  5133.5522,  5019.4141,  4912.9917,  4751.9443,  4574.1519,
         4447.8901,  4276.2393,  4509.3032,  4732.8408,  4640.6831,  4867.9355,
         4751.7153,  5173.2993,  5719.9565,  6601.9297,  6451.7129,  6310.8389,
         6179.1172,  6104.8530,  6014.9248,  5933.4595,  5860.3853,  5863.0269,
         5735.8169,  5599.9106,  5461.5483,  5301.7437,  5655.2256,  5543.0977,
         5527.6865,  5466.0093,  5412.0430,  5277.6958,  5129.0117,  5733.6655,
         5710.2446,  5645.8188,  6125.1030,  6174.2842,  6053.5503,  5929.5088,
         5866.4551,  5899.8320,  6520.4727,  7150.3633,  7102.9341,  6917.8740,
         6914.9277,  6830.8901,  6796.6987,  6798.1631,  6671.7920,  6587.0317,
         6755.8638, 10009.8311], device='cuda:0', grad_fn=<PowBackward0>)

In [90]:
values.shape

torch.Size([62])