[Hands on RL Policy Gradient](https://github.com/PacktPublishing/Hands-on-Reinforcement-Learning-with-PyTorch/blob/master/Section%204/4.3%20Policy%20Gradients%20REINFORCE.ipynb)

[Policy Gradient Math](https://towardsdatascience.com/policy-gradients-in-reinforcement-learning-explained-ecec7df94245)

A widely used variation of REINFORCE is to subtract a baseline value from the return to reduce the variance of gradient estimation while keeping the bias unchanged (Remember we always want to do this when possible). For example, a common baseline is to subtract state-value from action-value, and if applied, we would use advantage:

$$
A(s,a) = Q(s,a) - V(s)
$$

in the gradient ascent update. This [post](https://danieltakeshi.github.io/2017/03/28/going-deeper-into-reinforcement-learning-fundamentals-of-policy-gradients/) nicely explained why a baseline works for reducing the variance, in addition to a set of fundamentals of policy gradient.

In [None]:
#!pip install swig
#!pip install gymnasium[box2d]

## Policy Gradient with Baseline

![Reinforce_bl](acritic.png) 

In [1]:
import torch
from torch import cuda, device, distributions
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from torch.distributions import Categorical
import math

import gymnasium as gym
import os, random
from pathlib import Path
from collections import deque

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from IPython.display import clear_output

In [2]:
env_id = "LunarLander-v2"
env = gym.make(env_id)#,render_mode="human")

s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  8
Sample observation [-0.9573402   0.90683043  2.0253897   0.03843589 -1.9880785  -1.2015078
  0.12112498  0.19466022]


In [3]:
device = device("cuda:0" if cuda.is_available() else "cpu")

In [4]:
def calc_disc_return(r_t , gamma = 0.998):

    G_t = deque(maxlen = len(r_t))
    G_t.append(r_t[-1])

    for i in reversed(r_t[:-1]):
        disc = i + (gamma*G_t[0])
        G_t.appendleft(disc)

    return np.array(G_t)

class Actor(nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(Actor, self).__init__()
        self.dense_layer_1 = nn.Linear(state_size, hidden_size)
        self.dense_layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = torch.clamp(x,-1.1,1.1)
        x = F.relu(self.dense_layer_1(x))
        x = F.relu(self.dense_layer_2(x))
        return F.softmax(self.output(x),dim = 1)

class Critic(nn.Module):
    def __init__(self, state_size, hidden_size):
        super(Critic, self).__init__()
        self.dense_layer_1 = nn.Linear(state_size, hidden_size)
        self.dense_layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = torch.clamp(x,-1.1,1.1)
        x = F.relu(self.dense_layer_1(x))
        x = F.relu(self.dense_layer_2(x))
        return self.output(x)

class ActorCritic():
    def __init__(self, state_size, action_size, hidden_size, gamma = 0.99, learning_rate = 0.001):
        
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size
        self.gamma = gamma
        
        self.actor = Actor(state_size, action_size, hidden_size).to(device)
        self.critic = Critic(s_size, hidden_layer).to(device)
        self.optimizer = optim.Adam(self.actor.parameters(), lr = learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr = learning_rate)
    
    def train(self, states, rewards, actions):
        
        state_t = torch.FloatTensor(states).to(device)
        action_t = torch.LongTensor(actions).to(device).view(-1,1)
        return_t = torch.FloatTensor(calc_disc_return(rewards, gamma)).to(device).view(-1,1)
    
        vf_t = self.value_net(state_t).to(device)
        with torch.no_grad():
            advantage_t = return_t - vf_t
    
        selected_action_prob = self.policy_net(state_t).gather(1, action_t)
        loss = torch.mean(-torch.log(selected_action_prob) * advantage_t)
    
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    
        loss_fn = nn.MSELoss()
        vf_loss = loss_fn(vf_t, return_t)
        self.v_optimizer.zero_grad()
        vf_loss.backward()
        self.v_optimizer.step()

        grads = np.concatenate([p.grad.data.detach().cpu().numpy().flatten()
                                for p in self.policy_net.parameters()
                                if p.grad is not None])
        
        grad_l2 = np.sqrt(np.mean(np.square(grads)))
        grad_max = np.max(np.abs(grads))
    
        return loss.item(), grad_l2, grad_max
    
    def save(self, model_file):
        torch.save({
            'actor_dict': self.actor.state_dict(),
            'critic_dict': self.vcritic.state_dict(),
            }, model_file)
        
    def load(self, model_file):
        checkpoint = torch.load(model_file)
        self.policy_net.load_state_dict(checkpoint['actor_dict'])
        self.value_net.load_state_dict(checkpoint['critic_dict'])

In [5]:
hidden_layer = 64
gamma = 0.995
policy_lr = 0.001
value_lr = 0.001
episodes = 3 #100_000
avg_win_size = 50
epi_results = deque(maxlen=avg_win_size)

ac = ActorCritic(s_size, a_size, hidden_layer)

In [6]:
log_file_name = os.path.join('.','artefacts',f'{env_id}_policygradient_ac.csv')
model_file = os.path.join('.','models',f'{env_id}_policygradient_ac.pt')

In [7]:
train = True
'''
if train:
    log_file = open(log_file_name, "w")
    log_file.write(f'episode,rewards,loss,l2_grad,max_grad\n')
'''

for epi in range(episodes):

    if not train:
        print("set train flag to True for Training")
        break
        
    s = env.reset()[0]
    done , trunc = False, False
    states , nxt_states, rewards, actions, dones = [], [], [], [], []
    win = 0

    while not any([done, trunc]):

        states.append(s)
        obs = torch.FloatTensor(np.expand_dims(s,0)).to(device)

        with torch.no_grad():
            p_vals = ac.actor(obs)
            p_vals = torch.squeeze(p_vals)

        p_vals = p_vals.detach().cpu().numpy()
        a = np.random.choice(a_size, p=p_vals)

        s_, r, done ,trunc, _  = env.step(a)
        actions.append(a)
        rewards.append(r)
        dones.append(done)
        nxt_states.append(s_)
        
        s=np.copy(s_)

In [13]:
state_t = torch.FloatTensor(states).to(device)
action_t = torch.LongTensor(actions).to(device).view(-1,1)
return_t = torch.FloatTensor(calc_disc_return(rewards, gamma)).to(device).view(-1,1)

  state_t = torch.FloatTensor(states).to(device)


In [14]:
action_prob = ac.actor(state_t).gather(1, action_t)
#loss = torch.mean(-torch.log(action_prob) * return_t)

In [15]:
T = len(rewards) 
discounts = np.logspace(0, T, num=T, base=gamma, endpoint=False)

returns = np.array([np.sum(discounts[:T-t] * rewards[t:]) for t in range(T)])

In [16]:
return_t

tensor([[-261.2838],
        [-261.1110],
        [-260.6499],
        [-259.7742],
        [-258.5236],
        [-259.9392],
        [-261.2383],
        [-262.8831],
        [-264.5254],
        [-266.8300],
        [-265.9666],
        [-266.0929],
        [-265.3258],
        [-263.9409],
        [-265.4377],
        [-268.0513],
        [-266.6037],
        [-265.9030],
        [-265.1532],
        [-265.4020],
        [-265.9242],
        [-269.3294],
        [-268.9385],
        [-267.6998],
        [-266.1357],
        [-268.3803],
        [-271.0439],
        [-271.5903],
        [-270.0543],
        [-270.2497],
        [-268.6083],
        [-267.7786],
        [-268.0731],
        [-266.5521],
        [-264.8605],
        [-264.7093],
        [-264.5811],
        [-265.6229],
        [-266.0389],
        [-265.0164],
        [-263.7697],
        [-262.3778],
        [-261.6683],
        [-259.8918],
        [-258.2677],
        [-257.9777],
        [-256.3021],
        [-257

In [17]:
eps = np.finfo(np.float32).eps.item()

## eps is the smallest representable float, which is
# added to the standard deviation of the returns to avoid numerical instabilities
#returns = torch.tensor(returns)
return_t = (return_t - return_t.mean()) / (return_t.std() + eps)

In [18]:
return_t

tensor([[-0.5166],
        [-0.5118],
        [-0.4989],
        [-0.4745],
        [-0.4396],
        [-0.4791],
        [-0.5154],
        [-0.5613],
        [-0.6071],
        [-0.6714],
        [-0.6473],
        [-0.6509],
        [-0.6294],
        [-0.5908],
        [-0.6326],
        [-0.7055],
        [-0.6651],
        [-0.6456],
        [-0.6246],
        [-0.6316],
        [-0.6461],
        [-0.7412],
        [-0.7303],
        [-0.6957],
        [-0.6521],
        [-0.7147],
        [-0.7890],
        [-0.8043],
        [-0.7614],
        [-0.7669],
        [-0.7211],
        [-0.6979],
        [-0.7061],
        [-0.6637],
        [-0.6165],
        [-0.6122],
        [-0.6087],
        [-0.6377],
        [-0.6494],
        [-0.6208],
        [-0.5860],
        [-0.5472],
        [-0.5274],
        [-0.4778],
        [-0.4325],
        [-0.4244],
        [-0.3776],
        [-0.4210],
        [-0.4103],
        [-0.4888],
        [-0.4369],
        [-0.3767],
        [-0.

In [26]:
state_t = torch.FloatTensor(states).to(device)
action_t = torch.LongTensor(actions).to(device).view(-1,1)
return_t = torch.FloatTensor(calc_disc_return(rewards, gamma)).to(device).view(-1,1)
nxt_state_t = torch.FloatTensor(nxt_states).to(device)
done_t = torch.FloatTensor(dones).to(device).view(-1,1)

critic_t = ac.critic(state_t).view(-1,1)
with torch.no_grad():
    critic_td_t = return_t + done_t * ac.gamma * ac.critic(nxt_state_t).view(-1,1)
    advantage_t = critic_td_t - critic_t
    


In [31]:
selected_action_prob = ac.actor(state_t).gather(1, action_t)
actor_loss = torch.mean(-torch.log(selected_action_prob) * advantage_t)
ac.optimizer.zero_grad()
actor_loss.backward()
ac.optimizer.step() 


critic_loss = F.smooth_l1_loss(critic_t, critic_td_t)
ac.critic_optimizer.zero_grad()
critic_loss.backward()
ac.critic_optimizer.step() 

In [32]:
critic_loss

tensor(128.5814, device='cuda:0', grad_fn=<SmoothL1LossBackward0>)