[Hands on RL Policy Gradient](https://github.com/PacktPublishing/Hands-on-Reinforcement-Learning-with-PyTorch/blob/master/Section%204/4.3%20Policy%20Gradients%20REINFORCE.ipynb)

[Policy Gradient Math](https://towardsdatascience.com/policy-gradients-in-reinforcement-learning-explained-ecec7df94245)

A widely used variation of REINFORCE is to subtract a baseline value from the return to reduce the variance of gradient estimation while keeping the bias unchanged (Remember we always want to do this when possible). For example, a common baseline is to subtract state-value from action-value, and if applied, we would use advantage:

$$
A(s,a) = Q(s,a) - V(s)
$$

in the gradient ascent update. This [post](https://danieltakeshi.github.io/2017/03/28/going-deeper-into-reinforcement-learning-fundamentals-of-policy-gradients/) nicely explained why a baseline works for reducing the variance, in addition to a set of fundamentals of policy gradient.

In [None]:
#!pip install swig
#!pip install gymnasium[box2d]

## Actor Critic

![Reinforce_bl](acritic.png) 

In [8]:
import torch as T
from torch import cuda, device, distributions
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from torch.distributions import Categorical
import math

import gymnasium as gym
import os, random
from pathlib import Path
from collections import deque

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from IPython.display import clear_output

In [11]:
env_id = "CartPole-v1"
env = gym.make(env_id)#,render_mode="human")

s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("The Action Space is: ", a_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  4
The Action Space is:  2
Sample observation [-4.4889826e-01 -3.4762937e+37  1.8709894e-01  6.2282093e+37]


In [12]:
device = device("cuda:0" if cuda.is_available() else "cpu")

In [13]:
hidden_layer = 64
gamma = 0.995
actor_lr = 0.0001
critic_lr = 0.0001
episodes = 100_000
avg_win_size = 50
epi_results = deque(maxlen=avg_win_size)

#ac = ActorCritic(s_size, a_size, hidden_layer, actor_lr, critic_lr , gamma = gamma)

In [30]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class ReplayBuffer():
    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                    dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                        dtype=np.float32)
        self.log_probs = np.zeros(self.mem_size, dtype=np.float32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8)

    def store_transition(self, state, log_prob, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.log_probs[index] = log_prob
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        probs = self.log_probs[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, probs, rewards, states_, terminal

class ActorCriticNetwork(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims,
                 n_actions):
        super(ActorCriticNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.pi = nn.Linear(self.fc2_dims, n_actions)
        self.v = nn.Linear(self.fc2_dims, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cuda:1')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = self.pi(x)
        v = self.v(x)
        return (pi, v)

class Agent():
    def __init__(self, lr, input_dims, n_actions, gamma=0.99,
                 l1_size=32, l2_size=32, batch_size=32,
                 mem_size=1000000):
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.actor_critic = ActorCriticNetwork(lr, input_dims, l1_size,
                                    l2_size, n_actions=n_actions)
        self.log_probs = []

    def store_transition(self, state, prob, reward, state_, done):
        self.memory.store_transition(state, prob, reward, state_, done)

    def choose_action(self, observation):
       
        state = T.tensor([observation]).to(self.actor_critic.device)
        probabilities, _ = self.actor_critic.forward(state)
        probabilities = F.softmax(probabilities)
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        log_probs = action_probs.log_prob(action)

        return action.item(), log_probs

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.actor_critic.optimizer.zero_grad()

        state, prob, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.actor_critic.device)
        probs = T.tensor(prob).to(self.actor_critic.device)
        rewards = T.tensor(reward).to(self.actor_critic.device)
        dones = T.tensor(done).to(self.actor_critic.device)
        states_ = T.tensor(new_state).to(self.actor_critic.device)

        _, critic_value_ = self.actor_critic.forward(states_)
        _, critic_value = self.actor_critic.forward(states)

        critic_value_[dones] = 0.0

        delta = rewards + self.gamma*critic_value_

        actor_loss = -T.mean(probs*(delta-critic_value))
        critic_loss = F.mse_loss(delta, critic_value)

        (actor_loss + critic_loss).backward()

        self.actor_critic.optimizer.step()

In [39]:
env = gym.make('LunarLander-v2')
num_games = 1500    
agent = Agent(lr=1e-5, input_dims=[s_size], n_actions=a_size)

In [40]:
env = gym.make('CartPole-v1')
num_games = 100_000
scores = []

for i in range(num_games):
    done = False
    observation = env.reset()[0]
    score = 0

    while not done:
        
        action, prob = agent.choose_action(observation)
        observation_, reward, done, info , _ = env.step(action)
        score += reward
        agent.store_transition(observation, prob,
                                reward, observation_, int(done))
        agent.learn()
        observation = observation_
        if score >500:
            done = True
        
    scores.append(score)
    avg_score = np.mean(scores[max(0, i-50):(i+1)])
    '''
    print('episode: ', i,'score %.1f ' % score,
            ' average score %.1f' % avg_score)
    '''

    if i%100==0:
        clear_output()
    if i%10==0:
        print(f'epi:{i:05d} reward:{score:8.2f} avg_rewards:{avg_score:8.2f}')

epi:16000 reward:   17.00 avg_rewards:   17.51
epi:16010 reward:   11.00 avg_rewards:   16.27
epi:16020 reward:   28.00 avg_rewards:   16.96
epi:16030 reward:   15.00 avg_rewards:   17.14


KeyboardInterrupt: 

In [29]:
env.step(1)

(array([ 0.04635795,  0.23186207,  0.02870866, -0.27556905], dtype=float32),
 1.0,
 False,
 False,
 {})