[Hands on RL Policy Gradient](https://github.com/PacktPublishing/Hands-on-Reinforcement-Learning-with-PyTorch/blob/master/Section%204/4.3%20Policy%20Gradients%20REINFORCE.ipynb)<br>
[Policy Gradient Math](https://towardsdatascience.com/policy-gradients-in-reinforcement-learning-explained-ecec7df94245)<br>
[Vanilla Policy Gradient](https://spinningup.openai.com/en/latest/algorithms/vpg.html)<br>
[RL by Phil Tabor](https://github.com/philtabor/Youtube-Code-Repository/tree/master/ReinforcementLearning)

In [1]:
# One-time environment setup, left disabled. Uncomment to install swig and the
# gymnasium Box2D extras (presumably required by the LunarLander environment
# created below — confirm against your installation).
#!pip install swig
#!pip install gymnasium[box2d]

In [2]:
import os

import torch
from torch import cuda, device, distributions
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from torch.distributions import Categorical

from collections import deque

import gymnasium as gym
import os, random
from pathlib import Path
from collections import deque

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

In [3]:
# Build the training environment without rendering; add render_mode="human"
# to gym.make(...) to watch the episodes instead.
env_id = "LunarLander-v2"
env = gym.make(env_id)

# The policy network is sized from the environment's spaces:
# one output per discrete action, one input per observation component.
a_size = env.action_space.n
s_size = env.observation_space.shape[0]

# Quick sanity printout of the observation space.
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample())  # one random draw from the space

_____OBSERVATION SPACE_____ 

The State Space is:  8
Sample observation [ 0.9462635  -0.229104    2.8404567  -4.154978    2.1169014   0.6737268
  0.83467776  0.4847395 ]


In [4]:
# Select the compute device used by every `.to(device)` call in the notebook.
# The constructor and the CUDA check are reached through the `torch` module on
# purpose: the previous `device = device(...)` rebound the imported `device`
# constructor to an instance, so running this cell a second time failed with
# "TypeError: 'torch.device' object is not callable".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
def calc_disc_return(r_t , gamma = 0.998):
    """Compute the discounted return-to-go for every step of one episode.

    For rewards r_0 .. r_{T-1} the result satisfies
    G_{T-1} = r_{T-1} and G_t = r_t + gamma * G_{t+1}.

    Args:
        r_t: Sequence (list or 1-D array) of per-step rewards, in time order.
        gamma: Discount factor applied to the return of the following step.

    Returns:
        A float64 numpy array of the same length as ``r_t`` holding G_t for
        each step. An empty input yields an empty array (the previous
        implementation raised IndexError on ``r_t[-1]``).
    """
    n_steps = len(r_t)
    G_t = np.zeros(n_steps, dtype=np.float64)

    # Walk the episode backwards so each return reuses the one after it.
    # Starting from 0.0 makes the last entry equal to the last reward.
    running = 0.0
    for t in range(n_steps - 1, -1, -1):
        running = r_t[t] + gamma * running
        G_t[t] = running

    return G_t

class PolicyNet(nn.Module):
    """Two-hidden-layer MLP that maps a state to action probabilities.

    Args:
        state_size: Number of input features per state.
        action_size: Number of discrete actions (size of the output).
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_size, action_size, hidden_size):
        super().__init__()
        self.dense_layer_1 = nn.Linear(state_size, hidden_size)
        self.dense_layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return a probability distribution over actions for each state.

        Args:
            x: Float tensor whose last dimension has ``state_size`` entries;
               any leading batch dimensions (including none) are allowed.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``action_size`` that sums to 1.
        """
        # Clip every input feature to [-1.1, 1.1] before the first layer.
        x = torch.clamp(x, -1.1, 1.1)
        x = F.relu(self.dense_layer_1(x))
        x = F.relu(self.dense_layer_2(x))
        # Normalise over the action axis. dim=-1 equals the former dim=1 for
        # (batch, state) inputs, and also works for a single unbatched state.
        return F.softmax(self.output(x), dim=-1)

In [13]:
# --- Hyperparameters ---------------------------------------------------------
seed = 0                # RNG seed for weight initialisation and action sampling
hidden_layer = 64       # width of both hidden layers of PolicyNet
gamma = 0.99            # discount factor passed to calc_disc_return
learning_rate = 0.001   # Adam step size
episodes = 100_000      # upper bound on the number of training episodes
avg_win_size = 50       # number of most recent episodes kept for the moving average

# Seed the generators this notebook draws from: numpy's global RNG (used by
# np.random.choice when sampling actions) and torch (used for layer
# initialisation). The environment itself is not seeded here, so runs are
# only partially reproducible.
np.random.seed(seed)
torch.manual_seed(seed)

# Rolling window of the latest episode returns.
epi_results = deque(maxlen=avg_win_size)

policy_net = PolicyNet(s_size, a_size, hidden_layer).to(device)
optimizer = optim.Adam(policy_net.parameters(), lr = learning_rate)

In [14]:
# Per-episode training metrics are written to a CSV file. Opening with "w"
# truncates any log left over from a previous run.
log_file_name = os.path.join('.','artefacts',f'{env_id}_policygradient.csv')
# Create the target directory first; open() raises FileNotFoundError when the
# directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(log_file_name) or '.', exist_ok=True)
# The handle stays open across cells; the training cell writes to it and closes it.
log_file = open(log_file_name, "w")
log_file.write('episode,loss,rewards,l2_grad,max_grad\n')

# Destination of the trained policy network.
model_file = os.path.join('.','models',f'{env_id}_policygradient.pt')

In [15]:
# REINFORCE training: roll out one full episode with the current policy, then
# take a single gradient step on mean(-log pi(a_t|s_t) * G_t).
# The log file is closed in `finally` so it is flushed and released even when
# training is interrupted (e.g. kernel interrupt) or an error is raised.
try:
    for epi in range(episodes):

        # ---- collect one episode ----
        s = env.reset()[0]
        term , trunc = False, False
        rewards, states , actions = [], [], []

        while not (term or trunc):

            states.append(s)
            obs = torch.FloatTensor(np.expand_dims(s,0)).to(device)

            # Action probabilities for the current state; no graph is needed
            # here because the update below recomputes them for the whole batch.
            with torch.no_grad():
                p_vals = torch.squeeze(policy_net(obs))

            p_vals = p_vals.cpu().numpy()
            a = np.random.choice(a_size, p=p_vals)

            s_, r, term ,trunc, _  = env.step(a)
            actions.append(a)
            rewards.append(r)
            s=np.copy(s_)

        # ---- build the training batch ----
        # Stack the states into one ndarray first: constructing a tensor from a
        # Python list of arrays is slow and triggers a PyTorch warning.
        state_t = torch.FloatTensor(np.array(states)).to(device)
        action_t = torch.LongTensor(actions).to(device).view(-1,1)
        return_t = torch.FloatTensor(calc_disc_return(rewards, gamma)).to(device).view(-1,1)
        episode_reward = np.sum(rewards)
        epi_results.append(episode_reward)

        # ---- policy-gradient step ----
        selected_action_prob = policy_net(state_t).gather(1, action_t)
        loss = torch.mean(-torch.log(selected_action_prob) * return_t)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # ---- gradient diagnostics ----
        grads = np.concatenate([p.grad.cpu().numpy().ravel()
                                for p in policy_net.parameters()
                                if p.grad is not None])

        # Note: this is the root-mean-square of all gradient entries; it is
        # logged under the "l2_grad" column.
        grad_l2 = np.sqrt(np.mean(np.square(grads)))
        grad_max = np.max(np.abs(grads))

        log_file.write(f'{epi},{loss.item():.2f},{episode_reward:.2f},{grad_l2:.4f},{grad_max:.4f}\n')

        mean_reward = np.mean(epi_results)

        if epi%100==0:
            clear_output()
        if epi%10==0:
            print(f'epi:{epi:05d} reward:{episode_reward:8.2f} loss:{loss.item():8.2f} mean_rewards:{mean_reward:8.2f}')

        # Stop once the moving average exceeds 200 (presumably the "solved"
        # score for this environment). Require a full window so that a few
        # lucky early episodes cannot end training prematurely.
        if len(epi_results) == epi_results.maxlen and mean_reward > 200:
            break
finally:
    log_file.close()

epi:06200 reward:   92.42 loss:    4.02 mean_rewards:  167.83
epi:06210 reward:  253.87 loss:   32.09 mean_rewards:  181.06
epi:06220 reward:  210.94 loss:   11.78 mean_rewards:  195.84


In [19]:
# Persist the trained policy. The whole module object is saved (not just a
# state_dict), which is what the torch.load call in the next cell expects.
# Make sure the target directory exists first; torch.save fails otherwise.
os.makedirs(os.path.dirname(model_file) or '.', exist_ok=True)
torch.save(policy_net, model_file)

In [20]:
# Reload the pickled policy network.
# - map_location moves the weights onto the device selected above, so a model
#   trained on GPU still loads on a CPU-only machine.
# - weights_only=False is required because the file holds a full nn.Module,
#   not a state_dict; recent PyTorch releases default to weights_only=True and
#   would refuse to unpickle it.
# SECURITY: unpickling executes arbitrary code; only load files you created
# yourself or otherwise trust.
saved_model = torch.load(model_file, map_location=device, weights_only=False)

In [21]:
# Evaluate the saved policy greedily (argmax action) for 10 rendered episodes.
eval_env = gym.make(env_id,render_mode="human")

# Close the rendering environment in `finally` so it is released even if an
# episode raises or the cell is interrupted.
try:
    for epi in range(10):

        s = eval_env.reset()[0]
        term , trunc = False, False
        score = 0

        while not (term or trunc):

            obs = torch.FloatTensor(np.expand_dims(s,0)).to(device)

            with torch.no_grad():
                p_vals = torch.squeeze(saved_model(obs))

            # Greedy action: pick the most probable one instead of sampling.
            a = np.argmax(p_vals.cpu().numpy())
            s, r, term ,trunc , _  = eval_env.step(a)
            score+=r

            # The episode is cut short as soon as the cumulative score reaches
            # 200, so the printed result is the score at that point, not
            # necessarily the return of a completed episode.
            if score >=200:
                break

        print(f'{epi = } result {score:4.2f}')
finally:
    eval_env.close()

epi = 0 result 109.03
epi = 1 result 200.57
epi = 2 result 123.43
epi = 3 result 87.41
epi = 4 result 136.71
epi = 5 result 229.54
epi = 6 result 223.78
epi = 7 result 223.66
epi = 8 result 97.88
epi = 9 result 243.42
