In [1]:
# Import all the different libraries and modules needed
from torch.distributions import Categorical, Normal
import gymnasium
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Establish the discounted return rate.
gamma = .99

# Neural network estimating the policy function
class Pi(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(Pi, self).__init__()
        layers = [
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
        ]
        self.meanLayer = nn.Linear(128, out_dim)
        self.model = nn.Sequential(*layers)
        self.onpolicy_reset()
        self.train()
    def onpolicy_reset(self):
        self.log_probs = []
        self.rewards = []

    def forward(self, x):
        pdparam = self.model(x)
        return pdparam

    def act(self, state):
        x = torch.from_numpy(state).to('cuda')
        pdparam = self.forward(x).to('cuda')
        pd = Normal(loc=pdparam[0], scale=pdparam[1])
        action = (pd.sample(),pd.sample(),pd.sample(),pd.sample())
        log_prob = pd.log_prob(action).to('cuda')
        self.log_probs.append(log_prob)
        return action.item()


In [3]:
def train(pi, optimizer):
    T = len(pi.rewards)
    rets = np.empty(T, dtype=np.float32)
    future_ret = 0.0
    for t in reversed(range(T)):
        future_ret = pi.rewards[t] + gamma * future_ret
        rets[t] = future_ret
    rets = torch.tensor(rets, dtype=torch.float32)
    log_probs = torch.stack(pi.log_probs).to('cuda')
    loss = -log_probs*rets.to('cuda')
    loss = torch.sum(loss).to('cuda')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

In [4]:
def main():
    env = gymnasium.make('BipedalWalker-v3')
    in_dim = env.observation_space.shape[0]
    out_dim = 2
    pi = Pi(in_dim, out_dim).to('cuda')
    optimizer = optim.Adam(pi.parameters(), lr=.01)
    for epi in range(300):
        state = env.reset()[0]
        for t in range(500):
            action = pi.act(state)
            state, reward, done, _, _ = env.step(action)
            pi.rewards.append(reward)
            env.render()
            if done:
                break
        loss = train(pi, optimizer)
        total_reward = sum(pi.rewards)
        solved = total_reward > 195.0
        pi.onpolicy_reset()
        print(f'Episode {epi}, loss: {loss}, '
              f'total_reward: {total_reward}, solve: {solved}')

if __name__ == '__main__':
    main()

ValueError: Expected parameter scale (Tensor of shape ()) of distribution Normal(loc: -0.03613238409161568, scale: -0.049705736339092255) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:
-0.049705736339092255