In [1]:
# Import all the different libraries and modules needed
from torch.distributions import Categorical, Normal
import gymnasium
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Establish the discounted return rate.
gamma = .99

# Neural network estimating the policy function
class Pi(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(Pi, self).__init__()
        layers = [
            nn.Linear(in_dim, 250),
            nn.Tanh(),
            nn.Linear(250, 250),
            nn.Tanh(),
            nn.Linear(250, 250),
            nn.Tanh(),
            nn.Linear(250, 250),
            nn.Tanh(),
        ]
        self.meanLayer = nn.Linear(250, out_dim)
        self.sdLayer = nn.Linear(250, out_dim)
        self.model = nn.Sequential(*layers)
        self.onpolicy_reset()
        self.train()
        self.beta = .4
        self.eps = 1e-6
        self.sd = None
    def onpolicy_reset(self):
        self.log_probs = []
        self.rewards = []

    def forward(self, x):
        sharedValue = self.model(x)
        means = self.meanLayer(sharedValue)
        self.sd = torch.log( 1 + self.eps + torch.exp(self.sdLayer(sharedValue)))
        return means, self.sd

    def act(self, state):
        x = torch.from_numpy(state).float().to('cuda')
        mean, sd = self.forward(x)
        pd = Normal(loc=mean, scale=sd)
        action = pd.sample()
        log_prob = pd.log_prob(action).to('cuda')
        action = action.to('cpu').numpy()
        self.log_probs.append(log_prob)
        return action


In [3]:
def train(pi, optimizer):
    T = len(pi.rewards)
    rets = np.empty(T, dtype=np.float32)
    future_ret = 0.0
    for t in reversed(range(T)):
        future_ret = pi.rewards[t] + gamma * future_ret
        rets[t] = future_ret
    rets = torch.tensor(rets, dtype=torch.float32)
    log_probs = torch.stack(pi.log_probs).to('cuda')
    variances = torch.square(pi.sd)
    entropy = 0.5 * torch.sum(torch.log(2 * np.pi * np.e * variances)).mean()
    loss = 0.0
    for log_prob, rets in zip(log_probs, rets):
        loss += log_prob.mean() * rets * (-1)
    loss = (loss - pi.beta * entropy).to('cuda')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

In [4]:
def main():
    env = gymnasium.make('BipedalWalker-v3')
    in_dim = env.observation_space.shape[0]
    out_dim = env.action_space.high.size
    pi = Pi(in_dim, out_dim).to('cuda')
    optimizer = optim.AdamW(pi.parameters(), lr=.001)
    for epi in range(1000000):
        state = env.reset()[0]
        for t in range(5):
            action = pi.act(state)
            state, reward, done, _, _ = env.step(action)
            pi.rewards.append(reward)
            env.render()
            if done:
                break
        loss = train(pi, optimizer)
        total_reward = sum(pi.rewards)
        solved = total_reward > 195.0
        pi.onpolicy_reset()
        print(f'Episode {epi}, loss: {loss}, '
              f'total_reward: {total_reward}, solve: {solved}')

if __name__ == '__main__':
    main()

  gym.logger.warn(


Episode 0, loss: -3.1905741691589355, total_reward: -0.41883498430252075, solve: False
Episode 1, loss: -4.227116107940674, total_reward: -0.7810536623001099, solve: False
Episode 2, loss: -4.3323564529418945, total_reward: -0.756112813949585, solve: False
Episode 3, loss: -3.9907279014587402, total_reward: -0.43758004903793335, solve: False
Episode 4, loss: -9.38312816619873, total_reward: -1.4829272031784058, solve: False
Episode 5, loss: -5.560580730438232, total_reward: -0.6501624584197998, solve: False
Episode 6, loss: -6.069578170776367, total_reward: -0.9007478356361389, solve: False
Episode 7, loss: -6.749181747436523, total_reward: -0.9461935758590698, solve: False
Episode 8, loss: -5.970715522766113, total_reward: -0.6407614350318909, solve: False
Episode 9, loss: -8.816635131835938, total_reward: -0.9528779983520508, solve: False
Episode 10, loss: -11.078393936157227, total_reward: -1.2052435874938965, solve: False
Episode 11, loss: -10.966824531555176, total_reward: -1.3313

ValueError: Expected parameter loc (Tensor of shape (4,)) of distribution Normal(loc: torch.Size([4]), scale: torch.Size([4])) to satisfy the constraint Real(), but found invalid values:
tensor([nan, nan, nan, nan], device='cuda:0', grad_fn=<ViewBackward0>)