In [1]:
# Import all the different libraries and modules needed
from torch.distributions import Categorical
import gymnasium
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Discount factor applied to future rewards when accumulating returns.
gamma = 0.99

# Neural network estimating the policy function
class Pi(nn.Module):
    """Policy network: maps a state vector to logits over discrete actions.

    The network is a single hidden layer MLP (in_dim -> 128 -> out_dim).
    Besides the model, the instance buffers the log-probabilities of the
    sampled actions and the rewards of the current episode; both buffers
    are cleared by ``onpolicy_reset``.

    Args:
        in_dim: Size of the input (state) vector.
        out_dim: Number of output logits (one per action).
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        layers = [
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Linear(128, out_dim),
        ]
        self.model = nn.Sequential(*layers)
        self.onpolicy_reset()
        self.train()  # put the module in training mode

    def onpolicy_reset(self):
        """Discard the buffered log-probabilities and rewards."""
        self.log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return the action logits produced by the model for input ``x``."""
        pdparam = self.model(x)
        return pdparam

    def act(self, state):
        """Sample an action for ``state`` and record its log-probability.

        Args:
            state: Array-like observation (e.g. a NumPy array).

        Returns:
            The sampled action as a Python number (``Tensor.item()``).
        """
        # Follow whatever device/dtype the model currently lives on instead
        # of hard-coding 'cuda', so the policy also runs on CPU-only hosts
        # and accepts float64 observations.
        param = next(self.parameters())
        x = torch.as_tensor(state, dtype=param.dtype, device=param.device)
        pdparam = self.forward(x)
        pd = Categorical(logits=pdparam)
        action = pd.sample()
        # Kept attached to the graph: the training step differentiates
        # through these stored log-probabilities.
        self.log_probs.append(pd.log_prob(action))
        return action.item()


In [3]:
def train(pi, optimizer, discount=None):
    """Run one policy-gradient update from the episode buffered on ``pi``.

    Discounted returns are computed backwards over ``pi.rewards``; the loss
    is the negated sum of ``log_prob * return`` over the episode's steps.

    Args:
        pi: Object exposing ``rewards`` (sequence of numbers) and
            ``log_probs`` (sequence of tensors of equal length).
        optimizer: Optimizer whose ``zero_grad``/``step`` are called around
            the backward pass.
        discount: Discount factor for future rewards. Defaults to the
            module-level ``gamma`` when omitted.

    Returns:
        The scalar loss tensor of this update.
    """
    if discount is None:
        discount = gamma
    T = len(pi.rewards)
    rets = np.empty(T, dtype=np.float32)
    future_ret = 0.0
    # Walk the episode backwards so each return reuses the one after it.
    for t in reversed(range(T)):
        future_ret = pi.rewards[t] + discount * future_ret
        rets[t] = future_ret
    log_probs = torch.stack(pi.log_probs)
    # Place the returns wherever the log-probs already live rather than
    # forcing 'cuda', so the update also works on CPU-only hosts.
    rets = torch.as_tensor(rets, dtype=log_probs.dtype, device=log_probs.device)
    loss = torch.sum(-log_probs * rets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

In [4]:
def main(render=False):
    """Train a ``Pi`` policy on CartPole-v1 for 300 episodes.

    Each episode runs for at most 500 steps, after which ``train`` performs
    one update and the episode buffers on the policy are reset. A summary
    line is printed per episode.

    Args:
        render: When true, the environment is created with
            ``render_mode='human'`` so it draws itself while stepping.
            The default keeps rendering off.
    """
    # The environment has to be told its render mode at creation time;
    # calling env.render() on an env made without one only logs a warning.
    env = gymnasium.make('CartPole-v1',
                         render_mode='human' if render else None)
    # Use the GPU when present, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    try:
        in_dim = env.observation_space.shape[0]
        out_dim = env.action_space.n
        pi = Pi(in_dim, out_dim).to(device)
        optimizer = optim.Adam(pi.parameters(), lr=.01)
        for epi in range(300):
            state, _ = env.reset()
            for _ in range(500):
                action = pi.act(state)
                state, reward, terminated, truncated, _ = env.step(action)
                pi.rewards.append(reward)
                # step() reports two distinct end-of-episode flags; either
                # one means the episode is over.
                if terminated or truncated:
                    break
            loss = train(pi, optimizer)
            total_reward = sum(pi.rewards)
            # NOTE(review): 195.0 is kept from the original code — confirm it
            # is the intended "solved" threshold for CartPole-v1.
            solved = total_reward > 195.0
            pi.onpolicy_reset()
            print(f'Episode {epi}, loss: {loss}, '
                  f'total_reward: {total_reward}, solve: {solved}')
    finally:
        # Release the environment's resources even if training fails.
        env.close()


if __name__ == '__main__':
    main()

  gym.logger.warn(


Episode 0, loss: 78.89933013916016, total_reward: 15.0, solve: False
Episode 1, loss: 19.2451229095459, total_reward: 9.0, solve: False
Episode 2, loss: 174.44635009765625, total_reward: 22.0, solve: False
Episode 3, loss: 90.367919921875, total_reward: 17.0, solve: False
Episode 4, loss: 33.669307708740234, total_reward: 11.0, solve: False
Episode 5, loss: 194.5782928466797, total_reward: 22.0, solve: False
Episode 6, loss: 103.8862533569336, total_reward: 16.0, solve: False
Episode 7, loss: 99.87071990966797, total_reward: 18.0, solve: False
Episode 8, loss: 42.51801300048828, total_reward: 12.0, solve: False
Episode 9, loss: 76.96240234375, total_reward: 15.0, solve: False
Episode 10, loss: 43.29662322998047, total_reward: 11.0, solve: False
Episode 11, loss: 18.956520080566406, total_reward: 8.0, solve: False
Episode 12, loss: 233.75511169433594, total_reward: 27.0, solve: False
Episode 13, loss: 215.53536987304688, total_reward: 25.0, solve: False
Episode 14, loss: 41.859970092773