We will begin by importing necessary packages and defining the policy network that the agent will use to decide its actions:

In [None]:
### In progress

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class PolicyNetwork(nn.Module):
    """Fully connected network that maps an input vector to raw output scores.

    The input passes through two hidden layers of 24 units, each followed by
    a ReLU, and then through a final linear layer. No activation is applied
    to the final layer's output.
    """

    def __init__(self, n_inputs, n_outputs):
        """Build the three linear layers.

        Args:
            n_inputs: Size of the input feature vector.
            n_outputs: Number of scores produced by the final layer.
        """
        super().__init__()
        hidden_units = 24
        # Attribute names and creation order are kept so state_dict keys and
        # seeded weight initialisation stay the same.
        self.fc1 = nn.Linear(n_inputs, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, n_outputs)

    def forward(self, x):
        """Return the final layer's raw scores for input ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

class PolicyGradientAgent:
    """Policy-gradient agent that learns from complete episodes.

    Actions are sampled from a softmax over the policy network's output
    scores. After an episode, the log-probabilities of the chosen actions are
    weighted by the discounted returns and used as the loss for one optimizer
    step.
    """

    def __init__(self, n_inputs, n_outputs):
        """Create the policy network and its optimizer.

        Args:
            n_inputs: Size of the state vector fed to the network.
            n_outputs: Number of discrete actions.
        """
        self.policy_network = PolicyNetwork(n_inputs, n_outputs)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=0.01)
        self.gamma = 0.99  # per-step discount factor used in update_policy

    def get_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: 1-D array-like of state features (NumPy array, list, ...).

        Returns:
            A tuple ``(action, log_prob)``: the sampled action index and a
            0-dim tensor holding the log-probability of that action, still
            attached to the autograd graph.
        """
        # as_tensor accepts anything array-like, not only NumPy arrays.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        action_scores = self.policy_network(state)
        # log_softmax computes the log-probabilities directly, which is more
        # accurate than taking log() of an already-rounded softmax output.
        log_action_probs = torch.log_softmax(action_scores, dim=1)[0]
        action_probs = log_action_probs.detach().exp().numpy()
        action = np.random.choice(len(action_probs), p=action_probs)
        return action, log_action_probs[action]

    def update_policy(self, rewards, log_probs):
        """Run one gradient step from a finished episode.

        Args:
            rewards: Sequence of per-step rewards, in time order.
            log_probs: Sequence of 0-dim tensors returned by ``get_action``
                for the same steps.

        An empty episode is ignored. If the two sequences differ in length,
        only the leading steps present in both contribute to the loss.
        """
        if len(rewards) == 0 or len(log_probs) == 0:
            # Nothing to learn from; stacking an empty list would raise.
            return

        # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards
        # so the whole episode costs O(n) rather than O(n^2).
        discounted_rewards = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + self.gamma * running_return
            discounted_rewards.append(running_return)
        discounted_rewards.reverse()

        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
        if discounted_rewards.numel() > 1:
            # Normalize discounted rewards. Skipped for a single step because
            # the sample standard deviation of one value is NaN, which would
            # turn every network weight into NaN on the optimizer step.
            discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (
                discounted_rewards.std() + 1e-9
            )

        policy_gradient = torch.stack(
            [-log_prob * Gt for log_prob, Gt in zip(log_probs, discounted_rewards)]
        ).sum()

        self.optimizer.zero_grad()
        policy_gradient.backward()
        self.optimizer.step()


In [None]:
def main():
    """Train a PolicyGradientAgent on CartPole-v0 and print each episode's reward.

    Runs 1000 episodes. Each episode is played to completion with actions
    sampled from the agent, after which the agent is updated once from the
    collected rewards and log-probabilities. The environment is closed when
    training ends or an error interrupts it.
    """
    env = gym.make("CartPole-v0")
    agent = PolicyGradientAgent(env.observation_space.shape[0], env.action_space.n)
    n_episodes = 1000

    try:
        for episode in range(n_episodes):
            reset_result = env.reset()
            # gym >= 0.26 returns (observation, info); older versions return
            # the observation alone. CartPole observations are arrays, so a
            # tuple here can only be the newer return shape.
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            log_probs = []
            rewards = []
            done = False

            while not done:
                action, log_prob = agent.get_action(state)
                step_result = env.step(action)
                if len(step_result) == 5:
                    # gym >= 0.26: (obs, reward, terminated, truncated, info)
                    state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    # older gym: (obs, reward, done, info)
                    state, reward, done, _ = step_result
                log_probs.append(log_prob)
                rewards.append(reward)

            # The episode is over: learn from it, then report its total reward.
            agent.update_policy(rewards, log_probs)
            episode_reward = sum(rewards)
            print(f"Episode {episode}: {episode_reward}")
    finally:
        # Release the environment's resources even if training is interrupted.
        env.close()

# Start training only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Episode 0: 14.0
Episode 1: 15.0
Episode 2: 17.0
Episode 3: 21.0
Episode 4: 18.0
Episode 5: 16.0
Episode 6: 19.0
Episode 7: 11.0
Episode 8: 9.0
Episode 9: 13.0
Episode 10: 15.0
Episode 11: 14.0
Episode 12: 18.0
Episode 13: 11.0
Episode 14: 22.0
Episode 15: 25.0
Episode 16: 14.0
Episode 17: 14.0
Episode 18: 17.0
Episode 19: 74.0
Episode 20: 14.0
Episode 21: 42.0
Episode 22: 29.0
Episode 23: 14.0
Episode 24: 12.0
Episode 25: 12.0
Episode 26: 13.0
Episode 27: 10.0
Episode 28: 22.0
Episode 29: 25.0
Episode 30: 16.0
Episode 31: 28.0
Episode 32: 33.0
Episode 33: 15.0
Episode 34: 26.0
Episode 35: 18.0
Episode 36: 38.0
Episode 37: 39.0
Episode 38: 34.0
Episode 39: 17.0
Episode 40: 14.0
Episode 41: 18.0
Episode 42: 23.0
Episode 43: 26.0
Episode 44: 13.0
Episode 45: 77.0
Episode 46: 55.0
Episode 47: 15.0
Episode 48: 16.0
Episode 49: 14.0
Episode 50: 18.0
Episode 51: 43.0
Episode 52: 12.0
Episode 53: 14.0
Episode 54: 129.0
Episode 55: 17.0
Episode 56: 93.0
Episode 57: 61.0
Episode 58: 25.0
Episode