# REINFORCE: Monte-Carlo Policy-Gradient Control (episodic)
From Chapter 13, Policy Gradient Methods (Sutton and Barto, 2018) **[1]**

## Import Packages

In [1]:
import gymnasium as gym
import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## CartPole Problem

In [2]:
env = gym.make('CartPole-v1')

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print("_____Observation Space______")
print("State space size: ", state_size)
print("Sample observation: ", env.observation_space.sample())

print("_____Action Space______")
print("Action space size: ", action_size)
print("Sample action: ", env.action_space.sample())

_____Observation Space______
State space size:  4
Sample observation:  [ 2.1195223e+00 -3.2292586e+38  4.1371867e-01 -3.1036727e+38]
_____Action Space______
Action space size:  2
Sample action:  1


### Define an Agent (Parametrized Policy)
The policy must be differentiable. Here we define a linear policy.

In [3]:
class LinearPolicyAgent():
    def __init__(self, state_size, action_size, alpha=0.0001, gamma=1):
        self.alpha = alpha # learning rate
        self.gamma = gamma # discount factor

        self.reset()

    def reset(self):
        self.weights = torch.rand((action_size, state_size), device=device)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        probabilities = torch.matmul(self.weights, state.T) # compute action preferences eq 13.3
        policy = torch.functional.F.softmax(probabilities, dim=0).T # softmax in action preferences eq 13.2

        action_idx = torch.multinomial(policy, 1).item() # sample action from policy

        return action_idx
    
    def update_policy(self, weights):
        self.weights = weights

# test the policy
debug_policy = LinearPolicyAgent(state_size, action_size)
debug_policy.act(env.reset()[0])

0

### Define the Learning Algorithm

In [4]:
def reinforce(environment, agent, n_episodes, max_steps):
    total_rewards = np.zeros(n_episodes)

    for i in range(n_episodes):
        rewards_history = []
        action_history = []
        state_history = []

        state = env.reset()[0]

        # Generate an episode S0, A0, R1, ..., ST-1, AT-1, RT, following pi(.|., theta)
        for j in range(max_steps):

            action_idx = agent.act(state)

            next_state, reward, done, _, _ = env.step(action_idx)

            action_history.append(action_idx)
            state_history.append(state)
            rewards_history.append(reward)

            state = next_state

            if done:
                break

        total_rewards[i] = sum(rewards_history)

        # Update policy parameters, eq. 13.8
        for t in range(len(state_history))[::-1]: # exclude the goal state from the update
            state = torch.from_numpy(state_history[t]).float().unsqueeze(0).to(device)
            action_idx = action_history[t]

            # compute the return following time t
            gt = agent.gamma ** np.arange(len(rewards_history[t:])) * np.array(rewards_history[t:])
            gt = agent.gamma ** t * np.sum(gt)

            # compute the gradient of the log policy
            probabilities = torch.matmul(agent.weights, state.T) # compute action preferences eq 13.3
            policy = torch.functional.F.softmax(probabilities, dim=0) # softmax in action preferences eq 13.2
            current_features = state[:, action_idx]
            grad_log_pi = current_features - torch.dot(policy[action_idx], current_features)

            # update the policy parameters
            agent.update_policy(agent.weights + agent.alpha * agent.gamma ** t * gt * grad_log_pi)

    return total_rewards

### Train

In [6]:
linear_policy_agent = LinearPolicyAgent(state_size, action_size, alpha=0.01, gamma=1)
scores = reinforce(env, linear_policy_agent, n_episodes=1000, max_steps=1000)

print("Average score over 1000 episodes: {}".format(np.mean(scores)))

Average score over 1000 episodes: 34.219


## References

- **[1]** R. S. Sutton and A. G. Barto, Reinforcement learning: an introduction, Second edition. in Adaptive computation and machine learning series. Cambridge, Massachusetts: The MIT Press, 2018.