# TODO:

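* Use the whole episode as a single batch when learning (one optimizer step per episode instead of one step per time step)?
* Try training without gradient clipping (`grad_clip_value=None`).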

In [1]:
import sys
sys.path.append("..")
import models
import policy_gradient_methods
import utils
import gymnasium as gym
import torch
import random

# Settings

In [2]:
# Gymnasium environment id handed to gym.make() in the training cells.
ENVIRONMENT_ID = "CartPole-v1"
# Number of episodes played by each training run.
NUM_EPISODES = 10000
# Passed as grad_clip_value to the training function, which clips gradients
# element-wise with torch.nn.utils.clip_grad_value_.
GRAD_CLIP_VALUE = 100
# Discount factor used when accumulating returns during learning.
GAMMA = 0.9
# Passed to models.create_simple_nn; presumably the widths of the hidden
# layers -- verify against the models module.
NN_HIDDEN_LAYER_SIZES = [16, 16]
# Seed for torch, Python's random module and the first env.reset() of a training run.
RNG_SEED = 7
# Uncomment the next line (and drop the CPU line) to use a GPU when one is available.
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print("PyTorch device:", device)
# Seed the global torch and Python random generators so runs are repeatable.
torch.manual_seed(RNG_SEED)
random.seed(RNG_SEED)

PyTorch device: cpu


# REINFORCE

## Linear model

In [5]:
def select_action_softmax(model, state):
    """Sample an action index from the probabilities produced by ``model(state)``.

    Args:
        model: Callable returning a tensor of action probabilities for ``state``.
        state: Input forwarded unchanged to ``model``.

    Returns:
        The sampled action index as a plain Python ``int``.
    """
    # Action selection is not part of the learning step, so skip autograd bookkeeping.
    with torch.no_grad():
        action_probs = model(state)
        policy = torch.distributions.Categorical(probs=action_probs)
        sampled = policy.sample()
    return sampled.item()

def train_episodic_reinforce(env, policy_model, optimizer, device, rng_seed, num_episodes, gamma, grad_clip_value=None):
    """Train a softmax policy with episodic REINFORCE (Monte Carlo policy gradient).

    Each episode is first rolled out to completion with the current policy.
    Learning then walks backwards through the episode, so the discounted
    return can be accumulated incrementally, and takes one optimizer step
    per time step.

    Args:
        env: Environment exposing the Gymnasium ``reset(seed=...)`` and
            ``step(action)`` API.
        policy_model: Callable mapping a state tensor to a tensor of action
            probabilities that can be indexed by action.
        optimizer: Optimizer updating the parameters of ``policy_model``.
        device: Device on which the state tensors are created.
        rng_seed: Seed passed to ``env.reset`` for the first episode only.
        num_episodes: Number of episodes to play and learn from.
        gamma: Discount factor.
        grad_clip_value: If not ``None``, gradients are clipped element-wise
            with ``torch.nn.utils.clip_grad_value_`` before every step.

    Returns:
        A tuple ``(returns, states, actions, rewards)``. ``returns`` holds the
        undiscounted return of every episode. ``states``, ``actions`` and
        ``rewards`` belong to the last episode only (``states`` also contains
        the final observation, so it is one element longer). All four are
        empty lists when ``num_episodes`` is 0.
    """
    returns = []
    # Bound up front so the final return also works when no episode is played.
    states, actions, rewards = [], [], []
    for episode in range(num_episodes):
        # Initiate episode
        # Set seed only one time per training run. For more info, see https://gymnasium.farama.org/api/env/.
        seed = rng_seed if episode == 0 else None
        observation, info = env.reset(seed=seed)
        state = torch.tensor(observation, device=device)
        truncated = False
        terminated = False
        episode_return = 0
        states = [state]
        actions = []
        rewards = []

        # Generate episode
        while not (terminated or truncated):
            action = select_action_softmax(policy_model, state)
            observation, reward, terminated, truncated, info = env.step(action)
            next_state = torch.tensor(observation, device=device)
            states.append(next_state)
            actions.append(action)
            rewards.append(reward)
            episode_return += reward
            state = next_state
        returns.append(episode_return)

        # Learn from episode
        # G is the discounted return from step t onwards: G_t = r_t + gamma * G_{t+1}.
        G = 0
        for t in reversed(range(len(rewards))):
            G = gamma * G + rewards[t]
            # Minimising -gamma^t * G_t * log pi(a_t | s_t) ascends the policy gradient.
            loss = -gamma**t * G * torch.log(policy_model(states[t])[actions[t]])
            optimizer.zero_grad()
            loss.backward()
            if grad_clip_value is not None:
                torch.nn.utils.clip_grad_value_(policy_model.parameters(), grad_clip_value)
            optimizer.step()

    return returns, states, actions, rewards

In [6]:
# Train REINFORCE with a linear softmax policy: a single Linear layer produces
# one score per action, which Softmax turns into action probabilities.
env = gym.make(ENVIRONMENT_ID)
policy_model = torch.nn.Sequential(
    torch.nn.Linear(env.observation_space.shape[0], env.action_space.n),
    # dim=0 normalises over the first axis; this assumes the model is fed a
    # single 1-D observation at a time (as the training loop does) -- batched
    # input would need a different dim.
    torch.nn.Softmax(dim=0)
).to(device)
optimizer = torch.optim.SGD(policy_model.parameters(), lr=0.01)
# returns covers every episode; states, actions and rewards are those of the
# last episode only.
returns, states, actions, rewards = train_episodic_reinforce(
    env, policy_model, optimizer, device, RNG_SEED, NUM_EPISODES, GAMMA, GRAD_CLIP_VALUE
)
# Presumably plots the per-episode returns -- see utils.plot_returns.
utils.plot_returns(returns)

## Neural network model

In [None]:
# Train REINFORCE with a neural-network policy built by the project helper
# models.create_simple_nn, using a fresh environment instance.
env = gym.make(ENVIRONMENT_ID)
# Arguments: observation size, number of actions, NN_HIDDEN_LAYER_SIZES and a
# softmax output activation (exact semantics are defined in the models module).
policy_model = models.create_simple_nn(
    env.observation_space.shape[0],
    env.action_space.n,
    NN_HIDDEN_LAYER_SIZES,
    output_activation="softmax"
).to(device)
# Alternative optimizer kept for experimentation: plain SGD.
#optimizer = torch.optim.SGD(policy_model.parameters(), lr=0.001)
optimizer = torch.optim.AdamW(policy_model.parameters(), lr=0.001, amsgrad=True)
# returns covers every episode; states, actions and rewards are those of the
# last episode only.
returns, states, actions, rewards = train_episodic_reinforce(
    env, policy_model, optimizer, device, RNG_SEED, NUM_EPISODES, GAMMA, GRAD_CLIP_VALUE
)
# Presumably plots the per-episode returns -- see utils.plot_returns.
utils.plot_returns(returns)

# REINFORCE with baseline

## Linear model

## Neural network model